In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
df = pd.read_csv('/ceph/dpandya/notsofar/newNotsofar/test.csv')

In [6]:
import torch
import torchaudio
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
import os

# --- Configuration ---
#AUDIO_PATH = "your_audio_file.wav" # <--- IMPORTANT: Replace with your audio file path
AUDIO_PATH = df.iloc[0]['audio_files'] # <--- IMPORTANT: Replace with your audio file path
OUTPUT_DIR = "/ceph/dpandya/notsofar/newNotsofar/"
MIN_SILENCE_DURATION_MS = 200 # Minimum silence duration to consider a break between segments
SPEECH_PAD_MS = 100 # Add a small padding to speech segments (e.g., 100ms before and after)

# --- 1. Load VAD Model ---
print("1. Loading Silero VAD model...")
# Using a pre-trained Silero VAD model (e.g., 'silero_vad_large')
# You can explore other models from the Silero VAD repo if needed
# https://github.com/snakers4/silero-vad
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                              model='silero_vad',
                              force_reload=False, # Set to True to always download the latest model
                              trust_repo=True) # Trust the repository for loading

(get_speech_timestamps, _, read_audio, _, _) = utils

# --- 2. Load and Preprocess Audio ---
print(f"2. Loading audio from {AUDIO_PATH}...")
# Use pydub to handle various audio formats and ensure consistent sample rate
try:
    audio_segment = AudioSegment.from_file(AUDIO_PATH)
    # Resample to 16kHz for VAD and Whisper, as they are typically trained on it
    if audio_segment.frame_rate != 16000:
        print(f"Resampling audio from {audio_segment.frame_rate}Hz to 16000Hz...")
        audio_segment = audio_segment.set_frame_rate(16000)
    
    # Export to a temporary WAV file for torchaudio to load easily
    temp_wav_path = "temp_audio_16khz.wav"
    audio_segment.export(temp_wav_path, format="wav")

    # Load with torchaudio for VAD
    # Ensure audio is mono
    audio_data, sample_rate = torchaudio.load(temp_wav_path)
    if audio_data.shape[0] > 1:
        audio_data = torch.mean(audio_data, dim=0, keepdim=True) # Convert to mono
    
    # Remove the temporary file
    os.remove(temp_wav_path)

except Exception as e:
    print(f"Error loading audio: {e}")
    print("Please ensure the audio file exists and is a supported format (e.g., .wav, .mp3).")
    exit()

# --- 3. Apply VAD to get speech timestamps ---
print("3. Applying VAD to detect speech segments...")
# VAD operates on a single channel (mono) audio
speech_timestamps = get_speech_timestamps(audio_data.squeeze(), model, sampling_rate=sample_rate)

# --- 4. Refine VAD Segments (Merge close segments and add padding) ---
print("4. Refining VAD segments...")
refined_segments = []
if speech_timestamps:
    current_start = speech_timestamps[0]['start']
    current_end = speech_timestamps[0]['end']

    for i in range(1, len(speech_timestamps)):
        next_start = speech_timestamps[i]['start']
        next_end = speech_timestamps[i]['end']

        # Calculate silence duration in samples
        silence_duration_samples = next_start - current_end
        silence_duration_ms = (silence_duration_samples / sample_rate) * 1000

        # If the silence between segments is less than MIN_SILENCE_DURATION_MS, merge them
        if silence_duration_ms < MIN_SILENCE_DURATION_MS:
            current_end = max(current_end, next_end) # Extend current segment
        else:
            # Add padding and append the completed segment
            refined_segments.append({
                'start': max(0, current_start - int(SPEECH_PAD_MS * sample_rate / 1000)),
                'end': min(audio_data.shape[1], current_end + int(SPEECH_PAD_MS * sample_rate / 1000))
            })
            current_start = next_start
            current_end = next_end
    
    # Add the last refined segment
    refined_segments.append({
        'start': max(0, current_start - int(SPEECH_PAD_MS * sample_rate / 1000)),
        'end': min(audio_data.shape[1], current_end + int(SPEECH_PAD_MS * sample_rate / 1000))
    })

print(f"Found {len(refined_segments)} refined speech segments.")
for i, seg in enumerate(refined_segments):
    print(f"  Segment {i+1}: Start={seg['start']/sample_rate:.2f}s, End={seg['end']/sample_rate:.2f}s")


# --- 5. Extract and Process Segments with Whisper ---
print("5. Initializing Whisper large model...")
# Using 'openai/whisper-large-v3' for the largest and latest model
# You might need to specify device="cuda" if you have a GPU
try:
    pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", device=0 if torch.cuda.is_available() else -1)
except Exception as e:
    print(f"Error initializing Whisper model: {e}")
    print("Ensure you have a suitable backend (e.g., PyTorch) and sufficient memory.")
    print("If you have a GPU, ensure CUDA is properly installed.")
    exit()

print(f"6. Processing {len(refined_segments)} speech segments with Whisper...")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

all_transcriptions = []
for i, segment in enumerate(refined_segments):
    start_sample = segment['start']
    end_sample = segment['end']
    
    segment_data = audio_data[:, start_sample:end_sample]

    # Save temporary segment for Whisper pipeline
    temp_segment_path = os.path.join(OUTPUT_DIR, f"segment_{i:04d}.wav")
    sf.write(temp_segment_path, segment_data.squeeze().numpy(), sample_rate)

    print(f"  Transcribing segment {i+1} ({start_sample/sample_rate:.2f}s - {end_sample/sample_rate:.2f}s)...")
    try:
        # Pass the file path to the pipeline for processing
        result = pipe(temp_segment_path)
        transcription = result["text"]
        all_transcriptions.append({
            'segment_id': i + 1,
            'start_time_s': start_sample / sample_rate,
            'end_time_s': end_sample / sample_rate,
            'text': transcription
        })
        print(f"    Transcription: {transcription}")
    except Exception as e:
        print(f"    Error transcribing segment {i+1}: {e}")
        all_transcriptions.append({
            'segment_id': i + 1,
            'start_time_s': start_sample / sample_rate,
            'end_time_s': end_sample / sample_rate,
            'text': f"Error: {e}"
        })
    finally:
        # Clean up temporary segment file
        os.remove(temp_segment_path)

print("\n--- All Transcriptions ---")
for entry in all_transcriptions:
    print(f"[{entry['start_time_s']:.2f}s - {entry['end_time_s']:.2f}s] {entry['text']}")

# Optionally, save all transcriptions to a file
with open(os.path.join(OUTPUT_DIR, "full_transcription.txt"), "w", encoding="utf-8") as f:
    for entry in all_transcriptions:
        f.write(f"[{entry['start_time_s']:.2f}s - {entry['end_time_s']:.2f}s] {entry['text']}\n")
print(f"\nFull transcription saved to {os.path.join(OUTPUT_DIR, 'full_transcription.txt')}")

1. Loading Silero VAD model...


Using cache found in /home/dpandya/.cache/torch/hub/snakers4_silero-vad_master


2. Loading audio from /ceph/dpandya/notsofar/eval_set/240825.1_eval_full_with_GT/MTG/MTG_32052/sc_plaza_0/ch0.wav...
3. Applying VAD to detect speech segments...
4. Refining VAD segments...
Found 58 refined speech segments.
  Segment 1: Start=6.33s, End=7.78s
  Segment 2: Start=8.00s, End=11.01s
  Segment 3: Start=11.42s, End=21.41s
  Segment 4: Start=21.69s, End=27.97s
  Segment 5: Start=28.19s, End=30.47s
  Segment 6: Start=30.65s, End=34.53s
  Segment 7: Start=34.59s, End=36.00s
  Segment 8: Start=36.03s, End=40.13s
  Segment 9: Start=40.32s, End=46.08s
  Segment 10: Start=46.85s, End=51.75s
  Segment 11: Start=51.87s, End=57.60s
  Segment 12: Start=57.66s, End=59.20s
  Segment 13: Start=59.23s, End=69.57s
  Segment 14: Start=70.01s, End=73.25s
  Segment 15: Start=73.41s, End=75.23s
  Segment 16: Start=75.26s, End=79.39s
  Segment 17: Start=79.97s, End=89.63s
  Segment 18: Start=90.59s, End=93.67s
  Segment 19: Start=93.69s, End=106.88s
  Segment 20: Start=107.04s, End=109.25s
  Seg

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


    Transcription:  Okay, so here.
  Transcribing segment 2 (8.00s - 11.01s)...




    Transcription:  We got a big budget in the
  Transcribing segment 3 (11.42s - 21.41s)...




    Transcription:  office regarding snacks. Oh, right. Yeah. About snacks or luncheon snacks? No, regarding snacks. Okay.
  Transcribing segment 4 (21.69s - 27.97s)...




    Transcription:  And I know all of us and other people in the office, I just called you guys, but there's also other people in the office that have different preferences
  Transcribing segment 5 (28.19s - 30.47s)...




    Transcription:  with foods that have food restrictions.
  Transcribing segment 6 (30.65s - 34.53s)...




    Transcription:  So even though we have a big budget, we want to make the most out of it.
  Transcribing segment 7 (34.59s - 36.00s)...




    Transcription:  Is there something that you guys would
  Transcribing segment 8 (36.03s - 40.13s)...




    Transcription:  prefer to have or not to have an office. Big budget means
  Transcribing segment 9 (40.32s - 46.08s)...




    Transcription:  on the quantity we can buy or the quality we can buy? Quality. Quality.
  Transcribing segment 10 (46.85s - 51.75s)...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


    Transcription:  I'm nuts about nuts. So we're talking about quality. Pistachio.
  Transcribing segment 11 (51.87s - 57.60s)...




    Transcription:  Pistachios, pecans, and cashews. Macadamia. Macadamia. Brazil.
  Transcribing segment 12 (57.66s - 59.20s)...




    Transcription:  Brazil, no.
  Transcribing segment 13 (59.23s - 69.57s)...




    Transcription:  Yeah, but who eats it? Monkey nuts. I do. Healthy people. That's right. They're the best snack. They're not healthy. We want healthy bodies, healthy brains here. I don't care for healthy stuff.
  Transcribing segment 14 (70.01s - 73.25s)...




    Transcription:  You don't care about... No, like, I think it's kind of a waste of money.
  Transcribing segment 15 (73.41s - 75.23s)...




    Transcription:  All right, so what are you looking for?
  Transcribing segment 16 (75.26s - 79.39s)...




    Transcription:  I think we should have like ice cream. Ice cream is not the snack.
  Transcribing segment 17 (79.97s - 89.63s)...




    Transcription:  I think it's a snack. Chocolate bars. At least if we have a soft serve machine, we can have goat's milk.
  Transcribing segment 18 (90.59s - 93.67s)...




    Transcription:  Go on. A small con machine.
  Transcribing segment 19 (93.69s - 106.88s)...




    Transcription:  popcorn is nice yeah popcorn machine stinks out the office like you can't believe it stinks out the office yeah it smells so good would you want that it smells like popcorn is a very
  Transcribing segment 20 (107.04s - 109.25s)...




    Transcription:  We love our child boards and
  Transcribing segment 21 (109.37s - 113.86s)...




    Transcription:  No unhealthy snacks for what I'm supposed to eat. No, no, no. No natural popcorn without the butter.
  Transcribing segment 22 (114.11s - 116.83s)...




    Transcription:  That's not popcorn then.
  Transcribing segment 23 (117.66s - 133.35s)...




    Transcription:  Popcorn without butter. Are you talking about candy and things like that? Candy, Sour Patch. Does that mean that we have... Wait, wait, wait. Do we have to say... I mean, that's just like... Does that mean that we have... Perhaps everyone else. I know that we have the budget, but... But we can have self-control. I want my unhealthy snacks. Mm-hmm.
  Transcribing segment 24 (133.37s - 137.76s)...




    Transcription:  Does that mean also because we have to merge it that we have to remodel the kitchen?
  Transcribing segment 25 (138.24s - 156.29s)...




    Transcription:  to be able to get all of that. No, no, no. Well, we don't have enough. We need it all for snacks. People are hungry. I know, but we don't have enough in a food to store all of that. Do we need more storage? Do we need more cabinets? Do we need another? No, we've got the walk-in fridges already. That's our big product.
  Transcribing segment 26 (156.80s - 159.14s)...




    Transcription:  Maybe we can just bring in another walk-in fridge.
  Transcribing segment 27 (159.33s - 175.20s)...




    Transcription:  I suggest that we have a barista, let's say, in the morning hours. That's a nice thing. And then a barista turns into an ice cream soda maker, let's say, just after lunch so that people can have something sweet and they can carry them on for the day.
  Transcribing segment 28 (175.23s - 184.74s)...




    Transcription:  That's really healthy. That's really healthy. Yes, yes, yes. I'm thinking about your calorie requirements.
  Transcribing segment 29 (184.77s - 188.35s)...




    Transcription:  Yeah. And we have to think seriously about people's caffeine requirements.
  Transcribing segment 30 (188.45s - 190.37s)...




    Transcription:  That's the very first.
  Transcribing segment 31 (190.43s - 193.51s)...




    Transcription:  I think everyone drinks at least four cups a day.
  Transcribing segment 32 (193.60s - 194.82s)...




    Transcription:  And it was healthy.
  Transcribing segment 33 (194.88s - 200.10s)...




    Transcription:  It won't just be dreams. Are we taking care of their health or are we just taking care of what they want? Both.
  Transcribing segment 34 (200.29s - 200.83s)...




    Transcription:  Στος.
  Transcribing segment 35 (200.93s - 207.78s)...




    Transcription:  Because we know what sugar does to you. Well, I think that if we're going to have these baristas in this, then I want a real sushi barista.
  Transcribing segment 36 (207.93s - 214.21s)...




    Transcription:  station as well sushi is not the snack total snack and it's the healthiest food
  Transcribing segment 37 (214.30s - 224.00s)...




    Transcription:  in this world and it keeps me motivated. It's good having a sashimi buffet. Again, then we need to upgrade our kitchen.
  Transcribing segment 38 (224.22s - 229.92s)...




    Transcription:  Now it takes more budget off. I mean, we do have a pretty big budget that we can add another few
  Transcribing segment 39 (230.91s - 241.19s)...




    Transcription:  islands. It's also using the product, you know, we could have the robot make the sushi. For me, I can't compare it to
  Transcribing segment 40 (241.41s - 243.65s)...




    Transcription:  technology.
  Transcribing segment 41 (243.74s - 248.90s)...




    Transcription:  For me, snacks is something that can stay overnight without getting ruined.
  Transcribing segment 42 (249.05s - 252.64s)...




    Transcription:  I don't see sushi like an option because
  Transcribing segment 43 (252.73s - 255.01s)...




    Transcription:  It's a snack is a snack is a snack.
  Transcribing segment 44 (255.20s - 261.92s)...




    Transcription:  it has to be able to be well we can listen well then what about our dim sum robot
  Transcribing segment 45 (262.24s - 264.67s)...




    Transcription:  wagons coming around to our desks.
  Transcribing segment 46 (264.73s - 266.88s)...




    Transcription:  Yeah.
  Transcribing segment 47 (267.97s - 272.61s)...




    Transcription:  All right, so Rachel, you wanted to talk about more kitchen remodeling.
  Transcribing segment 48 (273.12s - 276.42s)...




    Transcription:  We need a quote for that. Let's do that.
  Transcribing segment 49 (276.48s - 280.58s)...




    Transcription:  At the next meeting, we'll have the list of the new snacks we're going to
  Transcribing segment 50 (280.64s - 282.75s)...




    Transcription:  bring in that need to be
  Transcribing segment 51 (282.81s - 283.49s)...




    Transcription:  you know,
  Transcribing segment 52 (283.52s - 291.11s)...




    Transcription:  maybe storage differently and then we'll fix it. So you sort that out by the 25th?
  Transcribing segment 53 (291.23s - 307.71s)...




    Transcription:  specialists in ice cream soda making that we can bring and how much it will cost to rent it. And then the dim sum wagon, I'll speak to our tech guy because we can put one of our products on the floor and we can do quality control like that as well. Great.
  Transcribing segment 54 (307.77s - 326.24s)...




    Transcription:  Are we going to bring this to a vote with all the employees or is this going to be decided just by us? I think if we bring it to a vote, it's going to be too controversial. Nobody will ever agree on anything. We're going to make the decision for the whole company. We'll make the decision and people leave the company because of it. Are we going to bring it to a vote in our group?
  Transcribing segment 55 (326.97s - 330.79s)...




    Transcription:  Can we do that next week? No, I didn't hear the vote.
  Transcribing segment 56 (330.81s - 333.22s)...




    Transcription:  Do we want to put it for a vote next week?
  Transcribing segment 57 (333.31s - 355.14s)...




    Transcription:  Yes, let's bring it to a vote next week. Next week, we'll bring it to a vote in our group. Next week, once we have a little bit more data. Okay, if you want to share with the person next to you at work. Yeah, absolutely. And then come back with more. No, no, I think we should keep it hush-hush for now. Like we said, we're not opening it for a vote. We're deciding on this. We'll come back with more input next week, and then we'll choose which direction we'll do.
  Transcribing segment 58 (355.36s - 357.19s)...




    Transcription:  Yeah. Before that. Okay.

--- All Transcriptions ---
[6.33s - 7.78s]  Okay, so here.
[8.00s - 11.01s]  We got a big budget in the
[11.42s - 21.41s]  office regarding snacks. Oh, right. Yeah. About snacks or luncheon snacks? No, regarding snacks. Okay.
[21.69s - 27.97s]  And I know all of us and other people in the office, I just called you guys, but there's also other people in the office that have different preferences
[28.19s - 30.47s]  with foods that have food restrictions.
[30.65s - 34.53s]  So even though we have a big budget, we want to make the most out of it.
[34.59s - 36.00s]  Is there something that you guys would
[36.03s - 40.13s]  prefer to have or not to have an office. Big budget means
[40.32s - 46.08s]  on the quantity we can buy or the quality we can buy? Quality. Quality.
[46.85s - 51.75s]  I'm nuts about nuts. So we're talking about quality. Pistachio.
[51.87s - 57.60s]  Pistachios, pecans, and cashews. Macadamia. Macadamia. Brazil.
[57.66s - 59.2

In [13]:
import re

def clean_text(s):
    # Convert to string to handle potential non-string inputs gracefully
    s = str(s)
    
    # 1. Remove text enclosed in angle brackets (e.g., <ST>, <UNKNOWN>)
    s = re.sub(r'<[^>]+>', '', s)
    
    # 2. Remove punctuation
    # This regex matches any character that is NOT a word character (alphanumeric + underscore) or whitespace
    s = re.sub(r'[^\w\s]', '', s)
    
    # 3. Remove extra spaces (replace multiple spaces with a single space)
    s = re.sub(r'\s+', ' ', s)
    
    # 4. Remove leading and trailing spaces
    s = s.strip()
    
    # 5. Convert to lowercase for case-insensitive comparison
    return s.lower()


clean_text(pd.read_json(df.iloc[0]['transcription_files']).iloc[1]['text'])

'we got a we got a big budget in the office uh regarding snacks uh yeah'

In [18]:
import torch
import torchaudio
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from transformers import pipeline
import os
import json
from jiwer import wer # For Word Error Rate calculation

# --- Configuration ---
AUDIO_PATH = df.iloc[0]['audio_files']  # <--- IMPORTANT: Replace with your audio file path
LABEL_JSON_PATH = df.iloc[0]['transcription_files'] # <--- IMPORTANT: Replace with your JSON label file path
OUTPUT_DIR = "/ceph/dpandya/notsofar/newNotsofar/"

# --- 1. Load Whisper Model ---
print("1. Initializing Whisper large model...")
# Using 'openai/whisper-large-v3' for the largest and latest model
# You might need to specify device="cuda" if you have a GPU
try:
    pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium", device=0 if torch.cuda.is_available() else -1)
except Exception as e:
    print(f"Error initializing Whisper model: {e}")
    print("Ensure you have a suitable backend (e.g., PyTorch) and sufficient memory.")
    print("If you have a GPU, ensure CUDA is properly installed.")
    exit()

# --- 2. Load and Preprocess Audio ---
print(f"2. Loading audio from {AUDIO_PATH}...")
try:
    audio_segment = AudioSegment.from_file(AUDIO_PATH)
    # Resample to 16kHz, as Whisper is typically trained on it
    if audio_segment.frame_rate != 16000:
        print(f"Resampling audio from {audio_segment.frame_rate}Hz to 16000Hz...")
        audio_segment = audio_segment.set_frame_rate(16000)
    
    # Export to a temporary WAV file for torchaudio to load easily
    temp_wav_path = "temp_audio_16khz_for_eval.wav"
    audio_segment.export(temp_wav_path, format="wav")

    # Load with torchaudio for segment extraction
    # Ensure audio is mono
    audio_data, sample_rate = torchaudio.load(temp_wav_path)
    if audio_data.shape[0] > 1:
        audio_data = torch.mean(audio_data, dim=0, keepdim=True) # Convert to mono
    
    # Remove the temporary file
    os.remove(temp_wav_path)

except Exception as e:
    print(f"Error loading audio: {e}")
    print("Please ensure the audio file exists and is a supported format (e.g., .wav, .mp3).")
    exit()

# --- 3. Load JSON Label File ---
print(f"3. Loading label data from {LABEL_JSON_PATH}...")
try:
    with open(LABEL_JSON_PATH, 'r', encoding='utf-8') as f:
        ground_truth_segments = json.load(f)
    print(f"Loaded {len(ground_truth_segments)} ground truth segments.")
except FileNotFoundError:
    print(f"Error: Label file not found at {LABEL_JSON_PATH}")
    exit()
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {LABEL_JSON_PATH}. Please check file format.")
    exit()
except Exception as e:
    print(f"An unexpected error occurred while loading JSON: {e}")
    exit()


# --- 4. Process Labeled Segments with Whisper and Collect Results ---
print("4. Transcribing labeled segments with Whisper and collecting results...")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Lists to store ground truth and hypothesis for WER calculation
ground_truth_texts = []
hypothesis_texts = []
detailed_results = []

for i, segment_info in enumerate(ground_truth_segments):
    try:
        start_time_s = segment_info['start_time']
        end_time_s = segment_info['end_time']
        ground_truth_text = segment_info['text']
        speaker_id = segment_info.get('speaker', 'N/A') # Get speaker if present, default to N/A

        start_sample = int(start_time_s * sample_rate)
        end_sample = int(end_time_s * sample_rate)

        if not clean_text(ground_truth_text):
            continue
        # Ensure segment boundaries are within audio limits
        start_sample = max(0, start_sample)
        end_sample = min(audio_data.shape[1], end_sample)
        
        # Skip very short or invalid segments
        if end_sample <= start_sample:
            print(f"  Skipping invalid segment {i+1} (start_time: {start_time_s:.2f}s, end_time: {end_time_s:.2f}s).")
            continue

        segment_data = audio_data[:, start_sample:end_sample]

        # Save temporary segment for Whisper pipeline
        temp_segment_path = os.path.join(OUTPUT_DIR, f"eval_segment_{i:04d}.wav")
        sf.write(temp_segment_path, segment_data.squeeze().numpy(), sample_rate)

        #print(f"  Processing segment {i+1} (Speaker: {speaker_id}, {start_time_s:.2f}s - {end_time_s:.2f}s)...")
        #print(f"    Ground Truth: \"{ground_truth_text}\"")

        # Transcribe
        result = pipe(temp_segment_path)
        hypothesis_text = clean_text(result["text"])
        #print(f"    Hypothesis:   \"{hypothesis_text}\"")
        ground_truth_text = clean_text(ground_truth_text)
        ground_truth_texts.append(ground_truth_text)
        hypothesis_texts.append(hypothesis_text)
        detailed_results.append({
            'segment_id': i + 1,
            'speaker': speaker_id,
            'start_time_s': start_time_s,
            'end_time_s': end_time_s,
            'ground_truth': ground_truth_text,
            'hypothesis': hypothesis_text
        })
        
    except KeyError as ke:
        print(f"Error: Missing key in segment {i+1}: {ke}. Segment: {segment_info}")
    except Exception as e:
        print(f"Error processing segment {i+1}: {e}")
    finally:
        # Clean up temporary segment file
        if os.path.exists(temp_segment_path):
            os.remove(temp_segment_path)

1. Initializing Whisper large model...
2. Loading audio from /ceph/dpandya/notsofar/eval_set/240825.1_eval_full_with_GT/MTG/MTG_32052/sc_plaza_0/ch0.wav...
3. Loading label data from /ceph/dpandya/notsofar/eval_set/240825.1_eval_full_with_GT/MTG/MTG_32052/gt_transcription.json...
Loaded 284 ground truth segments.
4. Transcribing labeled segments with Whisper and collecting results...




In [19]:
# --- 5. Calculate and Report WER ---
print("\n--- ASR Evaluation Results ---")
if ground_truth_texts and hypothesis_texts:
    # Calculate WER
    overall_wer = wer(ground_truth_texts, hypothesis_texts)
    print(f"\nOverall Word Error Rate (WER): {overall_wer:.4f}")

    # Optionally, save detailed results
    results_json_path = os.path.join(OUTPUT_DIR, "detailed_asr_results_medium.json")
    with open(results_json_path, "w", encoding="utf-8") as f:
        json.dump(detailed_results, f, ensure_ascii=False, indent=4)
    print(f"Detailed results saved to {results_json_path}")

    # Optionally, save ground truth and hypothesis for debugging
    with open(os.path.join(OUTPUT_DIR, "ground_truth.txt"), "w", encoding="utf-8") as f:
        for text in ground_truth_texts:
            f.write(text + "\n")
    with open(os.path.join(OUTPUT_DIR, "hypothesis_medium.txt"), "w", encoding="utf-8") as f:
        for text in hypothesis_texts:
            f.write(text + "\n")
    print(f"Ground truth and hypothesis texts saved to {OUTPUT_DIR}/ground_truth.txt and {OUTPUT_DIR}/hypothesis.txt")

else:
    print("No valid segments were processed for evaluation.")


--- ASR Evaluation Results ---

Overall Word Error Rate (WER): 0.5136
Detailed results saved to /ceph/dpandya/notsofar/newNotsofar/detailed_asr_results_medium.json
Ground truth and hypothesis texts saved to /ceph/dpandya/notsofar/newNotsofar//ground_truth.txt and /ceph/dpandya/notsofar/newNotsofar//hypothesis.txt
