In [1]:
import pandas as pd
import numpy as np
import whisperx

  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []


In [4]:
import json

def read_json_file(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return sorted(data, key=lambda x: x['start_time'])

In [2]:
device = "cuda"
audio_file = "audio.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v3", device, compute_type=compute_type)

No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.3.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../home/dpandya/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.2.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.3.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [3]:
df = pd.read_csv('/ceph/dpandya/notsofar/newNotsofar/test.csv')

In [5]:
import pandas as pd
from typing import List, Dict, Tuple

def prepare_ground_truth_transcript(label_data: List[Dict]) -> str:
    """
    Prepares a single, time-ordered ground truth transcript from label data
    that may contain overlapping speech.

    Args:
        label_data (List[Dict]): A list of dictionaries, where each dictionary
                                 represents a row from your label file and contains
                                 'word_timing' (list of lists: [word, start_time, end_time]).

    Returns:
        str: A space-separated string of all words, ordered by their start time.
    """
    all_words_with_timestamps = []

    # Collect all words with their timings from all speakers
    for entry in label_data:
        # Each entry['word_timing'] is a list of [word, start_time, end_time]
        for word_info in entry['word_timing']:
            all_words_with_timestamps.append(word_info)

    # Sort all words by their start time
    # word_info is [word, start_time, end_time] so index 1 is start_time
    all_words_with_timestamps.sort(key=lambda x: x[1])

    # Join the words to form the final ground truth transcript
    ground_truth_words = [word_info[0] for word_info in all_words_with_timestamps]
    return " ".join(ground_truth_words)

# --- Example Usage ---
# Let's simulate some label data for a segment with overlapping speech
sample_label_data = read_json_file(df.iloc[0]['transcription_files'])

# When you read your actual label file, you'll need to filter relevant rows
# For an entire meeting, you'd pass all rows for that meeting to this function.
# If you segment your audio, you'd pass rows within the segment's time range.

ground_truth = prepare_ground_truth_transcript(sample_label_data)
print(f"Prepared Ground Truth: {ground_truth}")

Prepared Ground Truth: ok so why we here we got mmm a we got a big budget in the office uh regarding snacks hmm oh uh great yeah oh regarding um snacks or yeah <fill/> lunch and snacks no regarding snacks snacks ok ok um and i know all of us and other people in the office i just called you guys but there's also other people in office that have different preferences uh for foods they have food restrictions um mmm so even though yeah we have a big budget we wanna make the most out of it so is there something that you guys would prefer to have or not to have in the office a big budget means uh on the quantity we can buy or the quality the we can quality quality buy <unknown/> quality i yeah <filllaugh/> would go for <unknown/> quality yeah mmm yeah <filllaugh/> i'm mmm nuts about nuts so if we're talking <filllaugh/> about quality pistachios we're talking about pistachios <unknown/> pecans and cashews macadamias mmm yeah and macadamia isn't that kind of a that's waste right of money like 

In [6]:
audio = whisperx.load_audio(df.iloc[0]['audio_files'])
result = model.transcribe(audio)

Detected language: en (1.00) in first 30s of audio...


In [11]:
preds_raw = []
for entry in result['segments']:
    preds_raw.append(entry['text'])
preds = " ".join(preds_raw)[1:]

In [14]:
import re

def normalize_text(text: str) -> str:
    """
    Normalizes text for WER calculation: converts to lowercase,
    removes punctuation, and collapses multiple spaces.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # Collapse multiple spaces
    return text

gt_clean = normalize_text(ground_truth)
preds_clean = normalize_text(preds)

In [21]:
import jiwer

def calculate_wer(ground_truth_transcripts: List[str], whisper_predictions: List[str]) -> float:
    """
    Calculates the Word Error Rate (WER) between ground truth and ASR predictions.

    Args:
        ground_truth_transcripts (List[str]): List of normalized ground truth transcripts.
        whisper_predictions (List[str]): List of normalized ASR predictions.

    Returns:
        float: The calculated WER.
    """
    # Load the WER metric
    #wer_metric = jiwer("wer")

    # The wer_metric.compute method expects lists of references and hypotheses
    # It automatically handles alignment and error counting.
    wer = jiwer.wer(reference=ground_truth_transcripts, hypothesis=whisper_predictions)
    return wer

In [23]:
calculate_wer(gt_clean, preds_clean)

0.42830655129789863