In [1]:
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer torchaudio peft transformers accelerate

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-0iie8jf9
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-0iie8jf9
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
import zipfile
import os

# Archive name and extraction directory for each of the two PEFT checkpoints.
model1_zip_path, model1_dir = "whisper-large-peft-malay-e20.zip", "rdee"
model2_zip_path, model2_dir = "whisper-large-v3-ft-malay-peft-epoch-20.zip", "clt"

# Extract every archive into its own directory.
for archive_path, target_dir in (
    (model1_zip_path, model1_dir),
    (model2_zip_path, model2_dir),
):
    with zipfile.ZipFile(archive_path, mode="r") as zip_ref:
        zip_ref.extractall(path=target_dir)

In [3]:
import zipfile

# Unpack the segment archive into the "final_segments" directory.
segments_archive = zipfile.ZipFile("final_segments.zip", mode="r")
with segments_archive as zip_ref:
    zip_ref.extractall(path="final_segments")

In [4]:
import os
import torch
import pandas as pd
import torchaudio
from jiwer import wer
from peft import PeftModel
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Processor (feature extractor + tokenizer) of the base checkpoint.
base_model_name = "openai/whisper-large-v3"
processor = WhisperProcessor.from_pretrained(base_model_name)

# Decoder prompt ids requested for language "ms" and the "transcribe" task.
forced_decoder_ids = processor.get_decoder_prompt_ids(task="transcribe", language="ms")

# STEP 4: Define transcription function
def transcribe(model, audio_path):
    """Transcribe a single audio file and return the stripped text.

    Args:
        model: A module exposing ``generate`` and ``parameters`` (in this
            notebook, a PEFT-wrapped ``WhisperForConditionalGeneration``).
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.
    """
    waveform, sr = torchaudio.load(audio_path)

    # torchaudio returns (channels, frames). Average the channels so that a
    # multi-channel file becomes one mono signal; a plain squeeze() would
    # leave it 2-D and the feature extractor would treat each channel as a
    # separate batch item, of which only the first is decoded below.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    if sr != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

    # Put the features on the device that actually holds the model weights,
    # so a model moved to another device than the global default still works.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").input_features.to(target_device)

    with torch.no_grad():
        predicted_ids = model.generate(input_features=inputs, forced_decoder_ids=forced_decoder_ids)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Drop the per-call tensors and release cached GPU blocks between files.
    del waveform, inputs, predicted_ids
    torch.cuda.empty_cache()
    return transcription.strip()

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [5]:
# Load the aligned segment table and keep a reproducible 10% random subset
# (fixed seed), renumbering the rows from 0.
df = (
    pd.read_csv("aligned_final_segments.csv")
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

# Result columns, filled in row by row by the evaluation loops.
for result_column, initial_value in (
    ("Model 1 Transcription", ""),
    ("Model 1 WER", None),
    ("Model 2 Transcription", ""),
    ("Model 2 WER", None),
):
    df[result_column] = initial_value

# Interval (in rows) at which the evaluation loops print progress; rows are
# still transcribed one at a time.
BATCH_SIZE = 10

In [6]:
# Adapter directory for model 2; the GPU attempt and the CPU fallback must
# load the very same checkpoint.
model2_adapter_dir = "clt/whisper-large-v3-ft-malay-peft-epoch-20"

try:
    print("Pass 2: Evaluating model clt013...")
    base2 = WhisperForConditionalGeneration.from_pretrained(base_model_name).to(device)
    model2 = PeftModel.from_pretrained(base2, model2_adapter_dir).to(device).eval()
except RuntimeError as e:
    # Only an out-of-memory failure justifies retrying on the CPU. Any other
    # RuntimeError would be misreported as OOM, so let it propagate.
    if "out of memory" not in str(e).lower():
        raise
    print("CUDA out of memory. Switching model 2 to CPU...")
    torch.cuda.empty_cache()
    device2 = "cpu"
    base2 = WhisperForConditionalGeneration.from_pretrained(base_model_name).to(device2)
    model2 = PeftModel.from_pretrained(base2, model2_adapter_dir).to(device2).eval()

# Transcribe with model 2
missing_audio = []  # audio paths listed in df that do not exist on disk
for idx in range(len(df)):
    if idx % BATCH_SIZE == 0:
        print(f"clt013 progress: {idx}/{len(df)}")

    audio_file = df.at[idx, "Audio File"]
    reference = df.at[idx, "Text Chunk"]
    if not os.path.exists(audio_file):
        # Remember the skip instead of dropping the row silently.
        missing_audio.append(audio_file)
        continue

    try:
        hyp2 = transcribe(model2, audio_file)
        df.at[idx, "Model 2 Transcription"] = hyp2
        df.at[idx, "Model 2 WER"] = round(wer(reference.lower(), hyp2.lower()), 3)
    except Exception as e:
        # Best effort: report the failing file and continue with the next row.
        print(f"[clt013] Error on {audio_file}: {e}")

if missing_audio:
    print(f"[clt013] Skipped {len(missing_audio)} row(s) whose audio file was not found.")

# Release the model before the next one is loaded.
del model2, base2
torch.cuda.empty_cache()

Pass 2: Evaluating model clt013...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

clt013 progress: 0/285


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


clt013 progress: 10/285
clt013 progress: 20/285
clt013 progress: 30/285
clt013 progress: 40/285
clt013 progress: 50/285
clt013 progress: 60/285
clt013 progress: 70/285
clt013 progress: 80/285
clt013 progress: 90/285
clt013 progress: 100/285
clt013 progress: 110/285
clt013 progress: 120/285
clt013 progress: 130/285
clt013 progress: 140/285
clt013 progress: 150/285
clt013 progress: 160/285
clt013 progress: 170/285
clt013 progress: 180/285
clt013 progress: 190/285
clt013 progress: 200/285
clt013 progress: 210/285
clt013 progress: 220/285
clt013 progress: 230/285
clt013 progress: 240/285
clt013 progress: 250/285
clt013 progress: 260/285
clt013 progress: 270/285
clt013 progress: 280/285


In [7]:
# Evaluate Model 1 (rdee)
print("Pass 1: Evaluating model rdee...")
base1 = WhisperForConditionalGeneration.from_pretrained(base_model_name).to(device)
model1 = PeftModel.from_pretrained(base1, "rdee").to(device).eval()

missing_audio = []  # audio paths listed in df that do not exist on disk
for idx in range(len(df)):
    if idx % BATCH_SIZE == 0:
        print(f"rdee progress: {idx}/{len(df)}")

    audio_file = df.at[idx, "Audio File"]
    reference = df.at[idx, "Text Chunk"]
    if not os.path.exists(audio_file):
        # Remember the skip instead of dropping the row silently.
        missing_audio.append(audio_file)
        continue

    try:
        hyp1 = transcribe(model1, audio_file)
        df.at[idx, "Model 1 Transcription"] = hyp1
        df.at[idx, "Model 1 WER"] = round(wer(reference.lower(), hyp1.lower()), 3)
    except Exception as e:
        # Best effort: report the failing file and continue with the next row.
        print(f"[rdee] Error on {audio_file}: {e}")

if missing_audio:
    print(f"[rdee] Skipped {len(missing_audio)} row(s) whose audio file was not found.")

# Release the model once evaluation is done, mirroring the model 2 cell.
del model1, base1
torch.cuda.empty_cache()

Pass 1: Evaluating model rdee...




rdee progress: 0/285
rdee progress: 10/285
rdee progress: 20/285
rdee progress: 30/285
rdee progress: 40/285
rdee progress: 50/285
rdee progress: 60/285
rdee progress: 70/285
rdee progress: 80/285
rdee progress: 90/285
rdee progress: 100/285
rdee progress: 110/285
rdee progress: 120/285
rdee progress: 130/285
rdee progress: 140/285
rdee progress: 150/285
rdee progress: 160/285
rdee progress: 170/285
rdee progress: 180/285
rdee progress: 190/285
rdee progress: 200/285
rdee progress: 210/285
rdee progress: 220/285
rdee progress: 230/285
rdee progress: 240/285
rdee progress: 250/285
rdee progress: 260/285
rdee progress: 270/285
rdee progress: 280/285


In [8]:
# Write the per-row results to CSV, then ask Colab to download that file.
# NOTE(review): the file name says "20percent" while the sampling cell uses
# frac=0.1 — confirm which one is intended.
results_csv_path = "wer_comparison_results_20percent.csv"
df.to_csv(results_csv_path, index=False)
from google.colab import files
files.download(results_csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [9]:
from jiwer import wer, cer, mer, wil, wip

# Define metrics computation
def compute_metrics(truths, hyps):
    """Score hypotheses against references with five jiwer measures.

    Args:
        truths: Reference text(s), passed unchanged to each jiwer function.
        hyps: Hypothesis text(s), passed unchanged to each jiwer function.

    Returns:
        A dict with the keys "WER", "CER", "MER", "WIL" and "WIP" (in that
        order), each value rounded to three decimals.
    """
    scorers = (
        ("WER", wer),
        ("CER", cer),
        ("MER", mer),
        ("WIL", wil),
        ("WIP", wip),
    )
    return {label: round(scorer(truths, hyps), 3) for label, scorer in scorers}

# Score each model only on the rows it actually transcribed. Rows skipped
# because of a missing audio file or a transcription error keep an empty
# hypothesis and no per-row WER; including them would count their reference
# words as errors and inflate the corpus-level rates.
def _scored_pairs(frame, model_label):
    """Return (references, hypotheses), lower-cased, for rows with a recorded WER."""
    scored = frame[frame[f"{model_label} WER"].notna()]
    references = scored["Text Chunk"].str.lower().tolist()
    hypotheses = scored[f"{model_label} Transcription"].fillna("").str.lower().tolist()
    return references, hypotheses

# All references, kept under the original name for any later use.
ref_sentences = df["Text Chunk"].str.lower().tolist()

model1_ref, model1_hyp = _scored_pairs(df, "Model 1")
model2_ref, model2_hyp = _scored_pairs(df, "Model 2")

# Make the evaluated subset sizes visible: the two models may have been
# scored on different numbers of rows.
print(f"Rows scored: Model 1 = {len(model1_hyp)}/{len(df)}, Model 2 = {len(model2_hyp)}/{len(df)}")

# Calculate metrics
model1_scores = compute_metrics(model1_ref, model1_hyp)
model2_scores = compute_metrics(model2_ref, model2_hyp)

# Display in a comparison table
import pandas as pd

metrics_df = pd.DataFrame({
    "Metric": list(model1_scores.keys()),
    "Model 1 (rdee)": list(model1_scores.values()),
    "Model 2 (clt013)": list(model2_scores.values())
})

import IPython.display as display
display.display(metrics_df)

Unnamed: 0,Metric,Model 1 (rdee),Model 2 (clt013)
0,WER,0.285,0.311
1,CER,0.17,0.168
2,MER,0.282,0.306
3,WIL,0.432,0.48
4,WIP,0.568,0.52
