In [1]:
# Install dependencies
!pip install -q transformers torchaudio jiwer

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m120.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [12]:
import os
import pandas as pd
import torchaudio
import zipfile
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from jiwer import wer
from tqdm import tqdm

# === Step 1: Define paths ===
# Inputs: the CSV to evaluate and the zip archive holding the audio files.
csv_path = "/content/aligned_final_segments.csv"
zip_path = "/content/final_segments.zip"
# Directory the archive is unpacked into.
audio_dir = "/content/final_segments"
# Destination CSV for the transcriptions and WER scores.
output_csv = "/content/transcription_results.csv"

# === Step 2: Unzip audio files ===
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(audio_dir)

# === Step 3: Load CSV ===
df = pd.read_csv(csv_path)

# === Step 4: Load models and their matching processors ===
# Use the GPU when one is available; otherwise everything runs on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_small = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
processor_small = WhisperProcessor.from_pretrained("openai/whisper-small")

# Large model: cast to FP16 *before* moving it to the GPU so the full FP32
# weights are never resident in GPU memory. On CPU the model stays in FP32.
model_large = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3")
if device == "cuda":
    model_large = model_large.half()
model_large = model_large.to(device)
processor_large = WhisperProcessor.from_pretrained("openai/whisper-large-v3")

# === Step 5: Define transcription function ===
def transcribe(model, processor, audio_path, *, language="malay", task="transcribe"):
    """Transcribe a single audio file with a Whisper model.

    The audio is loaded with torchaudio, resampled to 16 kHz if needed,
    averaged across channels to mono, converted to input features by
    ``processor`` and decoded with ``model.generate``.

    Args:
        model: A loaded ``WhisperForConditionalGeneration`` instance.
        processor: The ``WhisperProcessor`` matching ``model``.
        audio_path: Path to an audio file readable by ``torchaudio.load``.
        language: Language passed to ``generate`` (default ``"malay"``).
        task: Whisper task passed to ``generate`` (default ``"transcribe"``).

    Returns:
        The decoded text of the first (only) sequence, with special tokens
        removed.
    """
    target_sr = 16000

    waveform, sr = torchaudio.load(audio_path)
    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        waveform = resampler(waveform)

    # Average over the channel dimension so the processor gets a 1-D signal.
    waveform_np = waveform.mean(dim=0).numpy()
    features = processor(
        waveform_np, sampling_rate=target_sr, return_tensors="pt"
    ).input_features

    # Follow the model's own device and dtype rather than a module-level
    # global, so FP16 models receive FP16 inputs on the right device.
    features = features.to(device=model.device, dtype=model.dtype)

    with torch.no_grad():
        # `language`/`task` replace the deprecated `forced_decoder_ids`
        # argument and avoid rebuilding the prompt ids on every call.
        predicted_ids = model.generate(
            input_features=features, language=language, task=task
        )

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# === Step 6: Prepare result columns ===
df["small_transcription"] = ""
df["large_transcription"] = ""
# Default WER to NaN (not 0.0) so rows that are skipped or fail are not
# mistaken for perfect transcriptions in the saved results.
df["wer_small"] = float("nan")
df["wer_large"] = float("nan")

# === Step 7: Loop and transcribe ===
for idx, row in tqdm(df.iterrows(), total=len(df)):
    audio_file_path = os.path.join("/content", row["Audio File"])
    if not os.path.exists(audio_file_path):
        print(f"Missing file: {audio_file_path}")
        continue

    ground_truth = row["Text Chunk"]

    # Best-effort per row: one bad file must not abort a multi-hour run.
    try:
        hyp_small = transcribe(model_small, processor_small, audio_file_path)
        hyp_large = transcribe(model_large, processor_large, audio_file_path)

        df.at[idx, "small_transcription"] = hyp_small
        df.at[idx, "large_transcription"] = hyp_large

        # WER is undefined without a reference (e.g. an empty CSV cell is
        # read as NaN); keep the transcriptions and leave WER as NaN.
        if not isinstance(ground_truth, str) or not ground_truth.strip():
            print(f"No reference text at row {idx}; WER left as NaN")
            continue

        reference = ground_truth.lower()
        df.at[idx, "wer_small"] = wer(reference, hyp_small.lower())
        df.at[idx, "wer_large"] = wer(reference, hyp_large.lower())

    except Exception as e:
        print(f"Error at row {idx}: {e}")

# === Step 8: Save results ===
df.to_csv(output_csv, index=False)
print("Transcription and WER calculation complete. Results saved to:", output_csv)

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

100%|██████████| 2848/2848 [2:05:32<00:00,  2.64s/it]

Transcription and WER calculation complete. Results saved to: /content/transcription_results.csv



