In [None]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import json
from transformers import WhisperForConditionalGeneration, WhisperProcessor, BitsAndBytesConfig
import torch
from utils.clean_transcript import clean
from peft import PeftModel
import librosa
from IPython.display import Audio, display

In [None]:
# Expose only GPU index 1 to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

# Language code selecting which corpus and fine-tuned model to inspect.
lang = "bew"

# Number of audio clips to transcribe, display and play back.
num_examples = 5

In [None]:
def generate(model, data, processor, proxy_lang):
    """Transcribe every audio file in ``data`` with ``model``.

    Args:
        model: Speech-to-text model exposing ``parameters()``, ``generate()``
            and ``device``. It is moved to CUDA when available (CPU
            otherwise) for inference and moved back to CPU before returning.
        data: Iterable of audio file paths.
        processor: Processor used to build input features, the decoder
            prompt ids and to decode generated token ids.
        proxy_lang: Language passed to ``get_decoder_prompt_ids``.

    Returns:
        A ``(predictions, filepaths)`` tuple: the decoded transcriptions and
        the file paths that were processed, in iteration order.
    """
    sample_rate = 16_000
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Input features are cast to the dtype of the model's first parameter.
    weights_dtype = next(model.parameters()).dtype
    model.to(run_device).eval()
    prompt_ids = processor.get_decoder_prompt_ids(language=proxy_lang, task="transcribe")

    predictions, filepaths = [], []
    with torch.no_grad():
        for filepath in tqdm(data):
            # Only the first 30 seconds are read, as mono audio resampled to 16 kHz.
            waveform, _ = librosa.load(filepath, offset=0, duration=30, mono=True, sr=sample_rate)
            features = processor(
                audio=[waveform], sampling_rate=sample_rate, return_tensors='pt'
            ).input_features
            features = features.to(model.device).to(dtype=weights_dtype)

            # Generate output token IDs, then decode them to text.
            token_ids = model.generate(
                features,
                forced_decoder_ids=prompt_ids,
                max_new_tokens=200
            )
            predictions.extend(processor.batch_decode(token_ids, skip_special_tokens=True))
            filepaths.append(filepath)

    model.to("cpu")
    return predictions, filepaths

In [None]:
def get_model(config, model_dir, lang):
    """Load the fine-tuned model stored under ``{model_dir}/{lang}``.

    Args:
        config: Mapping with a ``'lora'`` flag and, when that flag is truthy,
            a ``"whisper_model"`` entry naming the base checkpoint.
        model_dir: Directory containing one sub-directory per language.
        lang: Language code selecting the sub-directory to load.

    Returns:
        When ``config['lora']`` is truthy, the base checkpoint loaded with
        4-bit NF4 quantization and wrapped in the PEFT adapter found at
        ``{model_dir}/{lang}``; otherwise the model loaded directly from
        ``{model_dir}/{lang}``.
    """
    checkpoint_path = f"{model_dir}/{lang}"

    if not config['lora']:
        return WhisperForConditionalGeneration.from_pretrained(checkpoint_path)

    # 4-bit NF4 with double quantization; computation runs in float16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    base_model = WhisperForConditionalGeneration.from_pretrained(
        config["whisper_model"],
        quantization_config=quant_config
    )
    adapted_model = PeftModel.from_pretrained(base_model, checkpoint_path)
    adapted_model.print_trainable_parameters()
    return adapted_model

In [None]:
# Load the run configuration. This notebook reads the "lora", "whisper_model"
# and "proxy_langs" entries from it.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is pinned so decoding does not depend on the machine's locale.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

In [None]:
# Glob pattern matching every file in the `audios` directory of the corpus for `lang`.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
audio_paths = f"/cs-data-01/drd92/mozilla-asr-challenge/mcv-sps-st-09-2025/sps-corpus-1.0-2025-09-05-{lang}/audios/*"
# Root directory of the fine-tuned models; get_model loads from f"{model_dir}/{lang}".
model_dir = "results/whisper-large-v3/final_models"

In [None]:
# The language handed to Whisper is looked up per corpus language in the
# config's "proxy_langs" table.
proxy_lang = config["proxy_langs"][lang]

# Processor for the base checkpoint, configured for transcription in that language.
processor_kwargs = {"language": proxy_lang, "task": "transcribe"}
processor = WhisperProcessor.from_pretrained(config["whisper_model"], **processor_kwargs)

In [None]:
# glob.glob returns matches in arbitrary order, so sort before slicing to make
# the selected `num_examples` clips reproducible across runs and machines.
data = sorted(glob.glob(audio_paths))[:num_examples]

In [None]:
# Load the fine-tuned model for the selected language.
model = get_model(config=config, model_dir=model_dir, lang=lang)

In [None]:
# Transcribe the selected clips, then normalise each transcript with `clean`.
raw_predictions, filepaths = generate(model, data, processor, proxy_lang)
predictions = list(map(clean, raw_predictions))

# One row per clip: file name (last "/"-separated path component) and cleaned transcript.
rows = [
    [filepaths[idx].split("/")[-1], text]
    for idx, text in enumerate(predictions)
]
lang_df = pd.DataFrame(rows, columns=["audio_file", "sentence"])

In [None]:
# Tab-separated reference transcripts shipped with the corpus for `lang`.
gold_tsv_path = f"/cs-data-01/drd92/mozilla-asr-challenge/mcv-sps-st-09-2025/sps-corpus-1.0-2025-09-05-{lang}/ss-corpus-{lang}.tsv"
gold_transcripts_df = pd.read_csv(gold_tsv_path, sep="\t")

In [None]:
# Play each clip and print the model prediction next to the gold transcript.
for i, datum in enumerate(data):
    audio_name = os.path.basename(datum)
    display(Audio(datum))
    # Row i of lang_df corresponds to data[i]: generate() keeps the input order.
    print('pred:', lang_df.loc[i, 'sentence'].strip())

    gold_matches = gold_transcripts_df.loc[
        gold_transcripts_df['audio_file'] == audio_name, 'transcription'
    ]
    if len(gold_matches) == 1:
        print('gold:', gold_matches.item())
    else:
        # .item() raises ValueError unless exactly one row matches; report the
        # problem and keep going so the remaining clips are still shown.
        print('gold:', f'<expected 1 gold transcript for {audio_name}, found {len(gold_matches)}>')