In [None]:
# Query the NVIDIA driver to confirm a GPU is attached before installing anything.
!nvidia-smi

In [None]:
!apt update
!apt install libcudnn9-cuda-12
!pip install nvidia-cublas-cu12 nvidia-cudnn-cu12==9.*

!export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`


In [None]:
!pip3 install faster-whisper ctranslate2 datasets sacrebleu -q
!pip install -U unbabel-comet jiwer -q

In [4]:
from datasets import load_dataset, Dataset, Audio
import numpy as np

In [5]:
import os

# Root folder under which every Hugging Face cache is placed.
cache_dir = "/content/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True)

# Environment variable -> sub-folder of `cache_dir` it should point to.
_hf_cache_subdirs = {
    "TRANSFORMERS_CACHE": "transformers",
    "HF_DATASETS_CACHE": "datasets",
    "HF_HOME": "hf_home",
    "HF_ASSETS_CACHE": "assets",
    "HUGGINGFACE_HUB_CACHE": "hub",
    "HF_MODULES_CACHE": "modules",
}

# Export each variable and make sure its target directory exists.
for _env_name, _subdir in _hf_cache_subdirs.items():
    dir_path = os.path.join(cache_dir, _subdir)
    os.environ[_env_name] = dir_path
    os.makedirs(dir_path, exist_ok=True)

# `datasets` was already imported by an earlier cell, so its config module is
# patched directly as well to make it use the new cache location.
from datasets import config

config.HF_DATASETS_CACHE = os.environ["HF_DATASETS_CACHE"]

# Load the Dataset

In [None]:
# Hub dataset identifier; only its "test" split is loaded.
dataset_name = "kreasof-ai/bem-eng-IWSLT2025"
test_dataset = load_dataset(
    dataset_name,
    split="test",
    trust_remote_code=True,
)

In [None]:
# Display the loaded split's repr as the cell output.
test_dataset

In [None]:
# Look at the audio column of the first example only.
sample = next(iter(test_dataset), None)
if sample is not None:
    print(sample["audio"])

# Create audio arrays and references

In [None]:
# Reference translations, one entry per test example.
# Materialize as a plain list: depending on the installed `datasets` version,
# column access may return a lazy column object rather than a list.
references = list(test_dataset["translation"])

# Confirm the container type and size, and preview a few entries instead of
# dumping the whole split into the cell output.
print(type(references).__name__, len(references))
print(references[:5])

In [10]:
# Pull the "array" entry out of every item in the audio column and cast it to float32.
audio_arrays = [
    audio["array"].astype(np.float32)
    for audio in test_dataset["audio"]
]

In [11]:

# audio_arrays

In [12]:
# Every audio clip needs exactly one reference; report both counts on failure.
assert len(audio_arrays) == len(references), (
    f"audio/reference count mismatch: {len(audio_arrays)} != {len(references)}"
)

In [None]:
# Number of audio clips collected from the test split.
len(audio_arrays)

In [None]:
# Number of reference entries; compare with the audio clip count shown above it.
len(references)

# Cascaded

## Transcribe

In [15]:
# model_id = "kreasof-ai/whisper-medium-bem2en"
# output_dir="ct2-whisper-medium-transcription-finetuned"
# commit_hash= "2b91ce20bd264d43947d18db44d7d08e84ae49ee"

# !ct2-transformers-converter \
# --model {model_id} \
# --output_dir {output_dir} \
# --revision {commit_hash} \
# --quantization float16 \
# --copy_files tokenizer_config.json\
# --force

In [None]:
model_id = "kreasof-ai/whisper-medium-bem2en"
output_dir="ct2-whisper-medium-transcription-finetuned"
commit_hash = "46d8836114c5369ffee656a67a06e65ba68cb77d"

!ct2-transformers-converter \
--model {model_id} \
--output_dir {output_dir} \
--quantization float16 \
--copy_files tokenizer_config.json\
--force

In [92]:
from faster_whisper import WhisperModel

# The converted model directory is also used as the model's display name.
model_name = output_dir

# Load the converted model on the GPU with float16 compute.
model = WhisperModel(
    model_name,
    device="cuda",
    compute_type="float16",
)

In [None]:
# Display the first test example in full.
test_dataset[0]

In [94]:
tgt_lang = "en"

# Smoke test: run the model on the first clip only before processing everything.
segments, info = model.transcribe(
    audio_arrays[0],
    beam_size=5,
    language=tgt_lang,
    vad_filter=True,
)

# Join the stripped text of all returned segments into one line.
transcription = " ".join(segment.text.strip() for segment in segments)
transcriptions = [transcription]

In [None]:
# Print the list of transcriptions collected so far.
print(transcriptions)

In [None]:
from tqdm.auto import tqdm

tgt_lang = "en"

# Decoding settings shared by every clip.
decode_options = dict(beam_size=5, language=tgt_lang, vad_filter=True)

# Start from an empty list so re-running the cell does not accumulate duplicates.
transcriptions = []

for audio_array in tqdm(audio_arrays, total=len(audio_arrays)):
    segments, info = model.transcribe(audio_array, **decode_options)
    # One output line per clip: the stripped text of all its segments, space-joined.
    transcription = " ".join(segment.text.strip() for segment in segments)
    transcriptions.append(transcription)

In [None]:
# Preview the first 20 transcriptions, one per line.
print("\n".join(transcriptions[:20]))

## Evaluation

In [98]:
# with open("medium-asr-finetuned.txt") as f:
#     transcriptions = f.read().splitlines()
# transcriptions[:4]

In [99]:
# From here on `references` holds the "sentence" column, replacing the
# "translation" values assigned to the same name earlier in the notebook.
# Materialize as a plain list: depending on the installed `datasets` version,
# column access may return a lazy column object, while the WER computation
# further down is fed this value directly.
references = list(test_dataset["sentence"])

In [None]:
# Stop a Run-All immediately if hypotheses and references are misaligned,
# instead of only displaying True/False.
assert len(references) == len(transcriptions), (
    f"reference/transcription count mismatch: "
    f"{len(references)} != {len(transcriptions)}"
)

In [None]:
# First four reference entries, for a visual comparison with the hypotheses.
references[:4]

In [None]:
# First four model transcriptions, for a visual comparison with the references.
transcriptions[:4]

In [None]:
import jiwer

# Word error rate of the transcriptions against the references, expressed as a
# percentage rounded to two decimals.
wer = round(jiwer.wer(references, transcriptions) * 100, 2)

# Report which model the score belongs to, then the score itself.
print(model_name, "WER", wer, sep="\n")

In [50]:
# with open(f"whisper-medium-baseline.txt", "w") as f:
#     for transcription in transcriptions:
#         f.write(transcription + "\n")