In [None]:
!pip install --upgrade pip -q
!pip install -U unbabel-comet -q
!pip install ctranslate2 -q
!pip install datasets -q
!!pip3 install faster-whisper -q

In [None]:
!apt update
!apt install libcudnn9-cuda-12
!pip install nvidia-cublas-cu12 nvidia-cudnn-cu12==9.*

!export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`


In [5]:
from datasets import load_dataset
import numpy as np

In [1]:
import os

# Root folder under which every Hugging Face cache location is placed.
cache_dir = "/content/huggingface_cache"
os.makedirs(cache_dir, exist_ok=True)

# Environment variable -> sub-folder of `cache_dir` it is pointed at.
hf_cache_subdirs = {
    "TRANSFORMERS_CACHE": "transformers",
    "HF_DATASETS_CACHE": "datasets",
    "HF_HOME": "hf_home",
    "HF_ASSETS_CACHE": "assets",
    "HUGGINGFACE_HUB_CACHE": "hub",
    "HF_MODULES_CACHE": "modules",
}

# Export each variable and make sure the folder it names exists.
for env_name, subdir in hf_cache_subdirs.items():
    dir_path = os.path.join(cache_dir, subdir)
    os.environ[env_name] = dir_path
    os.makedirs(dir_path, exist_ok=True)

# NOTE(review): in top-to-bottom order the cell above has already imported `datasets`, so the
# environment variables may be read too late for it; the explicit override below covers the
# datasets cache path only.
from datasets import config
config.HF_DATASETS_CACHE = os.environ["HF_DATASETS_CACHE"]

### Restart the runtime after installing `unbabel-comet`

# Transcription

In [None]:
# Dataset identifier; only its "test" split is loaded.
dataset_name = "kreasof-ai/bem-eng-IWSLT2025"
test_dataset = load_dataset(
    dataset_name,
    split="test",
    trust_remote_code=True,  # allows the dataset repository's own loading code to run
)

In [None]:
# Display the loaded test split to inspect it.
test_dataset

In [None]:
from datasets import Audio

# The waveforms are handed to faster-whisper as bare arrays further down, i.e. without any
# sampling-rate metadata, so decode every clip at the 16 kHz rate Whisper models expect rather
# than relying on whatever rate the dataset happens to be stored at.
WHISPER_SAMPLING_RATE = 16_000
test_dataset_16k = test_dataset.cast_column("audio", Audio(sampling_rate=WHISPER_SAMPLING_RATE))

# One float32 waveform per test item, in dataset order.
audio_arrays = [a["array"].astype(np.float32) for a in test_dataset_16k["audio"]]

In [None]:
model_id = "kreasof-ai/whisper-medium-bem2en"
output_dir="ct2-whisper-medium-transcription-finetuned"
commit_hash= "2b91ce20bd264d43947d18db44d7d08e84ae49ee"

!ct2-transformers-converter \
--model {model_id} \
--output_dir {output_dir} \
--revision {commit_hash} \
--quantization float16 \
--copy_files tokenizer_config.json\
--force

In [None]:
from faster_whisper import WhisperModel

# Load the converted model from its local folder, on GPU with float16 compute.
model_name = output_dir
model = WhisperModel(
    model_name,
    device="cuda",
    compute_type="float16",
)

In [None]:
# Try the model on the first clip only before running the whole test set.
tgt_lang = "en"
transcriptions = []

segments, info = model.transcribe(
    audio_arrays[0],
    beam_size=5,
    language=tgt_lang,
    vad_filter=True,
)
# `segments` is consumed here; the stripped segment texts are joined into one string.
transcription = " ".join(segment.text.strip() for segment in segments)
transcriptions.append(transcription)

In [None]:
from tqdm.auto import tqdm

tgt_lang = "en"
# Decoding options shared by every clip.
decode_options = dict(beam_size=5, language=tgt_lang, vad_filter=True)

# Start from an empty list so re-running the cell does not accumulate duplicates.
transcriptions = []
for audio_array in tqdm(audio_arrays, total=len(audio_arrays)):
    segments, info = model.transcribe(audio_array, **decode_options)
    # One output string per clip: the stripped segment texts joined by single spaces.
    transcription = " ".join(segment.text.strip() for segment in segments)
    transcriptions.append(transcription)

In [None]:
# The transcripts become the source side of the translation stage below
# (this is the same list object as `transcriptions`, not a copy).
source_sentences = transcriptions

# Translation

In [None]:
# mt_model_name = "kreasof-ai/nllb-200-distilled-600M-bem2en-flores200"
# output_mt = "ct2-nllb-be-en-finetuned"
# commit_hash = "3dc4fe6449ec5ac06e45fcebc26e5221c6a8d7f5"
# !ct2-transformers-converter --model {mt_model_name} --revision {commit_hash} --quantization float16 --output_dir {output_mt} --force

In [None]:
mt_model_name = "kreasof-ai/nllb-200-distilled-600M-bem2en-flores200"
output_mt = "ct2-nllb-be-en-finetuned"
commit_hash = "b7ab3b4345d080c83dd98b9c51d970d8d27dd18b"
!ct2-transformers-converter --model {mt_model_name} --revision {commit_hash} --quantization float16 --output_dir {output_mt} --force

In [None]:
!wget https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model

In [None]:
import os

# Folder that is expected to contain both the converted translation model and the
# SentencePiece model file.
directory = "/content/"
ct2_model_name = output_mt

# Absolute paths handed to the CTranslate2 translator and the SentencePiece processor.
ct_model_path, sp_model_path = (
    os.path.join(directory, leaf)
    for leaf in (ct2_model_name, "flores200_sacrebleu_tokenizer_spm.model")
)

In [None]:
import ctranslate2
import sentencepiece as spm
import torch

# Run translation on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Subword tokenizer for the translation model.
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# CTranslate2 translator backed by the converted model folder.
translator = ctranslate2.Translator(ct_model_path, device=device)

In [None]:
# Translate

# src_lang = "ben_Beng"
# tgt_lang = "eng_Latn"

# Language tags: the source tag is prepended to every input, the target tag is forced as the
# first output token.
src_lang = "bem_Latn"
tgt_lang = "eng_Latn"
beam_size = 5

source_sents = [text.strip() for text in source_sentences]
target_prefix = [[tgt_lang] for _ in source_sents]

# Subword each sentence and wrap it as: <source tag> pieces... </s>
source_sents_subworded = [
    [src_lang, *pieces, "</s>"] for pieces in sp.encode_as_pieces(source_sents)
]

# Batched beam-search translation; keep the top hypothesis of every sentence.
results = translator.translate_batch(
    source_sents_subworded,
    batch_type="tokens",
    max_batch_size=2024,
    beam_size=beam_size,
    target_prefix=target_prefix,
)
translations = [result.hypotheses[0] for result in results]

# Detokenize, then cut the leading target-language tag off each decoded string.
tag_length = len(tgt_lang)
translations_desubword = [decoded[tag_length:].strip() for decoded in sp.decode(translations)]

# Preview the first ten translations, one per line.
print("\n".join(translations_desubword[:10]))

In [None]:
# translations_desubword

In [None]:
# NOTE(review): `references` is not assigned in any visible cell. It must hold one reference
# translation per test item (presumably a text column of `test_dataset` — confirm which one)
# before this cell can run on a fresh kernel.

# zip() stops at the shortest input, so fail loudly on a length mismatch instead of silently
# dropping items from the evaluation.
assert len(source_sentences) == len(translations_desubword) == len(references), (
    "source sentences, translations and references must have the same length"
)

# One record per test item with the "src" / "mt" / "ref" keys passed to the scorer below.
lst_data = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(source_sentences, translations_desubword, references)
]

In [None]:
# Peek at the first two src/mt/ref records.
lst_data[:2]

In [None]:
# Number of records that will be scored.
len(lst_data)

In [None]:
from comet import download_model, load_from_checkpoint

In [None]:
import torch

# Download the `masakhane/africomet-mtl` checkpoint and load it as the scoring model.
model_path = download_model("masakhane/africomet-mtl")
# NOTE(review): this rebinds `model`, which the transcription cells above use for the Whisper
# model; re-running those cells after this one requires reloading the Whisper model first.
model = load_from_checkpoint(model_path)

# Each sample is a dict with "src", "mt" and "ref" keys, e.g.
#     {
#         "src": "Kensha wa ndeke aishibikwe nge ntungulushi ya Squadron Dilokrit Pattavee.",
#         "mt": "The pilot was identified as Squadron Leader Dilokrit Pattavee.",
#         "ref": "The pilot was identified as Squadron Leader Dilokrit Pattavee."
#     }
data = lst_data

# Score on one GPU when torch can see one; otherwise request zero GPUs so the cell runs on CPU
# instead of depending on a hard-coded `gpus=1`.
comet_gpus = 1 if torch.cuda.is_available() else 0
model_output = model.predict(data, batch_size=8, gpus=comet_gpus)
print(model_output)

In [None]:
# Pull the "system_score" entry out of the prediction output.
sys_score = model_output["system_score"]

In [None]:
# Report the system score scaled by 100 and rounded to two decimals.
print(f"COMET: {round(sys_score*100, 2)}")

In [None]:
# Show the unrounded system score.
sys_score

In [None]:
# Scratch check comparing two hard-coded scores (presumably system scores from two runs — TODO confirm).
0.5829142446651789 >0.5828630467324625