In [None]:
# Report the NVIDIA GPU(s) visible to this runtime before any model is loaded.
!nvidia-smi

In [None]:
!pip3 install faster-whisper ctranslate2 datasets sacrebleu -q

# Load the Dataset

In [None]:
from datasets import load_dataset

# Hugging Face Hub repository that holds the evaluation data.
dataset_name = "kreasof-ai/be-en-IWSLT2025"

# trust_remote_code=True permits running loading code shipped with the dataset repository.
dataset = load_dataset(
    dataset_name,
    trust_remote_code=True,
)

# Leave the loaded object as the last expression so the notebook displays it.
dataset

In [None]:
import numpy as np

# Handle on the test split; later cells index it directly (e.g. `test_dataset[0]`).
test_dataset = dataset["test"]

# Cast every decoded waveform to float32 (NumPy has no `float31` dtype).
# NOTE(review): the arrays are passed to faster-whisper as raw waveforms, which
# presumably must already be 16 kHz mono — confirm the dataset's sampling rate.
test_arrays = [a["array"].astype(np.float32) for a in test_dataset["audio"]]


# Preprocess Dataset

# Cascaded

## Transcribe (Whisper)

In [None]:
model_id = "kreasof-ai/whisper-small-be2en"
output_dir="ct2-whisper-small-transcription"

!ct2-transformers-converter \
--model {model_id} \
--output_dir {output_dir} \
--quantization float16 \
--copy_files tokenizer_config.json\
--force

In [None]:
from faster_whisper import WhisperModel

# Path of the converted CTranslate2 Whisper model (pattern: ct2-whisper-xxx).
model_name = "ct2-whisper-small-transcription"

# Load the converted model on the GPU with half-precision compute.
model = WhisperModel(
    model_name,
    device="cuda",
    compute_type="float16",
)

In [None]:
# Peek at the first test example. Index the split through `dataset` directly so the
# cell does not rely on a `test_dataset` variable that may not have been defined.
dataset["test"][0]

In [None]:
# Sanity check: transcribe only the first test waveform before running the full loop.
# NOTE(review): "be" is Whisper's language code for Belarusian; presumably the
# fine-tuned checkpoint reuses that token for Bemba — confirm against the model card.
tgt_lang = "be"

segments, info = model.transcribe(test_arrays[0],
                                  beam_size=5,
                                  language=tgt_lang,
                                  vad_filter=True)

# `segments` is consumed here; join the per-segment texts into one string.
# Keep the string in `transcription` (singular) so the list below is not overwritten.
transcription = " ".join(segment.text.strip() for segment in segments)
transcriptions = [transcription]

In [None]:
# Print the text currently held in `transcription`.
print(transcription)

In [None]:
from tqdm.auto import tqdm

# NOTE(review): "be" is Whisper's language code for Belarusian; presumably the
# fine-tuned checkpoint reuses that token for Bemba — confirm against the model card.
tgt_lang = "be"

# One transcription string per test waveform, in dataset order.
transcriptions = []

for audio_array in tqdm(test_arrays, total=len(test_arrays)):
    segments, info = model.transcribe(audio_array,
                                      beam_size=5,
                                      language=tgt_lang,
                                      vad_filter=True)
    # Join the per-segment texts of this utterance into a single string.
    transcription = " ".join(segment.text.strip() for segment in segments)
    # Append to the *list* (`transcriptions`), not to the string itself.
    transcriptions.append(transcription)

In [None]:
# Show the first 20 transcriptions, one per line (slice the list, not a single string).
print(*transcriptions[:20], sep="\n")

## MT (NLLB)

In [None]:
mt_model_name = "kreasof-ai/nllb-IWSLT2025-be-en"
output_mt = "ct2-nllb-be-en"
!ct2-transformers-converter --model {mt_model_name} --quantization float16 --output_dir {output_mt}

In [None]:
# Download the SentencePiece model file `flores200_sacrebleu_tokenizer_spm.model`
# into the current working directory.
!wget https://s3.amazonaws.com/opennmt-models/nllb-200/flores200_sacrebleu_tokenizer_spm.model

In [None]:
import os

# Directory the converter and wget cells wrote into.
# NOTE(review): assumes the notebook's working directory is /content (the Colab default).
directory = "/content/"

ct2_model_name = output_mt

# The converter wrote straight to `<cwd>/<output_mt>`; there is no intermediate "ct2" folder.
ct_model_path = os.path.join(directory, ct2_model_name)
# The downloaded SentencePiece file carries the ".model" extension.
sp_model_path = os.path.join(directory, "flores200_sacrebleu_tokenizer_spm.model")

In [None]:
import ctranslate2
import sentencepiece as spm
import torch

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# SentencePiece processor used to split sentences into pieces and join them back.
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# CTranslate2 translator backed by the converted MT model directory.
translator = ctranslate2.Translator(ct_model_path, device=device)

In [None]:
# Translate the ASR transcriptions with the CTranslate2 NLLB model.

# Language tags placed in front of the source pieces / forced as the first target token.
src_lang = "bem_Latn"
tgt_lang = "eng_Latn"

beam_size = 5

source_sentences = transcriptions

source_sents = [sent.strip() for sent in source_sentences]
# Build one independent prefix list per sentence instead of N aliases of the same list.
target_prefix = [[tgt_lang] for _ in source_sents]

# Subword the source sentences and wrap them as: <src_lang> pieces... </s>
source_sents_subworded = sp.encode_as_pieces(source_sents)
source_sents_subworded = [[src_lang] + sent + ["</s>"] for sent in source_sents_subworded]

# Translate the source sentences (batches are sized by token count)
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          target_prefix=target_prefix)
# Keep only the best hypothesis (a list of pieces) for each sentence.
translations = [translation.hypotheses[0] for translation in translations]

# Drop the forced target-language token at *token* level before decoding. Slicing a
# fixed number of characters off the decoded string would eat real text whenever a
# hypothesis does not start with the tag.
hypotheses_no_prefix = [hyp[1:] if hyp[:1] == [tgt_lang] else hyp for hyp in translations]

# Desubword the target sentences
translations_desubword = [sent.strip() for sent in sp.decode(hypotheses_no_prefix)]

print(*translations_desubword[:10], sep="\n")

In [None]:
import sacrebleu

# NOTE(review): `references` is not defined in any visible cell of this notebook; it
# must hold one reference translation per test example, in the same order as the
# hypotheses — confirm where it is built before relying on these scores.
translations = translations_desubword

# Corpus-level BLEU, rounded to two decimals.
bleu = round(sacrebleu.corpus_bleu(translations, [references]).score, 2)

# Corpus-level chrF++ (word_order=2 selects chrF++), rounded to two decimals.
chrf = round(sacrebleu.corpus_chrf(translations, [references], word_order=2).score, 2)

# Report: model label, then a tab-separated header and score row.
print(model_name)
print("BLEU\tChrF++")
print(f"{bleu}\t{chrf}")