In [1]:
!pip install evaluate openai-whisper datasets
!pip install sacrebleu unbabel-comet
!pip install -U bitsandbytes

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinu

In [2]:
from whisper.normalizers.basic import BasicTextNormalizer
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset CSV loaded later lives
# under this mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
from datasets import load_dataset

# The evaluation set is a single CSV on the mounted Drive. No split mapping is
# given, and the printed DatasetDict shows the rows under the "train" split.
asr_csv_path = "/content/drive/My Drive/dataset_asr.csv"
raw_datasets = load_dataset("csv", data_files=asr_csv_path)
print(raw_datasets)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['transcription', 'sentence', 'translation', 'hypothesis_clean', 'sentence_clean', 'translation_clean'],
        num_rows: 1000
    })
})


In [5]:
# Show the column schema of the "train" split (column name -> feature type).
raw_datasets["train"].features

{'transcription': Value(dtype='string', id=None),
 'sentence': Value(dtype='string', id=None),
 'translation': Value(dtype='string', id=None),
 'hypothesis_clean': Value(dtype='string', id=None),
 'sentence_clean': Value(dtype='string', id=None),
 'translation_clean': Value(dtype='string', id=None)}

In [6]:
# Print the first example column by column. Each line is a one-element list
# because the row is selected with the slice [:1].
first_row = raw_datasets["train"][:1]
for column_name in (
    "transcription",
    "sentence",
    "translation",
    "hypothesis_clean",
    "sentence_clean",
    "translation_clean",
):
    print(first_row[column_name])

[' Ey, ey, ég vel hvort ey.']
['Ei, ei ja veelkord ei.']
['No, no, and again, no.']
[' ey ey ég vel hvort ey ']
['ei ei ja veelkord ei ']
['no no and again no ']


In [7]:
from transformers import AutoTokenizer

# Model checkpoint and translation direction. The language codes are written
# out literally (no flores200_codes lookup is used).
checkpoint = "facebook/nllb-200-distilled-600M"
src_code = "est_Latn"
tgt_code = "eng_Latn"

# Length limit reused later for tokenization and generation.
max_tok_length = 275

# Options forwarded to `from_pretrained` exactly as given.
# NOTE(review): padding / pad_to_multiple_of / truncation / max_length are
# normally call-time arguments - confirm they have any effect when passed at
# load time; later cells pass padding/truncation explicitly on each call.
tokenizer_options = dict(
    padding=True,
    pad_to_multiple_of=8,
    src_lang=src_code,
    tgt_lang=tgt_code,
    truncation=False,
    max_length=max_tok_length,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, **tokenizer_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [8]:
def preprocess_function(sample):
    """Tokenize one batch of examples for the seq2seq model.

    Args:
        sample: Mapping with a "transcription" entry (passed to the tokenizer
            as source text) and a "translation" entry (passed as
            ``text_target``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for that pair of
        inputs, unchanged.
    """
    source_texts = sample["transcription"]
    target_texts = sample["translation"]
    return tokenizer(source_texts, text_target=target_texts)

In [9]:
# Tokenize every split in batches. Later cells read both the original
# "transcription" column and the "labels" column from the result.
tokenized_datasets = raw_datasets.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
!pip install -U bitsandbytes



In [11]:
import torch
from transformers import BitsAndBytesConfig

# 4-bit loading: NF4 quantization type, double quantization enabled, and
# bfloat16 as the compute dtype.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
quantization_config = BitsAndBytesConfig(**bnb_options)

In [12]:
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq checkpoint with the 4-bit quantization settings built in
# the previous cell.
model_load_kwargs = {"quantization_config": quantization_config}
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **model_load_kwargs)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Inference

In [13]:
from transformers import GenerationConfig

# Start from the generation defaults published with the checkpoint and echo
# them so the values in effect are recorded next to the results.
generation_config = GenerationConfig.from_pretrained(checkpoint)
print(generation_config)

GenerationConfig {
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "eos_token_id": 2,
  "max_length": 200,
  "pad_token_id": 1
}



In [14]:
# Group the tokenized rows into batches of `test_batch_size` examples for
# generation.
test_batch_size = 32
train_tokenized = tokenized_datasets["train"]
batch_tokenized_test = train_tokenized.batch(batch_size=test_batch_size)

Batching examples:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
# Translate every batch of transcriptions with greedy decoding
# (num_beams=1, do_sample=False) and collect the generated id sequences.
#
# Loop-invariant work is done once up front:
#   * the batched "transcription" column is read a single time - indexing the
#     dataset by column name inside the loop may re-materialize the whole
#     column on every iteration;
#   * the id of the target-language token forced as first decoder token is
#     looked up once.
#
# NOTE(review): the first decoded prediction displayed further down is a single
# word repeated until the length limit. Decoding settings are deliberately left
# unchanged here, but they may be worth revisiting.
transcription_batches = batch_tokenized_test["transcription"]
number_of_batches = len(transcription_batches)
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_code)

output_sequences = []
for batch_texts in transcription_batches:
    # Pad to the longest sentence of this batch; no truncation is applied.
    inputs = tokenizer(
        batch_texts,
        max_length=max_tok_length,
        truncation=False,
        return_tensors="pt",
        padding=True,
    )
    output_batch = model.generate(
        generation_config=generation_config,
        input_ids=inputs["input_ids"].cuda(),
        attention_mask=inputs["attention_mask"].cuda(),
        forced_bos_token_id=forced_bos_token_id,
        max_length=max_tok_length,
        num_beams=1,
        do_sample=False,
    )
    # extend() iterates over the first dimension, so one tensor per sentence
    # is appended, keeping a flat list across batches.
    output_sequences.extend(output_batch.cpu())



In [16]:
# Turn the generated id sequences back into text, dropping special tokens.
decoded_preds = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

# Take the gold translations straight from the dataset instead of decoding the
# tokenized labels: an encode/decode round trip through the model tokenizer can
# drop or alter characters, and the metrics should score against the untouched
# reference text. Row order matches the predictions because the tokenized and
# batched datasets were derived from this split.
references = list(raw_datasets["train"]["translation"])

In [17]:
# Spot-check the first prediction.
# NOTE(review): the recorded output for this cell is one word repeated over and
# over, i.e. generation degenerated for this example.
decoded_preds[:1]

['Hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey, hey']

In [18]:
# First two reference translations, for comparison with the predictions.
references[:2]

['No, no, and again, no.',
 'All these problems had made Sirli consider ending her life.']

In [19]:
# Source-side transcriptions of the first two examples (rows are sliced before
# the column is selected, so only two rows are read).
raw_datasets["train"][:2]["transcription"]

[' Ey, ey, ég vel hvort ey.',
 ' Kõik kirjeldatud probleimid on Sirlele pähe toonud ka elust loobumise mõtteid.']

In [20]:
from whisper.normalizers.basic import BasicTextNormalizer

# Run hypotheses and references through the same Whisper basic text normalizer
# so both sides are scored in the same surface form.
normalizer = BasicTextNormalizer()
decoded_preds_clean = list(map(normalizer, decoded_preds))
references_clean = list(map(normalizer, references))

In [21]:
from evaluate import load

# Load the SacreBLEU metric through the `evaluate` library.
metric = load(path="sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [22]:
# BLEU over the normalized hypothesis/reference pairs.
result = metric.compute(
    references=references_clean,
    predictions=decoded_preds_clean,
)
print(f'BLEU score: {result["score"]:.1f}')

BLEU score: 22.1


In [23]:
from evaluate import load

# Load the COMET metric through the `evaluate` library (the recorded log shows
# the Unbabel/wmt22-comet-da checkpoint being fetched).
comet_metric = load(path="comet")

Downloading builder script:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [24]:
# COMET scores each hypothesis against its reference and its source text.
# NOTE(review): hypotheses and references are the normalized strings while the
# sources are the raw transcriptions - confirm this mix is intended.
comet_inputs = {
    "predictions": decoded_preds_clean,
    "references": references_clean,
    "sources": raw_datasets["train"]["transcription"],
}
comet_score = comet_metric.compute(**comet_inputs)
print(f"COMET: {comet_score['mean_score'] * 100:.2f} %")

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


COMET: 75.20 %
