# Pretraining for ASR

In [3]:
# installing libs
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip3 install datasets transformers soundfile jiwer evaluate librosa --index-url https://pypi.org/simple

In [5]:
import os

# Default to GPU 3, but only when the caller has not already chosen devices: an
# externally exported CUDA_VISIBLE_DEVICES is respected instead of being overwritten.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")
# Set before the `datasets` / `transformers` imports below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import re
import torch
import torch.nn as nn
import numpy as np

from datasets import load_dataset, disable_caching
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Encoder

## Finetuning Wav2Vec2 model on CTC loss (5 points)


In this task you have to create a pipeline for fine-tuning a pretrained multilingual Wav2Vec2 model on Belarusian audio from the [Fleurs](https://huggingface.co/datasets/google/fleurs) dataset.

#### Prepare data

In [6]:
# Load the Belarusian ("be_by") FLEURS config. Three splits are requested at once;
# the result is indexed / unpacked below in the same order: train, validation, test.
fleurs = load_dataset("google/fleurs", "be_by", split=["train", "validation", "test"])

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [7]:
fleurs[0]["transcription"][9]

'вышыня двух пілонаў складае 83 метры даўжыня моста - 378 метраў праезная частка складаецца з дзвюх палос шырыня кожнай - 3,50 м'

In this task, you should:

* filter out all samples whose `transcription` includes digits. Hint: take care of the specific Belarusian symbols "і", "ў";
* remove punctuation from `transcription`.

In [8]:
import string

train, val, test = fleurs

# `\d` on a str pattern matches any Unicode decimal digit, not just ASCII 0-9.
digit_re = re.compile(r"\d")
# ASCII punctuation plus typographic quotes, dashes (em and en) and the ellipsis.
punct_table = str.maketrans("", "", string.punctuation + "«»—–…“”„‘’")


def clean_text(text: str) -> str:
    """Strip punctuation from `text` and normalise the whitespace left behind.

    Deleting a free-standing mark (e.g. the dash in "a - b") would otherwise leave
    two adjacent spaces in the label text, so runs of whitespace are collapsed to a
    single space and leading/trailing whitespace is dropped.

    Args:
        text: raw transcription.

    Returns:
        The transcription without punctuation, words separated by single spaces.
    """
    without_punct = text.translate(punct_table)
    return " ".join(without_punct.split())


def filter_no_digits(batch):
    """Predicate for `Dataset.filter`: True when the transcription has no digit."""
    return digit_re.search(batch["transcription"]) is None


def remove_punct(batch):
    """Mapper for `Dataset.map`: replace `transcription` with its cleaned form.

    The row is updated in place and returned, so every other column is preserved.
    """
    batch["transcription"] = clean_text(batch["transcription"])
    return batch

In [9]:
print(test)

# Pull the sample out first: reusing double quotes inside a double-quoted f-string
# (f"{row["key"]}") is only valid syntax on Python 3.12+.
sample = test[0]
sample_text = sample["transcription"]
print(f"before clean: {sample_text}")
print(f"after clean: {clean_text(sample_text)}")
print(filter_no_digits(sample))

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 967
})
before clean: дэль потра меў перавагу ў пачатку другога сэту аднак калі лік стаў 6-6 спатрэбіўся тай-брэйк
after clean: дэль потра меў перавагу ў пачатку другога сэту аднак калі лік стаў 66 спатрэбіўся тайбрэйк
False


In [13]:
# Index `fleurs` directly instead of going through `train` / `val`: those names are
# rebound to the feature-extracted datasets further down the notebook, so re-running
# this cell afterwards would otherwise operate on the wrong datasets.
raw_train, raw_val = fleurs[0], fleurs[1]

# Drop samples whose transcription contains digits, then strip punctuation.
preprocessed_train = raw_train.filter(filter_no_digits).map(remove_punct)
preprocessed_val = raw_val.filter(filter_no_digits).map(remove_punct)

print(len(raw_train), len(preprocessed_train))

2433 1927


#### Train tokenizer

Here you should train your own BPE tokenizer on texts from the Fleurs dataset using the [HuggingFace tokenizer](https://huggingface.co/docs/tokenizers/en/training_from_memory).

In [14]:
from tokenizers import Tokenizer, models, trainers, normalizers, pre_tokenizers, decoders

# Reserved vocabulary entries. The id of PAD_TOKEN is later handed to the CTC model
# as `pad_token_id` and used as the blank symbol when decoding.
PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN = "[PAD]", "[BOS]", "[EOS]", "[UNK]"
special_tokens = [PAD_TOKEN, BOS_TOKEN, EOS_TOKEN, UNK_TOKEN]
VOCAB_SIZE = 1000

# Byte-level BPE: NFKC-normalise the text, split it with the byte-level
# pre-tokenizer (no prefix space added) and decode with the matching decoder.
bpe_model = models.BPE(unk_token=UNK_TOKEN)
tokenizer = Tokenizer(bpe_model)
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()

# The trainer is only configured here; the merges are learned in the next cell.
bpe_trainer = trainers.BpeTrainer(special_tokens=special_tokens, vocab_size=VOCAB_SIZE)

In [15]:
# Learn the BPE vocabulary from the cleaned training transcriptions only
# (validation/test text is not used), streamed through a generator.
tokenizer.train_from_iterator(
    (x["transcription"] for x in preprocessed_train), trainer=bpe_trainer
)

In [16]:
# Round-trip sanity check on one training transcription: show the text, its tokens,
# and the text obtained by decoding the token ids back.
print(preprocessed_train[0]["transcription"])
encoded = tokenizer.encode(preprocessed_train[0]["transcription"])
# Tokens are printed in the byte-level alphabet, so they do not read as Cyrillic.
print(encoded.tokens)
print(tokenizer.decode(encoded.ids, skip_special_tokens=False))

у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі
['Ñĥ', 'ĠÑĤÐ¾Ð¹', 'ĠÐ¶', 'Ð°', 'ĠÑĩÐ°Ñģ', 'ĠÐ¿', 'Ð°Ð±Ð»Ñĸ', 'Ð·Ñĥ', 'ĠÐ°Ð´', 'ĠÐ²ÐµÑĢ', 'Ð°Ð³', 'Ð¾Ð´', 'Ð½ÑĭÑħ', 'ĠÐ¼Ð°ÑĢ', 'ÑĪ', 'ÑĢÑĥ', 'ÑĤÐ°Ñŀ', 'ĠÑĥ', 'Ð²Ð°ÑĢ', 'Ð²', 'Ð°Ð½Ð½Ñı', 'ĠÐ±', 'Ð°Ð·', 'ÑĸÑĢ', 'Ð°Ð²', 'Ð°Ð»Ð°ÑģÑı', 'ĠÐ²ÐµÐ»ÑĮÐ¼Ñĸ', 'ĠÐ¼', 'Ð°Ð»Ð°', 'ĠÐºÐ°ÑĢ', 'Ð°Ð±', 'Ð»', 'Ñĳ', 'Ñŀ', 'ĠÐºÐ°ÑĢ', 'Ð°Ð»Ðµ', 'Ñŀ', 'ÑģÐºÐ°Ð³Ð°', 'ĠÑĦ', 'Ð»Ð¾', 'ÑĤÑĥ', 'ĠÑĤÐ°Ð¼Ñĥ', 'ĠÑĪÑĤÐ¾', 'ĠÐ°Ð´', 'Ð¼Ñĸ', 'ÑĢ', 'Ð°Ð»', 'Ñĭ', 'ĠÐ°Ñģ', 'ÑĨ', 'ÐµÑĢ', 'Ð°Ð³', 'Ð°Ð»ÑĸÑģÑı', 'ĠÑĸÑħ', 'ĠÐ¿Ð°ÑĤ', 'Ð°Ð¿', 'Ð»ÐµÐ½Ð½Ñı', 'ĠÐ½Ñı', 'Ð¼Ðµ', 'ÑĨ', 'ÐºÑĸ', 'Ð¼Ñĸ', 'ĠÐ¿', 'Ð°Ð²Ðµ', 'ÑĤ', 'ÑĢÐ°Ð½', 'ÑĭÐ¼Ñĸ', 'ĠÑģÑĸ', 'Ð»', 'Ð°Ð¼Ñĸ']
у той жа час паблізу ад верагодных маршрутаў уварвання базіравалася вельмі мала караблёў каралеўскага флоту таму што адміралы асцерагаліся іх патаплення нямецкімі паветранымі сіламі


#### Loading model and preprocessor

In [17]:
from transformers import Wav2Vec2FeatureExtractor

# Both the feature extractor and the model weights come from the same checkpoint.
PRETRAINED_CHECKPOINT = "facebook/wav2vec2-xls-r-300m"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(PRETRAINED_CHECKPOINT)

# Config overrides for the CTC head: it is sized to the BPE vocabulary trained
# above, and the tokenizer's PAD id is passed as the model's `pad_token_id`.
ctc_config_overrides = dict(
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.token_to_id(PAD_TOKEN),
    vocab_size=tokenizer.get_vocab_size(),
)
model = Wav2Vec2ForCTC.from_pretrained(PRETRAINED_CHECKPOINT, **ctc_config_overrides)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Data processor and data collator 

In [14]:
preprocessed_train[0]["audio"]

{'path': 'train/10009414287632395082.wav',
 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00031281,
        -0.00038069, -0.00132966]),
 'sampling_rate': 16000}

In [18]:
class CtcDataProcessor:
    """Row-level mapper that turns a raw sample into model inputs and CTC labels.

    Attributes:
        tokenizer: object whose `encode(text)` returns an encoding with an `ids` list.
        feature_extractor: callable taking `(array, sampling_rate=...)` and returning
            a mapping with an `input_values` entry.
    """

    def __init__(self, tokenizer, feature_extractor):
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor

    def __call__(self, row):
        """Convert one dataset row.

        Args:
            row: mapping with an `audio` entry (holding `array` and `sampling_rate`)
                and a `transcription` string.

        Returns:
            A new dict with exactly two keys: `input_values` (the first item of the
            feature extractor's `input_values` output) and `labels` (token ids of
            the transcription). No other column of `row` is copied over.
        """
        audio = row["audio"]
        extracted = self.feature_extractor(
            audio["array"], sampling_rate=audio["sampling_rate"]
        )
        encoding = self.tokenizer.encode(row["transcription"])

        return dict(
            input_values=extracted["input_values"][0],
            labels=encoding.ids,
        )

In [19]:
data_processor = CtcDataProcessor(tokenizer, feature_extractor)
# NOTE: `train` and `val` are rebound here. From this cell on they hold the processed
# datasets (all original columns are removed, leaving what `data_processor` returns),
# no longer the raw FLEURS splits unpacked earlier in the notebook.
train = preprocessed_train.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_train.column_names)
val = preprocessed_val.map(data_processor, keep_in_memory=True, remove_columns=preprocessed_val.column_names)

Map:   0%|          | 0/1927 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

In [20]:
from dataclasses import dataclass
from typing import Dict, List, Union

@dataclass
class CTCDataCollator:
    """Collate processed samples into one padded batch for CTC training.

    Audio inputs are padded by the feature extractor; label sequences are
    right-padded to the longest one in the batch with `label_pad_token_id`.
    """

    # Used only for its `pad` method, which pads `input_values`.
    feature_extractor: Wav2Vec2FeatureExtractor
    # Forwarded unchanged to `feature_extractor.pad(padding=...)`.
    padding: Union[bool, str] = True
    # Fill value for label padding.
    label_pad_token_id: int = -100

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of samples.

        Args:
            features: samples, each holding `input_values` and `labels`. Labels may
                be a list of ints, a numpy array or a tensor.

        Returns:
            The mapping returned by `feature_extractor.pad`, with a `labels` entry
            added: a long tensor of shape (batch, longest label sequence).
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # `as_tensor` accepts lists, arrays and tensors alike, so the labels no longer
        # have to be plain Python lists for the padding step to work.
        label_tensors = [
            torch.as_tensor(feature["labels"], dtype=torch.long) for feature in features
        ]
        batch["labels"] = torch.nn.utils.rnn.pad_sequence(
            label_tensors,
            batch_first=True,
            padding_value=self.label_pad_token_id,
        )

        return batch

#### Inference and metrics computing

Here you should use a simple greedy strategy for CTC output decoding. 

Hint: Don't forget about padding value -100 in reference.

Hint: Don't forget about CTC output format.

In [21]:
from itertools import groupby
from evaluate import load

wer_metric = load("wer")

class MetricsComputer:
    """Greedy CTC decoding followed by WER, usable as `Trainer(compute_metrics=...)`.

    Args:
        tokenizer: object with `decode(ids)` (and `token_to_id` when `blank_id` is
            not given). Defaults to the notebook-level `tokenizer`.
        blank_id: id of the CTC blank symbol. Defaults to the id of `PAD_TOKEN`.
        metric: object with `compute(predictions=..., references=...)`. Defaults to
            the notebook-level `wer_metric`.

    All defaults are resolved at call time, so `MetricsComputer()` keeps working
    exactly as before.
    """

    # Padding value found in the references (and in padded prediction frames).
    IGNORE_INDEX = -100

    def __init__(self, tokenizer=None, blank_id=None, metric=None):
        self._tokenizer = tokenizer
        self._blank_id = blank_id
        self._metric = metric

    def __call__(self, pred):
        """
            Input: object with fields `predictions` for CTC model output and `label_ids` for tokenized reference;
            Output: dict with key `wer` and computed wer
        """
        tok = self._tokenizer if self._tokenizer is not None else tokenizer
        blank_id = self._blank_id if self._blank_id is not None else tok.token_to_id(PAD_TOKEN)
        metric = self._metric if self._metric is not None else wer_metric

        preds_logits = pred.predictions
        # Some models return several outputs; the logits are then the first item.
        if isinstance(preds_logits, tuple):
            preds_logits = preds_logits[0]
        preds_logits = np.asarray(preds_logits)

        pred_ids = np.argmax(preds_logits, axis=-1)
        # Frames that are -100 across the whole vocabulary are padding added when
        # batches of different length are concatenated. Their argmax is 0, which is
        # only harmless while the blank id happens to be 0, so mark them explicitly.
        padded_frames = (preds_logits == self.IGNORE_INDEX).all(axis=-1)
        pred_ids[padded_frames] = blank_id

        # CTC greedy rule: merge consecutive repeats first, then drop blanks.
        # Ids are cast to plain ints before being handed to the tokenizer.
        batch_pred_ids = [
            [int(token) for token, _ in groupby(seq) if token != blank_id]
            for seq in pred_ids
        ]
        batch_label_ids = [
            [int(token) for token in seq if token != self.IGNORE_INDEX]
            for seq in pred.label_ids
        ]

        pred_str = [tok.decode(ids) for ids in batch_pred_ids]
        label_str = [tok.decode(ids) for ids in batch_label_ids]

        print(f"Prediction: '{pred_str[0]}'")
        print(f"Reference: '{label_str[0]}'")

        wer = metric.compute(predictions=pred_str, references=label_str)
        print(f"WER: {wer:.4f}")
        return {"wer": wer}

In [None]:
wer_metric.compute(predictions=["hello world"], references=["hello word"])

0.5

#### Overfitting on train batch

In this task you should first check pipeline correctness by overfitting on a train batch; then you need to fine-tune the Wav2Vec2 model and achieve a WER of 50 or lower on the val set.

In [28]:
from transformers import TrainingArguments, logging

logging.set_verbosity_info()

training_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="test",
    report_to="none",
    log_level="info",
    save_steps=500,
    # --- batching: 2 samples x 8 accumulation steps = 16 samples per update, per device ---
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    fp16=True,
    # --- optimisation schedule ---
    max_steps=3000,
    learning_rate=1e-4,
    weight_decay=1e-3,
    warmup_steps=100,
    # --- evaluation and logging ---
    # NOTE(review): eval_steps=1 requests an evaluation after every optimizer step
    # for all 3000 steps — confirm this is intended and not a debugging leftover.
    eval_strategy="steps",
    eval_steps=1,
    logging_strategy="steps",
    logging_steps=5,
    logging_first_step=True,
)

PyTorch: setting up devices


In [29]:
from transformers import Trainer, TrainerCallback
from IPython.display import display, clear_output

# Pads audio inputs and label sequences into a batch.
batch_collator = CTCDataCollator(feature_extractor=feature_extractor, padding=True)

# `train` / `val` are the processed datasets holding `input_values` and `labels`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=batch_collator,
    compute_metrics=MetricsComputer(),
)

max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [None]:
trainer.train()