This notebook should be executed on a GPU runtime.

In [28]:
# Download nessecary libraries
!pip install datasets --quiet
!pip install transformers[torch] --quiet
!pip install --upgrade accelerate --quiet
!pip install evaluate --quiet
!pip install jiwer --quiet

In [29]:
# Mount Google Drive at /content/drive/ so the files under
# /content/drive/MyDrive/ASR_Colab/ used by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [30]:
# Read the preprocessed dataset back from Google Drive
from datasets import DatasetDict

ds = DatasetDict.load_from_disk("/content/drive/MyDrive/ASR_Colab/dataset.hf")

# Keep only the columns needed for network training; dropping the rest saves RAM
model_columns = ['input_values', 'input_length', 'labels']
for split in ('train', 'test'):
    unused_columns = [name for name in ds[split].column_names if name not in model_columns]
    ds[split] = ds[split].remove_columns(unused_columns)

To learn about the roles of the tokenizer, feature extractor, data collator, etc. in this model, see https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [31]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Tokenizer built from the vocabulary file stored on Drive
tokenizer = Wav2Vec2CTCTokenizer(
    "/content/drive/MyDrive/ASR_Colab/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for single-channel audio sampled at 16 kHz
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# The processor bundles the feature extractor and the tokenizer into one object
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [32]:
# Inspect one row of the processed dataset: the encoded labels, the first few
# audio samples, the decoded transcript and finally the audio itself
from IPython.display import Audio

example = ds['train'][0]
example_labels = example["labels"]
example_audio = example["input_values"]

print(example_labels)
print(example_audio[:10])

print(tokenizer.decode(example_labels))
Audio(example_audio, rate=16000)

[25, 93, 13, 90, 4, 26, 58, 12, 49, 89, 25, 57, 93, 90, 22, 17, 84, 93, 81, 12, 90, 93, 70, 93, 19, 70, 4, 57, 14, 58, 49, 17, 12, 32, 58, 89, 17, 70, 17, 49, 93, 90, 57, 57, 12, 32, 23, 14, 10, 93]
[-5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05, -5.5451117077609524e-05]
ز جمله اشپزی مصرف خام و تولید شراب پرورش میابند.


In [33]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate a list of examples into one padded batch for CTC training.

    Audio inputs and label sequences have different lengths, so they are padded
    separately: inputs through ``processor.pad`` and labels through
    ``processor.tokenizer.pad``.

    Attributes:
        processor: Object exposing ``pad(...)`` for the audio inputs and a
            ``tokenizer`` attribute with its own ``pad(...)`` for the labels.
        padding: Padding strategy forwarded to both ``pad`` calls.
        max_length: Optional maximum length forwarded for the audio inputs.
        max_length_labels: Optional maximum length forwarded for the labels.
        pad_to_multiple_of: Optional multiple forwarded for the audio inputs.
        pad_to_multiple_of_labels: Optional multiple forwarded for the labels.
    """

    # Forward reference: the class can be defined without the processor class in scope.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return a batch with an added ``"labels"`` entry.

        Args:
            features: Examples, each holding ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch returned by ``processor.pad``, extended with a
            ``"labels"`` tensor in which padded positions are set to -100.
        """
        # Split inputs and labels because they need different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly instead of going through the
        # deprecated `as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Mark padded label positions with -100 so they can be told apart from
        # real tokens (presumably ignored by the loss - confirm in the model docs).
        padding_positions = labels_batch["attention_mask"].ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(padding_positions, -100)

        return batch

# Build the collator used for training. It pads the audio clips and the
# transcripts of a batch so that all of them end up with the same size.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Collate a few hand-picked rows to see the shapes the collator produces:
example_features = [ds["train"][row] for row in (0, 1, 20)]
example_batch = data_collator(example_features)
{name: tensor.shape for name, tensor in example_batch.items()}

{'input_values': torch.Size([3, 111744]),
 'attention_mask': torch.Size([3, 111744]),
 'labels': torch.Size([3, 58])}

In [34]:
# Load the pretrained checkpoint with the vocabulary size and pad token id of our tokenizer.
# Feel free to adjust these parameters if training doesn't converge.
from transformers import Wav2Vec2ForCTC

pretrained_checkpoint = "facebook/wav2vec2-large-xlsr-53"
model_overrides = dict(
    # regularisation
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    # memory / loss settings
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    # tie the model to our tokenizer
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=processor.tokenizer.vocab_size,
)

model = Wav2Vec2ForCTC.from_pretrained(pretrained_checkpoint, **model_overrides)
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.weight', 'lm_head.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Change the following parameters if you deem it necessary, especially if you
# run out of GPU RAM or don't reach the desired performance.
from transformers import TrainingArguments

save_dir = 'wav2vec_cache'
training_args = TrainingArguments(
    output_dir=save_dir,
    # batching: a small batch size plus gradient accumulation keeps the GPU from running out of memory
    per_device_train_batch_size=1,
    gradient_accumulation_steps=10,
    group_by_length=False,
    fp16=True,
    # schedule: more epochs can improve performance but take longer to train
    num_train_epochs=1.92,
    learning_rate=3e-4,
    warmup_steps=20,
    # evaluation, logging and checkpointing
    # NOTE(review): newer transformers releases may expect `eval_strategy` instead - confirm against the installed version
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    logging_steps=30,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [37]:
# Load the WER (Word Error Rate) metric
import evaluate
import numpy as np  # imported here so this cell does not depend on a later cell's import

wer_metric = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (per-step logits) and
            ``label_ids`` (label ids with padded positions set to -100).

    Returns:
        A dict with the single key ``"wer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks padded label positions (set by the data collator); map them back
    # to the pad token id so they can be decoded. `np.where` builds a new array,
    # so `pred.label_ids` itself is left unmodified.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [38]:
from transformers import Trainer
import numpy as np

# Wire the model, the training configuration, the data and the metric into one Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    # evaluation
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [39]:
# Enable the `ctc_zero_infinity` flag on the already-loaded model's config before training.
# NOTE(review): presumably this makes the CTC loss zero out infinite values - confirm in the config docs.
model.config.ctc_zero_infinity = True

In [40]:
# Run the fine-tuning; the returned TrainOutput is displayed as this cell's output.
trainer.train()

Step,Training Loss,Validation Loss,Wer
50,15.6925,3.107164,1.0
100,3.0123,2.999214,1.0
150,2.9878,2.992619,1.0
200,2.9773,2.984707,1.0
250,2.9542,2.962382,1.0
300,2.9317,2.925771,1.0
350,2.9113,2.883813,1.0
400,2.8848,2.852507,0.999937
450,2.7779,2.599732,1.0




Step,Training Loss,Validation Loss,Wer
50,15.6925,3.107164,1.0
100,3.0123,2.999214,1.0
150,2.9878,2.992619,1.0
200,2.9773,2.984707,1.0
250,2.9542,2.962382,1.0
300,2.9317,2.925771,1.0
350,2.9113,2.883813,1.0
400,2.8848,2.852507,0.999937
450,2.7779,2.599732,1.0
500,2.4957,1.697484,0.98646




TrainOutput(global_step=886, training_loss=2.5777162935071822, metrics={'train_runtime': 5342.5052, 'train_samples_per_second': 1.657, 'train_steps_per_second': 0.166, 'total_flos': 1.8363512333838874e+18, 'train_loss': 2.5777162935071822, 'epoch': 1.92})

In [41]:
# Save the model weights for the evaluation step.
# Use the same absolute Drive location as the dataset and vocabulary files so the
# target does not depend on the notebook's current working directory.
trainer.save_model("/content/drive/MyDrive/ASR_Colab/model_weights")