# Setup

### install required libraries

In [3]:
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install librosa
!pip install evaluate -U
!pip install jiwer



### HuggingFace Hub login

Log in to the Hugging Face Hub; this is needed to access the Common Voice dataset.

In [4]:
from huggingface_hub import notebook_login

# Opens the interactive Hub login widget (rendered in the cell output as a VBox).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Get Dataset

We'll get the Hindi subset of the FLEURS dataset and, for more Hindi data, the Hindi subset of the Common Voice 13 dataset.

In [5]:
from datasets import load_dataset, DatasetDict

# Each corpus is loaded as two splits; the Hub's "validation" split is stored
# under the shorter key "valid" in this notebook.
fleurs = DatasetDict(
    {
        "train": load_dataset("google/fleurs", "hi_in", split="train"),
        "valid": load_dataset("google/fleurs", "hi_in", split="validation"),
    }
)

common_voice = DatasetDict(
    {
        "train": load_dataset("mozilla-foundation/common_voice_13_0", "hi", split="train"),
        "valid": load_dataset("mozilla-foundation/common_voice_13_0", "hi", split="validation"),
    }
)

# show the splits, columns and row counts of both corpora
print(fleurs)
print(common_voice)

Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/132M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/249M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading builder script:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.65k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/118M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/65.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/94.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/119M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/24.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/844k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/210k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 4479it [00:00, 59393.89it/s]


Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2281it [00:00, 71639.79it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 2947it [00:00, 64202.64it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 3487it [00:00, 67868.56it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 706it [00:00, 67959.03it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 2120
    })
    valid: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 239
    })
})
DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4479
    })
    valid: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2281
    })
})


removing additional metadata information which we don't need

In [6]:
# Metadata columns that are not needed for fine-tuning; only the audio and its
# text column are kept in each corpus.
fleurs_metadata = ['id', 'num_samples', 'path', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id']
common_voice_metadata = ['client_id', 'path', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant']

# Drop only the columns that are still present, so re-running this cell is a
# no-op instead of raising on columns that were already removed.
fleurs = fleurs.remove_columns(
    [column for column in fleurs_metadata if column in fleurs.column_names["train"]]
)
common_voice = common_voice.remove_columns(
    [column for column in common_voice_metadata if column in common_voice.column_names["train"]]
)
print(fleurs)
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 2120
    })
    valid: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 239
    })
})
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4479
    })
    valid: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2281
    })
})


combining both datasets

In [7]:
from datasets import concatenate_datasets, Audio

# Align Common Voice with fleurs in two steps:
#   1. its text column 'sentence' is renamed to 'transcription';
#   2. its audio column is cast to a 16 kHz sampling rate.
common_voice = (
    common_voice
    .rename_column('sentence', 'transcription')
    .cast_column("audio", Audio(sampling_rate=16000))
)

# Concatenate the two corpora split by split (fleurs first, then Common Voice).
ds = DatasetDict(
    {
        split: concatenate_datasets([fleurs[split], common_voice[split]])
        for split in ('train', 'valid')
    }
)

ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 6599
    })
    valid: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 2520
    })
})

# Load a WhisperProcessor

It combines the Whisper feature extractor and the Whisper tokenizer.

The feature extractor pre-processes the raw audio inputs by padding/truncating them to a length of 30 s and then converting them to log-Mel spectrograms.

The tokenizer post-processes the model outputs (indices of the predicted tokens) into text.

In [8]:
from transformers import WhisperProcessor

# checkpoint name; reused further down when the model itself is loaded
model_checkpoint="openai/whisper-medium"

# `language` and `task` tell the tokenizer to add the language token and the
# task token to the start of every encoded sequence
processor = WhisperProcessor.from_pretrained(model_checkpoint, language="Hindi", task="transcribe")

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

# Prepare dataset for training

In [9]:
# dataset preparation function
def prepare_dataset(batch):
    """Turn one raw example into model inputs and target label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["transcription"]``, then adds two keys to the same mapping:

    * ``"input_features"`` - first item of the feature extractor's
      ``input_features`` for the audio array;
    * ``"labels"`` - the tokenizer's ``input_ids`` for the transcription.

    Returns the (mutated) ``batch``.
    """
    sample = batch["audio"]

    # audio array -> log-Mel input features
    extracted = processor.feature_extractor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # target text -> label ids
    encoded_text = processor.tokenizer(batch["transcription"])
    batch["labels"] = encoded_text.input_ids

    return batch


In [10]:
# apply the dataset preparation function to every example of every split
# (train and valid); the original columns are dropped, so only the columns
# added by `prepare_dataset` remain
ds = ds.map(prepare_dataset, remove_columns=ds.column_names["train"])
ds

Map:   0%|          | 0/6599 [00:00<?, ? examples/s]

Map:   0%|          | 0/2520 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 6599
    })
    valid: Dataset({
        features: ['input_features', 'labels'],
        num_rows: 2520
    })
})

# Define a data collator
It takes our pre-processed data and converts it to PyTorch tensors.

In [11]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed examples into one batch of PyTorch tensors.

    Inputs and labels are handled separately because they have different
    lengths and need different padding methods.

    Attributes:
        processor: object exposing ``feature_extractor.pad`` and
            ``tokenizer.pad`` (plus ``tokenizer.bos_token_id``).
        decoder_start_token_id: token id to strip from the front of the
            labels when every sequence in the batch starts with it. ``None``
            (the default) falls back to ``processor.tokenizer.bos_token_id``,
            which is the behaviour this collator always had.
            NOTE(review): if the model's decoder start token differs from the
            tokenizer's bos token, pass ``model.config.decoder_start_token_id``
            here - confirm against the checkpoint's config.
    """

    processor: Any
    decoder_start_token_id: Union[int, None] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch with ``input_features`` and ``labels``.

        Args:
            features: examples carrying ``"input_features"`` and ``"labels"``.

        Returns:
            The feature extractor's padded batch with an added ``"labels"``
            tensor in which padded positions are set to -100.
        """
        # get the log-Mel input features; they are already padded, so `pad`
        # is only used here to convert them to PyTorch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences and pad them to the longest one
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding tokens with -100 so the loss ignores them
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # decide which leading token (if any) should be stripped
        start_token_id = self.decoder_start_token_id
        if start_token_id is None:
            start_token_id = self.processor.tokenizer.bos_token_id

        # if that token was added in the previous tokenization step, cut it
        # here as it is added again later anyway; skip the check when no token
        # id is available or the label tensor has zero width
        if (
            start_token_id is not None
            and labels.size(1) > 0
            and bool((labels[:, 0] == start_token_id).all())
        ):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

initialise the data collator

In [12]:
# the collator pads with the same processor that produced the features and labels
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Evaluation metrics

We need to define a `compute_metrics` function to evaluate the model using the word error rate (WER) metric.

In [13]:
# we'll load the WER (word error rate) metric from the evaluate library;
# `compute_metrics` below reads it through the module-level name `metric`

import evaluate

metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [14]:
def compute_metrics(pred):
    """Compute the word error rate, as a percentage, for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids in which ignored positions are
            marked with -100).

    Returns:
        ``{"wer": <100 * WER>}``.
    """
    pred_ids = pred.predictions

    # Work on a copy so the caller's ``label_ids`` keeps its -100 markers.
    # Assumes a NumPy array (`.copy()` plus boolean-mask assignment) - TODO
    # confirm if this is ever called with another array type.
    label_ids = pred.label_ids.copy()

    # replace -100 with the pad_token_id
    # (undoing the step applied in the data collator so the ids can be decoded)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # decode the predicted and label ids to strings
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # compute the WER between the predictions and the reference labels
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


# Load a pre-trained checkpoint

we need to load a pre-trained checkpoint and configure it correctly for training.



In [15]:
from transformers import WhisperForConditionalGeneration

# load the pre-trained weights from the same checkpoint the processor was loaded from
model = WhisperForConditionalGeneration.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

In [16]:
# we'll train the model to predict the correct language and task instead of having forced token ids control it
model.config.forced_decoder_ids = None
# some tokens are completely suppressed during generation; setting an empty list disables that
model.config.suppress_tokens = []

# Define the training arguments

In [17]:
from transformers import Seq2SeqTrainingArguments

bs = 2             # per-device train batch size
effective_bs = 32  # per-device examples per optimizer step (bs * gradient accumulation steps)
epochs = 20
lr = 1e-5

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-hindi",
    per_device_train_batch_size=bs,
    # half the train batch size, but never 0 (bs // 2 is 0 when bs == 1)
    per_device_eval_batch_size=max(1, bs // 2),
    # derived from `bs` so it doubles automatically whenever the batch size is
    # halved; evaluates to 16 for bs=2
    gradient_accumulation_steps=max(1, effective_bs // bs),
    num_train_epochs=epochs,
    # evaluate, log and save once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=lr,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    gradient_checkpointing=True,
    fp16=True,
    # evaluation uses generated sequences, capped at `generation_max_length`
    predict_with_generate=True,
    generation_max_length=225,
    report_to="none",
    # the best checkpoint is the one with the lowest "wer"
    load_best_model_at_end=True,
    seed=42,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

# Train

Forward the training arguments to the trainer along with our model, dataset, data collator and `compute_metrics` function.

In [18]:
from transformers import Seq2SeqTrainer

# wire together the training arguments, model, both dataset splits,
# the padding collator and the WER metric function
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["valid"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): the feature extractor (not the tokenizer) is passed here -
    # presumably so it is saved alongside the checkpoints; confirm
    tokenizer=processor.feature_extractor,
)

And train

In [19]:
# start fine-tuning; NOTE: the run recorded in this notebook's output ended with an OutOfMemoryError
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


OutOfMemoryError: ignored

In [None]:
import torch
# ask PyTorch to release cached GPU memory that is no longer in use
# (presumably added to recover from the out-of-memory failure during training)
torch.cuda.empty_cache()

# Push the model to Hub

In [None]:
# upload the trainer's model to the Hugging Face Hub (requires the Hub login from the setup section)
trainer.push_to_hub()