In [None]:
## Define the pre-trained model checkpoint to use for fine-tuning ##
## (reused further down to load the config, the feature extractor and the CTC model, and to derive the output directory name) ##
model_checkpoint = "facebook/wav2vec2-large-xlsr-53"

## Set the per-device batch size for training ##
## (passed to TrainingArguments as per_device_train_batch_size, where it is combined with gradient_accumulation_steps=2) ##
batch_size = 32

In [None]:
## Install the required libraries and packages ##
%%capture  # Suppresses the output of the installations

!pip install datasets    # Library needed to access the dataset
!pip install transformers==4.11.3   # Library needed to access Hugging Face's state-of-the-art pre-trained transformer models
!pip install librosa    # Library needed for audio and music processing and analysis
!pip install jiwer    # Library needed to calculate the word error rate (WER) of the ASR models
!pip install ipywidgets   # Library for interactive widgets in Jupyter notebooks
!pip install torch   # Library needed for the PyTorch deep and machine learning framework

In [None]:
from datasets import load_dataset

## Load the male_18to49_yrs dataset split of the Samrómur Milljón dataset from Hugging Face ##
## (only this single split is loaded; it is re-partitioned into train/validation/test further down) ##
samromur_milljon = load_dataset("language-and-voice-lab/samromur_milljon", split="male_18to49_yrs")

In [None]:
## Display the dataset summary to check the column names and the number of rows ##
samromur_milljon

In [None]:
from datasets import DatasetDict

## First split: 80% of the data goes to split_datasets["train"], 20% to split_datasets["test"] ##
split_datasets = samromur_milljon.train_test_split(test_size=0.2, seed=11) # Setting a seed for reproducibility

## Second split, applied to the 80% part: 12.5% of it (0.125 * 0.8 = 10% of the full dataset) is set aside, ##
## which leaves 70% of the full dataset in train_val_datasets["train"] ##
train_val_datasets = split_datasets["train"].train_test_split(test_size=0.125, seed=11) # Setting the same seed here

## Creating a DatasetDict to hold the adjusted splits ##
## NOTE(review): the key names are swapped relative to the usual convention - the split stored under "test" is the ##
## one passed as eval_dataset to the Trainer, while "validation" is not used for training or evaluation in this notebook ##
dataset = DatasetDict({
    "train": train_val_datasets["train"], # The larger part of the second split, 70% of the dataset
    "validation": split_datasets["test"], # The 20% set aside by the first split (test_size=0.2)
    "test": train_val_datasets["test"] # The 10% set aside by the second split; used for evaluation during training
})

In [None]:
## Display the DatasetDict to check the content of the train, validation and test splits ##
dataset

In [None]:
## Function to move the value stored under 'normalized_text' to a new 'text' key ##
def rename_normalized_text(example):
    """Re-key a single example in place.

    The value under 'normalized_text' is removed from the mapping and stored
    under 'text' instead. The same (mutated) mapping is returned. A missing
    'normalized_text' key raises KeyError.
    """
    example['text'] = example.pop('normalized_text')
    return example

## Apply the transformation to the dataset ##
## ('normalized_text' is also listed in remove_columns, in addition to being deleted inside the function) ##
dataset = dataset.map(rename_normalized_text, remove_columns=['normalized_text'])

## Print the dataset summary to check that 'text' replaced 'normalized_text' ##
print(dataset)

In [None]:
## Remove the metadata columns that are not read anywhere else in this notebook ##
dataset = dataset.remove_columns(["speaker_id", "gender", "age", "duration", 'verified_with'])

In [None]:
import random
import pandas as pd
from IPython.display import display, HTML

## Function to display random examples from the dataset ##
def show_random_elements(dataset, num_examples=10):
    """Render distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting len() and indexing with a list of row
            positions; the indexed result is passed to pandas.DataFrame.
        num_examples: Number of distinct rows to show (default 10).

    Raises:
        AssertionError: If num_examples is larger than len(dataset).
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."

    # random.sample draws without replacement, so the indices are guaranteed to be
    # distinct without a manual "draw again on collision" loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

## Display 10 random elements from the training set; the 'audio' and 'audio_id' columns are dropped from the displayed copy only ##
show_random_elements(dataset["train"].remove_columns(["audio", "audio_id"]), num_examples=10)

In [None]:
import re

## Define a character class of special characters that should be ignored in the transcriptions ##
## Written as a raw string so the backslashes reach the regex engine unchanged, instead of relying on ##
## Python passing through invalid string escapes (which emits a warning); the pattern value is unchanged ##
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"\“\%\‘\”\�]'

## Function to remove special characters from the transcriptions and convert the text to lowercase ##
def remove_special_characters(batch):
    """Clean the 'text' field of one example in place.

    Every character matched by `chars_to_ignore_regex` is deleted, the result
    is lowercased, and a single trailing space is appended. The same (mutated)
    mapping is returned.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

## Apply 'remove_special_characters' to every example of the dataset, cleaning the 'text' column ##
dataset = dataset.map(remove_special_characters)

In [None]:
## As previously, display random elements (default of 10), but now from the cleaned training set ##
show_random_elements(dataset["train"].remove_columns(["audio", "audio_id"]))

In [None]:
## Function to collect the distinct characters that occur in a batch of transcriptions ##
def extract_all_chars(batch):
    """Return the unique characters and the concatenated text of a batch.

    The strings in batch["text"] are joined with single spaces. The result has
    two keys, each holding a one-element list: "vocab" wraps the list of
    distinct characters of the joined string (in no particular order), and
    "all_text" wraps the joined string itself.
    """
    joined_text = " ".join(batch["text"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}

In [None]:
## Mapping the previously created 'extract_all_chars' function to the dataset ##
## The result is indexed per split below (vocabs["train"], vocabs["test"]) and holds only the 'vocab' and 'all_text' columns ##
vocabs = dataset.map(
  extract_all_chars, # Function to apply
  batched=True, # The function receives batches of examples instead of single examples
  batch_size=-1, # Use the full dataset as one batch (-1 means to use the full dataset)
  keep_in_memory=True, # Keep all batches in memory during the processing
  remove_columns=dataset.column_names["train"] # Drop the original columns (taken from the "train" split) from the output
)

In [None]:
## Merge the unique characters from both the training and test datasets into a single list ##
## The merged set is sorted so that the character order - and therefore every token ID derived from it below - ##
## is the same in every kernel session; plain set iteration order can change between interpreter runs, which ##
## would make a regenerated vocabulary disagree with a previously saved checkpoint ##
## NOTE(review): characters that occur only in the "validation" split are not included here - confirm this is intended ##
vocab_list = sorted(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

In [None]:
## Map every unique character to a numerical ID equal to its position in vocab_list ##
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))

## Show the resulting character-to-ID dictionary ##
vocab_dict

In [None]:
## Hand the ID of the space character over to the pipe character: the space entry is removed ##
## from the dictionary and its ID is stored under "|" instead ##
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
## Append the two special entries, each one taking the next available ID in turn: ##
## first the entry for unknown characters, then the entry for padding ##
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)

## Show the total number of entries in the dictionary after these additions ##
len(vocab_dict)

In [None]:
import json

## Save the vocabulary dictionary as JSON to a file named 'vocab.json' in the current working directory ##
## (the tokenizer is later loaded from "./", i.e. from that same directory) ##
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)


In [None]:
from transformers import AutoConfig

# Load the configuration of the pre-trained model
config = AutoConfig.from_pretrained(model_checkpoint)

# If the configuration names no tokenizer class, use the model type as the tokenizer type; otherwise use None
tokenizer_type = config.model_type if config.tokenizer_class is None else None

# Keep the config only if it names a tokenizer class; otherwise replace it with None
config = config if config.tokenizer_class is not None else None

In [None]:
from transformers import AutoTokenizer

## Initialize the tokenizer from the current directory, where 'vocab.json' was written earlier ##
tokenizer = AutoTokenizer.from_pretrained(
  "./", # Current working directory (not the pre-trained checkpoint name)
  config=config, # Model configuration, or None, as decided when the config was loaded
  tokenizer_type=tokenizer_type, # Model type taken from the configuration, or None
  unk_token="[UNK]", # Token for unknown symbols; matches the "[UNK]" entry added to the vocabulary
  pad_token="[PAD]", # Token to use for padding; matches the "[PAD]" entry added to the vocabulary
  word_delimiter_token="|" # Token to use as a word delimiter; matches the "|" entry that replaced the space
)

In [None]:
## Take the part of the checkpoint identifier that follows its last "/" as the model name ##
model_checkpoint_name = model_checkpoint.rsplit("/", 1)[-1]

## Build the repository name by tagging the model name with the seed identifier ##
repo_name = model_checkpoint_name + "-seed-11"

In [None]:
## Display the 'audio' field of the first example in the training dataset ##
dataset["train"][0]["audio"]

In [None]:
import IPython.display as ipd
import numpy as np
import random

## Pick a random audio sample from the training dataset ##
## random.randint includes both endpoints, so the upper bound has to be len - 1 to always be a valid index ##
rand_int = random.randint(0, len(dataset["train"]) - 1)
sample = dataset["train"][rand_int] # Fetch the example once instead of re-reading it for every field

print(sample["text"]) # Print the text for the chosen audio sample

## Play the chosen audio sample ##
## The audio is converted to a numpy array and played at the sampling rate stored with the sample ##
ipd.Audio(data=np.asarray(sample["audio"]["array"]), autoplay=True, rate=sample["audio"]["sampling_rate"])

In [None]:
import random
import numpy as np

## Pick a random audio sample from the training dataset ##
## random.randint includes both endpoints, so the upper bound has to be len - 1 to always be a valid index ##
rand_int = random.randint(0, len(dataset["train"]) - 1)
sample = dataset["train"][rand_int] # Fetch the example once instead of re-reading it for every print

print("Target text:", sample["text"]) # Print the text for the chosen audio sample
print("Input array shape:", np.asarray(sample["audio"]["array"]).shape) # Print the shape of the audio array for the chosen sample
print("Sampling rate:", sample["audio"]["sampling_rate"]) # Print the sampling rate of the audio for the chosen sample

In [None]:
from transformers import AutoFeatureExtractor

## Load the feature extractor that belongs to the pre-trained model checkpoint ##
## (it is combined with the tokenizer into a single processor in the next step) ##
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

In [None]:
from transformers import Wav2Vec2Processor

## Combine the feature extractor and tokenizer into a single processor ##
## (the processor is used below for preparing the data, padding batches and decoding predictions) ##
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
## Function that turns one example's audio and text into model inputs and labels ##
def prepare_dataset(batch):
    """Add 'input_values', 'input_length' and 'labels' to a single example.

    The example's audio array is run through the processor at the sampling
    rate stored with it, and the first entry of the resulting input values is
    kept. 'input_length' is the length of those input values. The 'text' field
    is run through the processor in target mode and its input IDs become the
    labels. The same (mutated) example is returned.
    """
    waveform = batch["audio"]
    audio_features = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_values"] = audio_features.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    ## Inside this context the processor handles target text and yields token IDs ##
    with processor.as_target_processor():
        text_features = processor(batch["text"])
        batch["labels"] = text_features.input_ids

    return batch

In [None]:
## Apply 'prepare_dataset' to every split using 4 worker processes ##
## All original columns (as listed for the "train" split) are removed, so only the columns added by the function remain ##
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=4)

In [None]:
## Setting the maximum input length in seconds ##
max_input_length_in_sec = 5.0

## Keep only the training examples whose 'input_length' is strictly below the limit ##
## The limit is the maximum length in seconds multiplied by the feature extractor's sampling rate ##
## Only the "train" split is filtered; the other splits keep all of their examples ##
dataset["train"] = dataset["train"].filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

In [None]:
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

## Data collator that pads the inputs and the labels of a CTC (Connectionist Temporal Classification) batch ##
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of prepared examples into one batch of PyTorch tensors.

    Inputs and labels are padded in two separate calls to the processor's
    `pad` method, each with its own `max_length` / `pad_to_multiple_of`
    settings and the shared `padding` setting. Label positions whose attention
    mask is not 1 are replaced with -100 in the returned batch.
    """

    # Processor whose `pad` method (normal and target mode) does the padding
    processor: Wav2Vec2Processor
    # Padding argument forwarded to both `pad` calls
    padding: Union[bool, str] = True
    # Optional maximum length forwarded when padding the inputs
    max_length: Optional[int] = None
    # Optional maximum length forwarded when padding the labels
    max_length_labels: Optional[int] = None
    # Optional multiple forwarded when padding the inputs
    pad_to_multiple_of: Optional[int] = None
    # Optional multiple forwarded when padding the labels
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        ## Inputs and labels need different padding, so they are collected separately ##
        audio_entries = []
        label_entries = []
        for example in features:
            audio_entries.append({"input_values": example["input_values"]})
            label_entries.append({"input_ids": example["labels"]})

        ## Pad the audio inputs and get them back as PyTorch tensors ##
        batch = self.processor.pad(
            audio_entries,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        ## Pad the labels with the processor switched to target mode ##
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_entries,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        ## Positions where the attention mask is not 1 are padding: overwrite them with -100 ##
        ## so that they are ignored in the loss calculation ##
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
## Initialize the data collator with the processor and padding set to True ##
## (all length-related options of the collator keep their default of None) ##
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
## Import the load_metric function from the datasets library ##
from datasets import load_metric

## Load the Word Error Rate (WER) metric ##
## NOTE(review): this relies on 'load_metric' being available in the installed 'datasets' version, which is not ##
## pinned in the install cell - TODO confirm the version this notebook is meant to run with ##
wer_metric = load_metric("wer")

In [None]:
## Function that turns the model's predictions into a Word Error Rate (WER) score ##
def compute_metrics(pred):
    """Compute the WER between decoded predictions and decoded labels.

    `pred.predictions` holds the logits; the highest-scoring ID along the last
    axis is taken at every position. `pred.label_ids` is modified in place:
    every -100 is replaced by the tokenizer's pad token ID before decoding.
    Labels are decoded with group_tokens=False. Returns {"wer": <score>}.
    """
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    ## -100 marks padded label positions; swap it for the pad token ID so the labels can be decoded ##
    label_ids = pred.label_ids
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    ## Decode the predictions and the labels to texts ##
    predicted_texts = processor.batch_decode(predicted_ids)
    reference_texts = processor.batch_decode(label_ids, group_tokens=False)

    ## Word Error Rate (WER) between the predictions and the reference labels ##
    return {"wer": wer_metric.compute(predictions=predicted_texts, references=reference_texts)}

In [None]:
from transformers import AutoModelForCTC

# Load the pre-trained model and configure it for Connectionist Temporal Classification (CTC)
# The keyword arguments below are configuration values passed along when loading the checkpoint
model = AutoModelForCTC.from_pretrained(
    model_checkpoint, # The model checkpoint to use
    attention_dropout=0.1, # The dropout rate for the attention layers
    hidden_dropout=0.1, # The dropout rate for the hidden layers
    feat_proj_dropout=0.0, # The dropout rate for the feature projection layer (disabled)
    mask_time_prob=0.05, # The probability of masking time steps for data augmentation
    layerdrop=0.1, # The probability of dropping entire layers
    ctc_loss_reduction="mean", # Loss reduction for the CTC loss
    pad_token_id=processor.tokenizer.pad_token_id, # The pad token ID, taken from the processor's tokenizer
    vocab_size=len(processor.tokenizer) # The vocabulary size is set to match the tokenizer's vocabulary size
)

In [None]:
from transformers import TrainingArguments

## Define the training arguments for the model ##
## NOTE(review): no 'seed' argument is passed here, so the seed 11 in the output directory name is only applied ##
## to the dataset splits earlier in the notebook - confirm that this is the intended meaning of "seed-11" ##
training_args = TrainingArguments(
  output_dir=repo_name, # Directory where the model checkpoints will be saved
  group_by_length=True, # Group sequences of similar lengths together
  per_device_train_batch_size=batch_size, # Per-device batch size for training, taken from 'batch_size' defined at the top
  gradient_accumulation_steps=2, # Number of steps to accumulate gradients before updating model parameters
  evaluation_strategy="steps", # Evaluate at a fixed step interval (see eval_steps)
  num_train_epochs=3, # Number of training epochs
  gradient_checkpointing=True, # Enable gradient checkpointing to save memory
  fp16=False, # 16-bit (mixed) precision training won't be used
  save_steps=500, # Save checkpoint every 500 steps
  eval_steps=500, # Evaluate the model every 500 steps
  logging_steps=500, # Log training progress every 500 steps
  learning_rate=3e-4, # Initial learning rate for training
  warmup_steps=500, # Number of warmup steps for learning rate scheduler
  save_total_limit=5, # Limit of the total number of saved checkpoints
  push_to_hub=False, # Disable pushing model to Huggingface Hub
  lr_scheduler_type="linear", # Learning rate will decrease linearly
)

In [None]:
from transformers import Trainer

## Initialize the Trainer class ##
trainer = Trainer(
    model=model, # The model that is to be trained
    data_collator=data_collator, # The data collator for processing batches
    args=training_args, # The training arguments
    compute_metrics=compute_metrics, # Function to calculate WER-scores
    train_dataset=dataset["train"], # The training dataset (after the length filter)
    eval_dataset=dataset["test"], # The "test" split is the one used for evaluation during training
    tokenizer=processor.feature_extractor, # The processor's feature extractor (not the text tokenizer) is passed in the 'tokenizer' argument
)

In [None]:
## Start the training process with the Trainer configured above ##
trainer.train()

## If training was interrupted and has to be recommenced, it can be resumed from the last checkpoint by uncommenting the line below and providing the last checkpoint path ##
# trainer.train(resume_from_checkpoint="path to the most recent checkpoint if training was interrupted for some reason")

In [None]:
## Directory that receives both the trained model and the tokenizer ##
save_dir = "/all trainings/seed testing - 11, 12, 13/Model - seed 11"

## Save the trained model to that directory ##
trainer.save_model(save_dir)

## Save the tokenizer to the same directory ##
tokenizer.save_pretrained(save_dir)