The steps are mostly based on these references:
- [Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2)
- [Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO](https://github.com/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)

You can consult them if you want to see a more detailed procedure.

The second one also contains valuable hints on how to preprocess Persian text for our purpose.

In [None]:
# Install the necessary libraries
!pip install datasets==2.10.0 --quiet
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Loading the dataset

In [None]:
# If you want your data to persist after the Colab runtime shuts down, save your intermediate results to your Google Drive.
# In the code you can then change the save paths to ./drive/MyDrive/, which is your Google Drive.
from os import path,system,mkdir
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/drive/
drive.mount('/content/drive/')
# Create the working folder on Drive the first time the notebook runs.
# NOTE(review): the relative './drive/...' path only matches the mount point above
# when the current working directory is /content — confirm.
if not path.exists('./drive/MyDrive/ASR_Colab'):
  mkdir('./drive/MyDrive/ASR_Colab')

Mounted at /content/drive/


In [None]:
# Modify the path to the dataset archive if necessary.
dataset_path = './drive/MyDrive/ML_Project/dataset.zip'
dataet_path = dataset_path  # original (misspelled) name, kept so existing references keep working
# Extract only once: skip when the 'dataset' folder is already present.
if not path.exists('dataset'):
  system(f'unzip -qq "{dataset_path}" -d "/content/"')
  # os.system never raises, so a missing/corrupt archive would otherwise only
  # surface later as confusing file-not-found errors while loading audio.
  if not path.exists('dataset'):
    raise FileNotFoundError(
        f'Extracting "{dataset_path}" did not produce a "dataset" folder; '
        'check the archive path and the current working directory.'
    )

In [None]:
# Load csv
# We use pandas for data import, and the datasets lib to prepare our data.
# These two (pandas.DataFrame & datasets.Dataset) are convertible as shown below, so use whichever you find more convenient
import pandas as pd
from datasets import Dataset
# upload the new transcripts (the file is read from the current working directory)
transcripts = pd.read_csv('transcripts.csv')
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal


In [None]:
# Take a look at unique letters in our dataset
from functools import reduce  # not needed by this cell any more; kept because other cells may still use it
# set().union(*strings) gathers every character in one call and, unlike a reduce
# over the raw strings, also returns a set for an empty or single-row dataset.
present_chars = set().union(*ds['transcript'])
print(present_chars)

{'غ', 'ﺎ', 'ﺯ', 'م', 'ﺩ', 'ﭘ', '\xad', 'ا', 'ﻥ', 'ِ', 'ى', '/', 'ﺣ', '“', 'ﮔ', 'ﻭ', 'ﻧ', 'ﺖ', 'گ', 'ق', 'ذ', '؛', ')', 'ٔ', 'ﻤ', '!', '(', 'ﭽ', 'س', 'د', 'ﺘ', 'ﻠ', '–', '…', 'ﺁ', 'ﺫ', 'ﺗ', 'ک', 'ش', 'ﺭ', 'ص', ':', 'ز', ',', 'ﺴ', 'ﻣ', 'پ', 'ﯽ', 'ﺤ', '-', 'ت', 'ؤ', 'ض', 'ﺟ', 'ُ', 'ﻖ', 'ر', 'ﺶ', 'ﻫ', 'ﯼ', '۵', 'خ', 'ه', 'ء', 'ﺮ', 'ﻬ', 'ﻢ', 'ً', 'ج', 'ﻡ', 'ﺑ', 'ﻃ', 'ﺕ', '۶', 'ی', 'ث', ' ', 'ك', 'ﺷ', 'ي', 'ژ', '،', '”', 'ـ', '5', '\t', '3', 'ظ', 'َ', 'ّ', 'ﻝ', 'ۀ', 'ﯿ', 'ﻔ', '.', 'ﺛ', 'ﯾ', '\xa0', 'أ', 'ح', 'ﻏ', 'ﮏ', 'ﻦ', 'ع', '۷', 'ﺍ', '«', 'چ', 'ﻌ', '\u200c', 'ﺨ', '»', 'ﺥ', 'ﻒ', '1', '\n', 'ﻨ', 'ئ', 'ب', 'ﺼ', 'ﻪ', '\u202c', '؟', 'و', 'ْ', 'ﻓ', 'ﻩ', 'ﺳ', '0', 'ﻮ', 'آ', 'ل', 'ﮐ', '\u202b', 'ٍ', 'ﮑ', 'ﺪ', '"', 'ف', 'ط', 'ن', 'ﺲ'}


## Preprocessing text & audio

In [None]:
# Some of the listed chars are the same letter with different representations (like 'ب' & 'ﺑ').
# They should be combined (one of them gets mapped to the other).
# Complete the following dict:
import re

# Maps alternative encodings of a letter (Arabic variants, presentation forms,
# letter + kasra pairs) to the single form that should end up in the vocab.
char_mappings = {'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ﺳ':"س", 'ى': 'ی',
                'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
                "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ",'ﺨ':"خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
                "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه", 'ﺛ':"ث",
                'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
                'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻣ':"م", 'ﮑ':"ک", 'ﻥ': "ن", 'ﻬ':"ه", 'ﻧ': "ن",'ﺼ':"ص",'ﻫ':"ه", 'ﺣ':"ح", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ", "ﺟ":"ج", "ﺗ":"ت", "ﺤ":"ح", "ﻔ":"ف",'ﻓ':"ف", "ﮏ":"ک", 'ﭽ':"چ",
                # The ghain presentation form must map to ghain 'غ'; it used to be
                # mapped to ain 'ع', which silently changed the letter in transcripts.
                'ﻏ':"غ"}

def multiple_replace(batch, chars_to_mapping):
    """Replace every key of ``chars_to_mapping`` found in the transcript by its value.

    Args:
        batch: A single example (dict-like) holding a ``'transcript'`` string.
        chars_to_mapping: Dict mapping a source substring to its replacement.

    Returns:
        The same ``batch`` object with ``batch['transcript']`` rewritten in place.
    """
    # An empty mapping would compile to an empty pattern that matches at every
    # position and then fail the dict lookup with KeyError(''); there is simply
    # nothing to replace in that case.
    if not chars_to_mapping:
        return batch
    # Regex alternation takes the first alternative that matches, so list longer
    # keys first: a multi-character key is then never shadowed by a shorter key
    # that happens to be its prefix.
    keys_longest_first = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys_longest_first))
    batch['transcript'] = re.sub(pattern, lambda m: chars_to_mapping[m.group()], batch['transcript'])
    return batch

# Normalise every transcript; the mapping is handed to the function through fn_kwargs.
ds = ds.map(multiple_replace, fn_kwargs={"chars_to_mapping": char_mappings})

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# Some chars don't have any sound, so they should be removed.
# Don't remove the ' ' (space) though, as the model should learn to predict where each word ends.
# Handle the transcripts containing numbers as you deem necessary.
# Complete the following list:
import string

# Characters to strip from the transcripts. Index 11 ('\xad', soft hyphen) and
# index 12 ('«') are printed below as a sanity check, so the leading entries keep their order.
char_removals = ['ِ','\u200c','(',')',',','!','?','؟','\u202c','\u202b','\xa0','\xad','«','»','…','ٍ','\n','ـ','–','M','/','“','”','\t','؛',':','٪','S','0','1','3','6','۵','۶','۷','.'] + list(string.ascii_letters + string.digits)
# Punctuation that otherwise survives into the vocab: Arabic comma, double quote, hyphen.
# '-' must stay the very last entry: the list is joined into a regex character
# class, where a hyphen between two characters would be read as a range.
char_removals += ['\u060c', '"', '-']
print(char_removals[11])
print(char_removals[12])
def remove_special_characters(batch,char_removals):
    """Strip every character listed in ``char_removals`` from the transcript.

    The cleaned transcript is lower-cased and one trailing space is appended.

    Args:
        batch: A single example (dict-like) holding a ``'transcript'`` string.
        char_removals: Iterable of strings; each character of each entry is removed.

    Returns:
        The same ``batch`` object with ``batch['transcript']`` rewritten in place.
    """
    if char_removals:
        # Escape each entry so regex-special characters (']', '\\', '^', '-') are
        # removed literally instead of corrupting the character class.
        chars_to_ignore_regex = f"[{''.join(map(re.escape, char_removals))}]"
        cleaned = re.sub(chars_to_ignore_regex, '', batch['transcript'])
    else:
        # "[]" is not a valid pattern; with nothing to remove keep the text as is.
        cleaned = batch['transcript']
    batch['transcript'] = cleaned.lower() + " "
    return batch

# Clean every transcript; the removal list is handed to the function through fn_kwargs.
ds = ds.map(remove_special_characters, fn_kwargs={"char_removals": char_removals})

­
«


Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# The resulting vocab (set of letters).
# set().union(*strings) gathers every character in one call and always returns a
# set, including for an empty or single-row dataset.
vocab = set().union(*ds['transcript'])
print(vocab)

{'غ', 'ﺯ', 'ص', 'ز', 'م', 'ژ', '،', 'ﺥ', 'ﻒ', 'ا', 'ﻨ', 'پ', 'ئ', '-', 'ت', 'ب', 'ظ', 'ض', 'َ', 'ّ', 'ﻝ', 'ُ', 'ﻖ', 'ک', 'ر', 'ﺶ', 'ْ', 'و', 'گ', 'ق', 'ﯼ', 'ذ', 'ﻩ', 'خ', '"', 'ٔ', 'آ', 'ه', 'ل', 'ح', 'س', 'ء', 'د', 'ﻠ', ' ', 'ً', 'ﺁ', 'ج', 'ف', 'ﻦ', 'ع', 'ﻃ', 'ﺫ', 'ﺕ', 'ث', 'چ', 'ی', 'ط', 'ن', 'ش', 'ﺲ'}


In [None]:
# Wav2Vec requires some special tokens to be added to the vocab
# We also replace ' ' (space) with '|' for more visibility
# The vocab gets saved as a json file and is later used by the model
# Sort the characters before numbering them: iterating a set of strings has no
# stable order between interpreter runs, which would give every run different token ids.
vocab_dict = {char: idx for idx, char in enumerate(sorted(vocab))}

# Rename the space token to the word delimiter '|' while keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

# The special tokens take the next free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

import json
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)

To learn about the role of the tokenizer, feature extractor, data collator, etc. in this model, visit https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [None]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor

# CTC tokenizer built from vocab.json, using the special tokens defined there.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
# Feature extractor for 16 kHz input with normalisation and attention masks enabled.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
# Bundle both so audio and text can be handled through one object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Tokenize the transcripts, then load the audio files, convert them to mono and resample them to 16 kHz
import librosa
import warnings

def prepare_dataset(batch):
  """Load one example's audio and encode both the audio and its transcript.

  The file ``dataset/voices/<voice_filename>`` is loaded as mono audio resampled
  to 16 kHz. Three fields are added to the example:
  ``input_values`` (the processed waveform), ``input_length`` (its length) and
  ``labels`` (the token ids of the transcript).

  Args:
    batch: A single example with ``'voice_filename'`` and ``'transcript'``.

  Returns:
    The same ``batch`` object with the three fields above added.
  """
  target_sampling_rate = 16000
  file_path = path.join('dataset','voices',batch['voice_filename'])
  # Silence only the warnings raised while loading the audio file; warnings
  # coming from the processor below stay visible.
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    speech_array, _ = librosa.load(file_path,mono=True,sr=target_sampling_rate)

  batch["input_values"] = processor(speech_array, sampling_rate=target_sampling_rate).input_values[0]
  batch["input_length"] = len(batch["input_values"])

  # Call the tokenizer directly instead of going through the deprecated
  # `processor.as_target_processor()` context manager, which only redirected
  # the processor call to this tokenizer.
  batch["labels"] = processor.tokenizer(batch["transcript"]).input_ids

  return batch

ds = ds.map(prepare_dataset)

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# To reduce GPU memory usage, drop voice samples that are too long.
max_input_length_in_sec = 15
ds = ds.filter(
    lambda num_samples: num_samples < max_input_length_in_sec * processor.feature_extractor.sampling_rate,
    input_columns=["input_length"],
)

Filter:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# Hold out 20% of the examples as the test split; the fixed seed makes the
# split reproducible across notebook runs.
ds = ds.train_test_split(test_size=0.2, seed=42)

# A report on dataset length:
ds

DatasetDict({
    train: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 4611
    })
    test: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 1153
    })
})

In [None]:
# Save the processed train/test splits under the Drive folder for later use
ds.save_to_disk("./drive/MyDrive/ASR_Colab/dataset.hf")

Flattening the indices:   0%|          | 0/4611 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4611 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1153 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1153 [00:00<?, ? examples/s]