The steps are mostly based on these refrences:
- [Fine-tuning XLS-R for Multi-Lingual ASR with 🤗 Transformers](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2)
- [Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO](https://github.com/m3hrdadfi/notebooks/blob/main/Fine_Tune_XLSR_Wav2Vec2_on_Persian_ShEMO_ASR_with_%F0%9F%A4%97_Transformers_ipynb.ipynb)

You can consult them if you want to see a more detailed procedure.

The second one also contains valuable hints on how to preprocess the persian text for our purpose.

In [None]:
# Download nessecary libraries
!pip install datasets==2.10.0 --quiet
!pip install transformers --quiet

## Loading the dataset

In [None]:
# If you wish your data to persist even when you shutdown colab, save your intermediate results to your google drive
# Then in the code you can change the saving path of files to ./drive/MyDrive/ path, which is your google drive disk
from os import path,system,mkdir
from google.colab import drive

drive.mount('/content/drive/')
if not path.exists('./drive/MyDrive/ASR_Colab'):
  mkdir('./drive/MyDrive/ASR_Colab')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# modify the path to dataset if nessecary
dataet_path = './drive/MyDrive/ML_Project_ASR/dataset.zip'
if not path.exists('dataset'):
  system(f'unzip -qq "{dataet_path}" -d "/content/"')

In [None]:
# Load csv
# We use pandas for data import, and datasets lib to prepear our data.
# These two libs (pandas.Dataframe & datasets.Dataset) are convertable as shown bellow, so use whichever you find more convenient
import pandas as pd
from datasets import Dataset

my_path = './drive/MyDrive/ML/transcripts.csv'
transcripts = pd.read_csv(my_path, encoding='utf-8')
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,voice_filename,transcript,accent,gender,tone
0,voice_1.mp3,چرا این‌‌‌‌طور فکر می‌‌‌‌کنی؟,فارسی,male,question
1,voice_2.mp3,همیشه من و تو راجع به آن با هم صحبت کرده‌‌‌‌ایم,فارسی,male,normal
2,voice_3.mp3,دنیا در حال گذار به‌‌‌‌سمت پایداری است,فارسی,male,normal
3,voice_4.mp3,شاخصی که باید عملکرد تسلا را با آن اندازه بگیریم,فارسی,male,normal
4,voice_5.mp3,باید تعداد واقعاً غیرقابل‌‌‌‌تصوری باتری تولید...,فارسی,male,normal


In [None]:
# Take a look at unique letters in our dataset
from functools import reduce
present_chars = reduce(lambda a, b: set((*a,*b)), list(ds['transcript']))
print(len(present_chars), present_chars)

163 {'پ', 'ﺴ', 'ﻔ', 'ﺎ', 'ۀ', '2', 'ﯽ', 'ﻃ', '٨', '8', ' ', '0', 'گ', 'ْ', ',', 'ن', 'ﻖ', 'ﻏ', 'ي', 'ﺘ', 'َ', 'آ', '۸', 'ٔ', 'ﺪ', 'ﺩ', '۳', 'ی', 'ﺷ', 'ﯾ', '۰', 'ﺗ', 'ط', 'چ', 'ﻦ', 'ﻩ', '\u202c', '٪', 'ﮐ', 'ﺨ', ')', '»', '؛', '"', 'أ', '\n', 'ﺁ', 'ﭽ', 'ﺶ', '«', 'ﻢ', 'ﻓ', 'س', 'ﻤ', 'ﻥ', 'ﺲ', '۶', '\xad', 'ﯼ', 'ف', 'ً', 'ﮏ', '…', 'ﭘ', 'ﻠ', '۲', 'ّ', 'ﺼ', 'م', 'ﮔ', 'ه', '5', '4', 'ع', 'ا', 'ث', 'M', '7', 'ظ', '\t', '٬', '\u200c', 'ﻝ', 'V', 'ﺳ', 'ﻒ', 'ص', '.', '٫', 'ٍ', 'ﻮ', 'ﻭ', '\xa0', '6', 'ح', '۹', 'ﺥ', 'ﯿ', 'ﺟ', 'ك', 'د', 'ﻌ', 'ﺭ', '۷', 'ﺕ', 'ﻡ', 'ﺫ', 'ﻪ', 'غ', '٥', '–', '؟', '،', '/', 'ئ', '۴', 'ﺣ', 'ﺑ', 'ﻣ', '9', 'ت', '!', 'ﻫ', ':', 'ﺮ', '“', 'ﺍ', 'خ', 'ﻬ', '”', 'ر', '3', 'ش', 'ق', 'ک', 'ﺯ', 'و', 'ﻧ', '1', 'ز', '۱', 'ﺛ', 'ء', '۵', 'ل', 'ؤ', 'ج', 'ُ', '(', 'ذ', 'ژ', 'ب', 'ـ', '\u202b', 'S', 'ى', 'ض', 'ﺖ', 'ﺤ', 'ﻨ', '-', 'ِ', 'ﮑ'}


## Preprocessing text & audio

In [None]:
# Some of the listed chars are the same, but have different representations(like 'ب' & 'ﺑ')
# They should get combined(one of them gets mapped to the other)
# complete the following dict:
import re

char_mappings = {'ك': 'ک', 'دِ': 'د', 'بِ': 'ب', 'زِ': 'ز', 'ذِ': 'ذ', 'شِ': 'ش', 'سِ': 'س', 'ى': 'ی',
                'ي': 'ی', 'أ': 'ا', 'ؤ': 'و', "ے": "ی", "ۀ": "ه", "ﭘ": "پ", "ﮐ": "ک", "ﯽ": "ی",
                "ﺎ": "ا", "ﺑ": "ب", "ﺘ": "ت", "ﺧ": "خ", "ﺩ": "د", "ﺱ": "س", "ﻀ": "ض", "ﻌ": "ع",
                "ﻟ": "ل", "ﻡ": "م", "ﻢ": "م", "ﻪ": "ه", "ﻮ": "و", 'ﺍ': "ا", 'ة': "ه",
                'ﯾ': "ی", 'ﯿ': "ی", 'ﺒ': "ب", 'ﺖ': "ت", 'ﺪ': "د", 'ﺮ': "ر", 'ﺴ': "س", 'ﺷ': "ش",
                'ﺸ': "ش", 'ﻋ': "ع", 'ﻤ': "م", 'ﻥ': "ن", 'ﻧ': "ن", 'ﻭ': "و", 'ﺭ': "ر", "ﮔ": "گ",
                "ٔ": " ی", "ﮑ": "ک", "ﮏ": "ک", "ﺤ": "ح", "ﺳ": "س", "ﺯ": "ز", "ﺫ": "ذ",
                "ﺶ": "ش", "ﺟ": "ج", "ﺗ": "ت", "ﻨ": "ن", "ﻫ": "ه", "ﺼ": "ص", "ﻝ": "ل", "ﻦ": "ن",
                "ﺥ": "خ", "ﻬ": "ه", "ﻩ": "ه", "ﭽ": "چ", "ﺕ": "ت", "ﺨ": "خ", "ﻓ": "ف", "ﺣ": "ح",
                "ﻏ": "غ", "ﯼ": "ی", "ﻔ": "ف" , "ﻠ": "ل", "ﺛ": "ث", "ﻒ": "ف", "ﻃ": "ط",
                "ﺲ": "س", "ﻖ": "ق", "ﻣ": "م", "ﺁ": "آ"}

def multiple_replace(batch, chars_to_mapping):
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    batch['transcript'] = re.sub(pattern, lambda m: chars_to_mapping[m.group()], batch['transcript'])
    return batch

ds = ds.map(lambda batch: multiple_replace(batch,char_mappings))

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# Some chars don't have any sound, so they should get removed
# Don't remove the ' ' (space) though, as the model should learn to predict when each word ends
# handle the transcripts containing numbers as you deem nessecary
# complete the following list:
import string

char_removals = ['ِ','\u200c','(',')','!','،','\u202c','«','…','ٍ','\n','ـ'] + list(string.ascii_letters + string.digits) \
                + ['\xad', '\t', '\u202b', '\xa0', 'َ', 'ّ', '؛', 'ً', '/', 'ء', '”', 'ُ', '٬', ':', '“', '٪', '"', '؟', '.', 'ْ', '»', '٫', ',', '–', '-']
                # + ['۰', '۱', '۲', '۳', '۴', '٥', '۶', '۷', '٨', '۹', '۸', '۵'] \

def remove_special_characters(batch,char_removals):
    chars_to_ignore_regex = f"""[{"".join(char_removals)}]"""
    batch['transcript'] = re.sub(chars_to_ignore_regex, '', batch['transcript']).lower() + " "
    return batch

ds = ds.map(lambda batch: remove_special_characters(batch,char_removals))

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# The resulting vocab(list of letters):
vocab = reduce(lambda a, b: set((*a,*b)), ds['transcript'])
print(len(vocab), vocab)

47 {'پ', 'ت', 'ص', 'خ', 'ر', 'س', '٨', 'ش', 'ح', ' ', 'گ', '۶', 'ق', 'ن', 'ف', '۹', 'ک', 'و', 'آ', 'د', '۸', 'ز', '۱', '۵', '۷', 'ل', '۲', '۳', 'ج', 'ی', 'م', 'ذ', 'ژ', 'غ', '۰', 'ب', 'ط', 'ه', '٥', 'چ', 'ض', 'ع', 'ا', 'ث', 'ئ', 'ظ', '۴'}


In [None]:
persian_alphabet = ['آ','ا', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن', 'و', 'ه', 'ی']

In [None]:
for v in vocab:
    if v not in persian_alphabet:
        print(v)

٨
 
۶
۹
۸
۱
۵
۷
۲
۳
۰
٥
ئ
۴


In [None]:
# Wav2Vec requires some special tokens to be added to vocab
# We also replace ' '(space) with '|' for more visibility
# The vocab should get saved as a json file and later get used by the model
vocab_dict = {v: k for k, v in enumerate(vocab)}

vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

print(vocab_dict)

import json
with open('./drive/MyDrive/ASR_Colab/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

{'پ': 0, 'ت': 1, 'ص': 2, 'خ': 3, 'ر': 4, 'س': 5, '٨': 6, 'ش': 7, 'ح': 8, 'گ': 10, '۶': 11, 'ق': 12, 'ن': 13, 'ف': 14, '۹': 15, 'ک': 16, 'و': 17, 'آ': 18, 'د': 19, '۸': 20, 'ز': 21, '۱': 22, '۵': 23, '۷': 24, 'ل': 25, '۲': 26, '۳': 27, 'ج': 28, 'ی': 29, 'م': 30, 'ذ': 31, 'ژ': 32, 'غ': 33, '۰': 34, 'ب': 35, 'ط': 36, 'ه': 37, '٥': 38, 'چ': 39, 'ض': 40, 'ع': 41, 'ا': 42, 'ث': 43, 'ئ': 44, 'ظ': 45, '۴': 46, '|': 9, '[UNK]': 47, '[PAD]': 48}


To know what is the role of tokenizer, feature extractor, data collator & etc. in this model, visit https://huggingface.co/blog/fine-tune-xlsr-wav2vec2

In [None]:
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor

tokenizer = Wav2Vec2CTCTokenizer("./drive/MyDrive/ASR_Colab/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
# Tokenizing the transcripts and then load,convert to mono channel and resample audio files at 16 KHz
import librosa
import warnings

def prepare_dataset(batch):
#   file_path = path.join('dataset','voices',batch['voice_filename'])
  file_path = './drive/MyDrive/ML/dataset/voices/' + batch['voice_filename']
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    speech_array, sampling_rate = librosa.load(file_path,mono=True,sr=16000)

    batch["input_values"] = processor(speech_array, sampling_rate=16000).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids

  return batch

ds = ds.map(prepare_dataset)

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
# To reduce GPU memory usage, filter out voice samples that are too long:
max_input_length_in_sec = 15
ds = ds.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

Filter:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [None]:
ds = ds.train_test_split(0.2)

# A report on dataset length:
ds

DatasetDict({
    train: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 4611
    })
    test: Dataset({
        features: ['voice_filename', 'transcript', 'accent', 'gender', 'tone', 'input_values', 'input_length', 'labels'],
        num_rows: 1153
    })
})

In [None]:
print(ds['train'][0])

{'voice_filename': 'voice_2980.mp3', 'transcript': 'به مرکز مهم تجارت و بازرگانی تبدیل شد ', 'accent': 'فارسی', 'gender': 'male', 'tone': 'normal', 'input_values': [-0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.017734158784151077, -0.0177

In [None]:
# Save for later use
ds.save_to_disk("./drive/MyDrive/ASR_Colab/dataset.hf")

Flattening the indices:   0%|          | 0/4611 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/4611 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1153 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/1153 [00:00<?, ? examples/s]