# Dataset preparation for training XLSR ASR models

This notebook creates JSON files so that the datasets can be read by the Hugging Face Datasets library.

In [27]:
from pathlib import Path

In [28]:
# Root directory where the raw datasets were downloaded and extracted
data_dir = Path('/workspace/data/mal')
# Directory that receives the generated JSON datasets
output_dir = Path('/workspace/data/mal/hf-dataset')
# parents=True so the cell also works when the intermediate directories do not exist yet
output_dir.mkdir(parents=True, exist_ok=True)

# Folder name of each dataset under data_dir, keyed by a short dataset label
dataset_folders = {
    'iiit': 'iiit_mal_abi',
    'openslr': 'openslr',
    'indic-tts': 'indic-tts-ml',
    'msc-reviewed': 'msc-reviewed-speech-v1.0+20200825',
}

## IIIT MAL Dataset
The dataset was downloaded from [here](http://speech.iiit.ac.in/index.php/research-svl/69.html)

In [29]:
import json

def output_dataset(dataset, output_path):
    """
    Write every sample of a dataset to its own JSON file in a directory.

    Args:
        dataset: Iterable of samples; item 0 of each sample is stored under
            the "path" key and item 1 under the "sentence" key.
        output_path: pathlib.Path of the target directory. It is created,
            including missing parents, when it does not exist.

    Returns:
        Number of samples written, 0 when the dataset yields nothing.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    # Start at 0 so an empty dataset returns 0 instead of failing on an
    # unbound loop variable.
    written = 0
    for index, sample in enumerate(dataset):
        sample_dict = {"path": sample[0], "sentence": sample[1]}
        with open(output_path / f'sample_{index}.json', 'w') as outfile:
            json.dump(sample_dict, outfile)
        written = index + 1
    return written

In [30]:
def get_iiit_mal_dataset(dataset_path):
    """
    Generator to read the IIIT dataset.

    Reads ``etc/txt.done.data.utf8`` under ``dataset_path`` line by line and
    pairs each transcript with ``wav/<file_id>.wav``. Entries whose wav file
    does not exist are skipped.

    Parsing is positional: 2 leading and 3 trailing characters are dropped,
    the next 8 characters are the file id and the transcript starts 3
    characters after the id.
    NOTE(review): these offsets presumably match a `( <id> " <text>" )`
    style line - confirm against the actual metadata file.

    Args:
        dataset_path: pathlib.Path of the extracted IIIT dataset folder.

    Yields:
        Tuples of (absolute wav path as str, transcript text).
    """
    text_file = dataset_path / 'etc' / 'txt.done.data.utf8'
    wav_dir = dataset_path / 'wav'

    with open(text_file, encoding='utf-8') as data_file:
        for line in data_file:
            # Remove the line terminator explicitly so that a final line
            # without a trailing newline is not cut one character short.
            stripped_line = line.rstrip('\n')
            cleaned_line = stripped_line[2:-3]
            file_id = cleaned_line[:8]
            text = cleaned_line[11:]
            wav_path = wav_dir / f"{file_id}.wav"
            wav_path_str = str(wav_path.absolute())
            if wav_path.exists():
                yield (wav_path_str, text)
    

In [31]:
# Stream the IIIT samples and write each one out as a JSON file;
# the cell output is the number of samples written
iiit_folder = dataset_folders['iiit']
dataset = get_iiit_mal_dataset(data_dir / iiit_folder)
output_dataset(dataset, output_dir / iiit_folder)

1000

## Openslr

The dataset was downloaded from [here](http://openslr.org/63/). Put the TSV files and the extracted dataset folders into a single directory. It should contain the following directories/files:
```
- line_index_female.tsv
- line_index_male.tsv
- ml_in_female (directory)
- ml_in_male (directory)
```

In [32]:
import csv

In [33]:
def get_openslr_dataset(dataset_path, subset_type):
    """
    Get generator for OPENSLR dataset.

    Reads the tab-separated ``line_index_<subset_type>.tsv`` under
    ``dataset_path``; column 0 is used as the file id and column 1 as the
    transcript. Audio is looked up as ``ml_in_<subset_type>/<file_id>.wav``
    and entries whose wav file does not exist are skipped.

    Args:
        dataset_path: pathlib.Path of the folder holding the TSV files and
            the extracted audio folders.
        subset_type: Subset suffix used in the file and folder names
            (the notebook passes 'female' and 'male').

    Yields:
        Tuples of (absolute wav path as str, transcript text).
    """
    meta_file = dataset_path / f"line_index_{subset_type}.tsv"
    wav_dir = dataset_path / f"ml_in_{subset_type}"
    # newline='' lets the csv module do its own line-ending handling
    with open(meta_file, encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        for row in reader:
            # csv.reader returns [] for blank lines; skip any row that does
            # not carry both an id and a transcript instead of crashing.
            if len(row) < 2:
                continue
            file_id, text = row[0], row[1]
            wav_path = wav_dir / f"{file_id}.wav"
            wav_path_str = str(wav_path.absolute())
            if wav_path.exists():
                yield (wav_path_str, text)
        
    

In [34]:
# OpenSLR ships one subset per speaker gender
types = ['female', 'male']
openslr_folder = dataset_folders['openslr']
# Write one JSON dataset directory per subset and print its sample count
for subset_type in types:
    dataset = get_openslr_dataset(data_dir / openslr_folder, subset_type)
    count = output_dataset(dataset, output_dir / openslr_folder / subset_type)
    print(count)

2103
2023


## Indic TTS

The dataset was downloaded from [here](https://www.kaggle.com/kavyamanohar/indic-tts-malayalam-speech-corpus)

In [35]:
def get_indic_tts_dataset(dataset_path, subset_type):
    """
    Generator to get INDIC TTS dataset.

    Looks in ``mono_<subset_type>_1/mono_<subset_type>`` under
    ``dataset_path``. The transcript file is ``txt.done.data_original`` when
    it exists, otherwise ``txt.done.data``. Audio is looked up as
    ``wav/<file_id>.wav`` in the same folder and entries whose wav file does
    not exist are skipped.

    Parsing is positional: 1 leading and 3 trailing characters are dropped,
    the next 15 characters are the file id and the transcript starts 2
    characters after the id.
    NOTE(review): these offsets presumably match the corpus' fixed-width
    ids - confirm against the actual metadata file.

    Args:
        dataset_path: pathlib.Path of the extracted Indic TTS folder.
        subset_type: Subset name used in the folder names
            (the notebook passes 'female' and 'male').

    Yields:
        Tuples of (absolute wav path as str, transcript text).
    """
    subset_folder = dataset_path / f"mono_{subset_type}_1" / f"mono_{subset_type}"
    meta_file = subset_folder / "txt.done.data_original"
    if not meta_file.exists():
        meta_file = subset_folder / "txt.done.data"
    wav_dir = subset_folder / 'wav'
    with open(meta_file, encoding='utf-8') as data_file:
        for line in data_file:
            # Remove the line terminator explicitly so that a final line
            # without a trailing newline is not cut one character short.
            stripped_line = line.rstrip('\n')
            cleaned_line = stripped_line[1:-3]
            file_id = cleaned_line[:15]
            text = cleaned_line[17:]
            wav_path = wav_dir / f"{file_id}.wav"
            wav_path_str = str(wav_path.absolute())
            if wav_path.exists():
                yield (wav_path_str, text)

In [36]:
# Indic TTS ships one subset per speaker gender
types = ['female', 'male']
indic_tts_folder = dataset_folders['indic-tts']
# Write one JSON dataset directory per subset and print its sample count
for subset_type in types:
    dataset = get_indic_tts_dataset(data_dir / indic_tts_folder, subset_type)
    count = output_dataset(dataset, output_dir / indic_tts_folder / subset_type)
    print(count)

2950
5649


## MSC Reviewed

The dataset from SMC was downloaded from [here](https://blog.smc.org.in/malayalam-speech-corpus/)

In [37]:
def get_msc_reviewed_dataset(dataset_path):
    """
    Get generator for MSC reviewed.

    Reads the tab-separated ``metadata.tsv`` under ``dataset_path``. The
    first row is treated as the header and printed. For every following row,
    column 1 is taken as the audio path relative to ``dataset_path`` and
    column 4 as the transcript. Entries whose audio file does not exist are
    skipped.

    Args:
        dataset_path: pathlib.Path of the extracted MSC reviewed folder.

    Yields:
        Tuples of (absolute audio path as str, transcript text).
    """
    meta_file = dataset_path / 'metadata.tsv'
    # Explicit UTF-8, like the other readers, so decoding the transcripts
    # does not depend on the platform's default encoding.
    with open(meta_file, encoding='utf-8', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        # A default avoids StopIteration escaping the generator (which would
        # surface as RuntimeError) when the metadata file is empty.
        headers = next(reader, None)
        if headers is None:
            return
        print(headers)
        for row in reader:
            # Blank or truncated rows have no transcript column; skip them.
            if len(row) < 5:
                continue
            wav_path = dataset_path / row[1]
            wav_path_str = str(wav_path.absolute())
            text = row[4]
            if wav_path.exists():
                yield (wav_path_str, text)

In [38]:
# Stream the MSC reviewed samples and write each one out as a JSON file;
# the cell output is the number of samples written
msc_folder = dataset_folders['msc-reviewed']
dataset = get_msc_reviewed_dataset(data_dir / msc_folder)
output_dataset(dataset, output_dir / msc_folder)

['speechid', 'speechpath', 'speaker_id', 'review_score', 'transcript', 'category', 'speaker_gender', 'speaker_age']


1541