# Setup

### Install required libraries

In [1]:
!pip install datasets # to get training data
!pip install transformers # to train whisper model



### HuggingFace Hub login

Logging in is required for access to the Common Voice dataset.

In [2]:
# Authenticate this notebook session with the Hugging Face Hub.
from huggingface_hub import notebook_login

# Displays a login widget in the cell output; the login is needed for
# access to the Common Voice dataset used below.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Get Dataset

We'll load the Hindi subset of the FLEURS dataset and the Hindi subset of the Common Voice 13 dataset (to have more Hindi data).

In [3]:
from datasets import load_dataset, DatasetDict

# Our split name -> split name used by the datasets on the Hub.
hub_split_for = {"train": "train", "valid": "validation"}

# Hindi subset of FLEURS (train first, then validation).
fleurs = DatasetDict(
    {
        split: load_dataset("google/fleurs", "hi_in", split=hub_split)
        for split, hub_split in hub_split_for.items()
    }
)

# Hindi subset of Common Voice 13 (train first, then validation).
common_voice = DatasetDict(
    {
        split: load_dataset("mozilla-foundation/common_voice_13_0", "hi", split=hub_split)
        for split, hub_split in hub_split_for.items()
    }
)

# Show the features and row counts of each corpus.
print(fleurs)
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 2120
    })
    valid: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 239
    })
})
DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4479
    })
    valid: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2281
    })
})


Remove the additional metadata columns, which we don't need; only the audio and its text are kept.

In [4]:
# Metadata columns to drop from each corpus; what remains is the audio and its text.
fleurs_metadata_columns = [
    'id', 'num_samples', 'path', 'raw_transcription',
    'gender', 'lang_id', 'language', 'lang_group_id',
]
common_voice_metadata_columns = [
    'client_id', 'path', 'up_votes', 'down_votes', 'age',
    'gender', 'accent', 'locale', 'segment', 'variant',
]

fleurs = fleurs.remove_columns(fleurs_metadata_columns)
common_voice = common_voice.remove_columns(common_voice_metadata_columns)

# Confirm which features are left in each corpus.
print(fleurs)
print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 2120
    })
    valid: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 239
    })
})
DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4479
    })
    valid: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2281
    })
})


Combine both datasets into a single dataset.

In [5]:
from datasets import concatenate_datasets, Audio

# Align Common Voice with FLEURS before merging:
#   - rename the 'sentence' column to 'transcription' so the column names match
#   - cast the audio column to 16 kHz so the sampling rate matches
common_voice = (
    common_voice
    .rename_column('sentence', 'transcription')
    .cast_column("audio", Audio(sampling_rate=16000))
)

# Merge the two corpora split by split, FLEURS rows first.
ds = DatasetDict(
    {
        split: concatenate_datasets([fleurs[split], common_voice[split]])
        for split in ('train', 'valid')
    }
)

ds

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 6599
    })
    valid: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 2520
    })
})