## 1. Data Ingestion

In [16]:
from stt import CommonVoiceDataset
from utils import NAME_DATASET, TARGET_LANGUAGE, RAW_DATA_PATH, PROCESSED_DATA_PATH

In [2]:
# Build the Common Voice wrapper for the configured dataset name and
# language, then ask it to load the data.
cv_dataset = CommonVoiceDataset(
    name_dataset=NAME_DATASET,
    language=TARGET_LANGUAGE,
)
cv_dataset.load_dataset()

In [3]:
# Persist the loaded dataset under RAW_DATA_PATH so that later cells can
# reload it from disk instead of loading it again from the original source.
cv_dataset.save_to_disk(RAW_DATA_PATH)

Saving the dataset (0/1 shards):   0%|          | 0/4906 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2211 [00:00<?, ? examples/s]

In [4]:
# Remove the `cv_dataset` binding; from here on the data is taken from the
# copy saved under RAW_DATA_PATH.
del cv_dataset

In [5]:
# Reload the raw data from RAW_DATA_PATH through a fresh wrapper.
# The wrapper gets its own name so that `dataset` only ever refers to the
# object returned by get_dataset(), never to the CommonVoiceDataset wrapper.
raw_loader = CommonVoiceDataset(name_dataset=NAME_DATASET, language=TARGET_LANGUAGE)
raw_loader.load_from_disk(RAW_DATA_PATH)
dataset = raw_loader.get_dataset()

In [6]:
# Display the reloaded object to check its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4906
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2211
    })
})

## 2. Data Preprocessing

In [7]:
from utils import MODEL_NAME, SIMILAR_TARGET_LANGUAGE, TASK, NUM_PROC
from utils import MAX_INPUT_LENGTH

from stt import DatasetPreprocessor

In [8]:
# Set up the preprocessor from the configured model, language, task and
# maximum input length.
# NOTE(review): SIMILAR_TARGET_LANGUAGE is passed here rather than
# TARGET_LANGUAGE, presumably because the model has no entry for the target
# language itself; confirm against utils / stt.
preprocessor = DatasetPreprocessor(
    MODEL_NAME,
    SIMILAR_TARGET_LANGUAGE,
    TASK,
    MAX_INPUT_LENGTH,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Keep only the audio and its transcript; every other column
# (client_id, votes, age, gender, ...) is dropped.
columns_to_keep = ["audio", "sentence"]
dataset = dataset.select_columns(columns_to_keep)

In [10]:
# Inspect the feature types of the train split after column selection.
# NOTE(review): in the recorded run the audio column reports a 48 kHz sampling
# rate; if MODEL_NAME expects a different rate (e.g. 16 kHz), confirm that
# prepare_dataset resamples.
dataset["train"].features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [11]:
# Run the preprocessor over the dataset, passing NUM_PROC as its second argument.
# NOTE(review): `dataset` is rebound in place, so re-running this cell feeds the
# already-prepared data back into prepare_dataset. Presumably that fails or
# misbehaves because the original columns are gone; re-run from the load cell instead.
dataset = preprocessor.prepare_dataset(dataset, NUM_PROC)

Map:   0%|          | 0/4906 [00:00<?, ? examples/s]

Map:   0%|          | 0/2211 [00:00<?, ? examples/s]

In [14]:
# Apply the preprocessor's filter to the prepared dataset.
# NOTE(review): presumably this drops examples longer than MAX_INPUT_LENGTH;
# confirm in stt.DatasetPreprocessor.filter_dataset.
dataset = preprocessor.filter_dataset(dataset)

Filter:   0%|          | 0/4906 [00:00<?, ? examples/s]

In [15]:
# Show the train split after preparation and filtering (columns, row count).
dataset["train"]

Dataset({
    features: ['input_features', 'labels', 'input_length'],
    num_rows: 4906
})

In [17]:
# Write the prepared and filtered dataset to PROCESSED_DATA_PATH
# (PROCESSED_DATA_PATH comes from the utils import in the first cell).
dataset.save_to_disk(PROCESSED_DATA_PATH)

Saving the dataset (0/10 shards):   0%|          | 0/4906 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/2211 [00:00<?, ? examples/s]