In [1]:
from datasets import load_dataset, Dataset, Audio, DatasetDict
import pandas as pd
from tqdm import tqdm
from librosa import load, get_duration
from tqdm.notebook import tqdm


In [8]:
# Load the Indonesian -> English ("id_en") CoVoST 2 configuration. The three
# official splits are concatenated into a single Dataset via the "+" split
# syntax; a custom train/validation split is made further down.
# `data_dir` points at the locally available Common Voice Indonesian files.
# NOTE(review): trust_remote_code=True lets the dataset repository's loading
# script run locally - confirm the source is trusted before re-running.
covost2_dataset = load_dataset(
    "facebook/covost2",
    "id_en",
    data_dir="covost2_id_en/id",
    split="train+validation+test",
    trust_remote_code=True,
)

Downloading data:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1243 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/792 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/844 [00:00<?, ? examples/s]

In [11]:
covost2_dataset

Dataset({
    features: ['client_id', 'file', 'audio', 'sentence', 'translation', 'id'],
    num_rows: 2879
})

In [14]:
covost2_dataset[2]

{'client_id': 'e4a9f157112dd7dbdf0b16ff07a6c067d193440280dd634caced1b972c5ad3600e18eed6bea56a4852218c0039067b7b246063d23cd891637fd7e5d7992b3193',
 'file': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19090412.mp3',
 'audio': {'path': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19090412.mp3',
  'array': array([ 2.91038305e-11, -1.60071068e-10,  5.82076609e-11, ...,
         -1.03077582e-06, -7.05323691e-06,  7.26000872e-05]),
  'sampling_rate': 16000},
 'sentence': 'Di belakang rumah ada mobil.',
 'translation': 'Behind the house, there is a car.',
 'id': 'common_voice_id_19090412'}

# Convert to Pandas for Preprocessing

In [17]:
# Convert the Dataset to a pandas DataFrame for column-level preprocessing.
# In the frame the audio column holds {'bytes', 'path'} dicts rather than
# decoded waveforms (see the output below).
df_covost2 = covost2_dataset.to_pandas()
df_covost2.tail()

Unnamed: 0,client_id,file,audio,sentence,translation,id
2874,8b06b50dc705efb9e9c4bea38dad95d43b7ec78af963af...,/home/cobrayyxx/coding-coding/personal/speech-...,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Bhinneka Tunggal Ika.,Unity in Diversity.,common_voice_id_19258506
2875,8b06b50dc705efb9e9c4bea38dad95d43b7ec78af963af...,/home/cobrayyxx/coding-coding/personal/speech-...,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Saya menyusun piring di meja.,I arrange a plate on the table.,common_voice_id_19258507
2876,8b06b50dc705efb9e9c4bea38dad95d43b7ec78af963af...,/home/cobrayyxx/coding-coding/personal/speech-...,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Dia kaya dengan cepat.,He is rich quickly.,common_voice_id_19258508
2877,8b06b50dc705efb9e9c4bea38dad95d43b7ec78af963af...,/home/cobrayyxx/coding-coding/personal/speech-...,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Dia mendapatkan nilai bagus dalam bahasa inggris.,He got good scores in English.,common_voice_id_19258509
2878,8b06b50dc705efb9e9c4bea38dad95d43b7ec78af963af...,/home/cobrayyxx/coding-coding/personal/speech-...,"{'bytes': None, 'path': '/home/cobrayyxx/codin...","Selamat malam, Timmy.","Good evening, Timmy.",common_voice_id_19258510


In [25]:
# Drop the client identifier and the redundant `file` path (the same path is
# already stored inside the `audio` column). Only `columns=` is passed: the
# former `axis=0` argument means "rows" and is ignored by pandas whenever
# `columns=` is given, so it was dead and misleading.
df_covost2 = df_covost2.drop(columns=["client_id", "file"])

In [26]:
# Move the "id" column to the front: pop() removes it from the frame and
# returns it as a Series, which is then re-inserted at position 0.
df_id = df_covost2.pop("id")
df_covost2.insert(loc=0, column="id", value=df_id)

In [27]:
df_covost2.head()

Unnamed: 0,id,audio,sentence,translation
0,common_voice_id_19090410,"{'bytes': None, 'path': '/home/cobrayyxx/codin...","""""""Cepatlah berangkat!"""" ia berujar padaku.""","""Hurry and go!"" he said to me."
1,common_voice_id_19090411,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Barang Pak Kimura kecil dan ringan.,Mr. Kimura's things are small and light.
2,common_voice_id_19090412,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Di belakang rumah ada mobil.,"Behind the house, there is a car."
3,common_voice_id_19090413,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Tolong tanyain dong sama dia!,Please ask him!
4,common_voice_id_19090414,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Apa yang dokter katakan?,What did the doctor say?


In [31]:
# Give the two text columns language-specific names:
# source sentence -> text_indo, English translation -> text_en.
df_covost2 = df_covost2.rename(
    columns={
        "sentence": "text_indo",
        "translation": "text_en",
    }
)

In [32]:
df_covost2.head()

Unnamed: 0,id,audio,text_indo,text_en
0,common_voice_id_19090410,"{'bytes': None, 'path': '/home/cobrayyxx/codin...","""""""Cepatlah berangkat!"""" ia berujar padaku.""","""Hurry and go!"" he said to me."
1,common_voice_id_19090411,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Barang Pak Kimura kecil dan ringan.,Mr. Kimura's things are small and light.
2,common_voice_id_19090412,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Di belakang rumah ada mobil.,"Behind the house, there is a car."
3,common_voice_id_19090413,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Tolong tanyain dong sama dia!,Please ask him!
4,common_voice_id_19090414,"{'bytes': None, 'path': '/home/cobrayyxx/codin...",Apa yang dokter katakan?,What did the doctor say?


In [33]:
df_covost2.shape

(2879, 4)

In [62]:
# Check for null values: info() lists the non-null count of every column,
# which should equal the total number of rows.
df_covost2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2879 entries, 0 to 2878
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         2879 non-null   object
 1   audio      2879 non-null   object
 2   text_indo  2879 non-null   object
 3   text_en    2879 non-null   object
dtypes: object(4)
memory usage: 90.1+ KB


# Convert Back to Dataset Object

In [35]:
# Convert the cleaned DataFrame back into a Dataset. This rebinds
# `covost2_dataset`, replacing the originally loaded dataset object.
covost2_dataset = Dataset.from_pandas(df_covost2)

In [36]:
covost2_dataset

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 2879
})

In [39]:
covost2_dataset[1]["audio"]

{'bytes': None,
 'path': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19090411.mp3'}

In [44]:
# After the pandas round trip the audio column only holds {'bytes', 'path'}
# dicts. Re-cast it to the Audio feature so that accessing a row yields the
# waveform array together with a 16 kHz sampling rate again.
covost2_dataset = covost2_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [45]:
covost2_dataset[1]["audio"]

{'path': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19090411.mp3',
 'array': array([ 2.32830644e-10,  2.09547579e-09,  5.82076609e-10, ...,
        -1.67789403e-05,  2.81736720e-06,  4.79968730e-06]),
 'sampling_rate': 16000}

# Split into Train and Val

In [46]:
# Hold out 20% of the rows. The fixed seed makes the shuffle deterministic, so
# the train/validation membership is the same after every kernel restart
# instead of changing on each run.
splitted_dataset = covost2_dataset.train_test_split(test_size=0.2, seed=42)

In [48]:
splitted_dataset["train"][0]

{'id': 'common_voice_id_19192633',
 'audio': {'path': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19192633.mp3',
  'array': array([ 4.54747351e-13,  0.00000000e+00, -5.00222086e-12, ...,
         -7.21809511e-06, -8.87564602e-06, -5.72347926e-06]),
  'sampling_rate': 16000},
 'text_indo': 'Tidak ada tuhan selain Allah. Muhammad adalah utusan Allah.',
 'text_en': 'There is no God but Allah. Muhammad is the messenger of God.'}

In [53]:
splitted_dataset["test"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 576
})

In [49]:
splitted_dataset["test"][0]

{'id': 'common_voice_id_19668012',
 'audio': {'path': '/home/cobrayyxx/coding-coding/personal/speech-translation-mentorship/covost2_id_en/id/clips/common_voice_id_19668012.mp3',
  'array': array([-2.03726813e-10, -2.40106601e-10, -2.54658516e-10, ...,
          2.18490401e-04,  1.59549294e-04, -6.98005897e-05]),
  'sampling_rate': 16000},
 'text_indo': 'Dia harus datang kerumahmu setelah satu jam.',
 'text_en': 'He has to come to your house after an hour.'}

In [50]:
# Bundle the two parts under their final split names: the held-out "test"
# portion produced by train_test_split is exposed as "validation".
dataset = DatasetDict(
    {
        "train": splitted_dataset["train"],
        "validation": splitted_dataset["test"],
    }
)

In [51]:
dataset["validation"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 576
})

In [52]:
dataset["train"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 2303
})

# Total Duration of Audio

In [59]:
def get_dataset_duration(audio_dataset):
    """Return the combined playing time of a collection of audio clips.

    Args:
        audio_dataset: Sized iterable of mappings; each one provides the
            waveform under ``"array"`` and its rate under ``"sampling_rate"``.

    Returns:
        The total duration formatted as ``"H:MM:SS"``, rounded to the nearest
        whole second. Minutes and seconds are zero-padded to two digits.
    """
    total_seconds = 0.0
    for audio in tqdm(audio_dataset, total=len(audio_dataset)):
        total_seconds += get_duration(y=audio["array"], sr=audio["sampling_rate"])

    # Round once, on the grand total, *before* splitting into fields.
    # Rounding each field separately (as "{:0.0f}" does) can print an invalid
    # "60" in the seconds field, e.g. 3599.7 s would render as 0:59:60.
    whole_seconds = int(round(total_seconds))
    minutes, seconds = divmod(whole_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours}:{minutes:02d}:{seconds:02d}"

In [60]:
# Measure the whole corpus (the unsplit dataset, i.e. train and validation
# together). Only the audio column is passed because the helper reads nothing
# but each clip's "array" and "sampling_rate".
total_duration = get_dataset_duration(covost2_dataset["audio"])

  0%|          | 0/2879 [00:00<?, ?it/s]

In [61]:
total_duration

'2:58:59 '

# Upload to HuggingFace

In [58]:
# Publish both splits to the Hugging Face Hub as a public dataset repository.
dataset.push_to_hub(
    "cobrayyxx/COVOST2_INDO-ENG_Speech_Translation",
    private=False,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2303 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cobrayyxx/COVOST2_INDO-ENG_Speech_Translation/commit/72deb9935942dfab42e2e4941c5681ace28ffe51', commit_message='Upload dataset', commit_description='', oid='72deb9935942dfab42e2e4941c5681ace28ffe51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cobrayyxx/COVOST2_INDO-ENG_Speech_Translation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cobrayyxx/COVOST2_INDO-ENG_Speech_Translation'), pr_revision=None, pr_num=None)