In [71]:
from datasets import load_dataset, Dataset, Audio, DatasetDict
import pandas as pd
from tqdm import tqdm
from librosa import load, get_duration
from tqdm.notebook import tqdm


In [22]:
fleurs_dataset_id = load_dataset("google/fleurs", 
                      name="id_id", 
                      split="train+validation+test",
                      trust_remote_code=True
                        )


In [23]:
fleurs_dataset_en = load_dataset("google/fleurs",
                                  name="en_us",
                                  split="train+validation+test",
                                  trust_remote_code=True
                                )

In [24]:
fleurs_dataset_id[2]

{'id': 89,
 'num_samples': 146880,
 'path': '/home/cobrayyxx/.cache/huggingface/datasets/downloads/extracted/843218dfe1b47ac56ebb3fd2fca946a8d26c2d01c4a617ad44ea01c01d0cc1c8/10014050057465037761.wav',
 'audio': {'path': 'train/10014050057465037761.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.01398671, 0.01056969,
         0.00827134]),
  'sampling_rate': 16000},
 'transcription': 'pihak berwajib tidak banyak memberi pernyataan resmi selain mengonfirmasi penahanan hari ini',
 'raw_transcription': 'Pihak berwajib tidak banyak memberi pernyataan resmi selain mengonfirmasi penahanan hari ini.',
 'gender': 1,
 'lang_id': 36,
 'language': 'Indonesian',
 'lang_group_id': 5}

In [25]:
fleurs_dataset_en[0]

{'id': 903,
 'num_samples': 108800,
 'path': '/home/cobrayyxx/.cache/huggingface/datasets/downloads/extracted/aa5f35b1ec47eaf7b5b2b2300cb1a1b43ff29254f6dffd84be1b7defbaa4f070/10004088536354799741.wav',
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06]),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [26]:
# Removing unused columns
fleurs_dataset_id = fleurs_dataset_id.remove_columns(["num_samples", "path", "transcription", "gender", "lang_id", "language", "lang_group_id" ])
fleurs_dataset_en = fleurs_dataset_en.remove_columns(["num_samples", "path", "audio", "transcription", "gender", "lang_id", "language", "lang_group_id", ])

In [27]:
# Rename "raw_transcription" column
fleurs_dataset_id = fleurs_dataset_id.rename_column("raw_transcription", "text_indo")
fleurs_dataset_en = fleurs_dataset_en.rename_column("raw_transcription", "text_en")


In [28]:
print(fleurs_dataset_id)
print(fleurs_dataset_en)

Dataset({
    features: ['id', 'audio', 'text_indo'],
    num_rows: 3616
})
Dataset({
    features: ['id', 'text_en'],
    num_rows: 3643
})


In [29]:
# Find how many columns that are intertwined
counter = 0
for idx in fleurs_dataset_id["id"]:
    if idx in fleurs_dataset_en["id"]:
        counter+=1
print(counter)

3561


In [30]:
# Identify duplicate id
sorted_dataset_id = fleurs_dataset_id.sort("id")
sorted_dataset_id[1:4]

{'id': [2, 2, 2],
 'audio': [{'path': 'train/3633734419227154.wav',
   'array': array([0.        , 0.        , 0.        , ..., 0.00149804, 0.00070363,
          0.00041729]),
   'sampling_rate': 16000},
  {'path': 'train/4977553378853742690.wav',
   'array': array([ 0.00000000e+00, -1.78813934e-07, -8.34465027e-07, ...,
          -1.28477812e-03, -1.85829401e-03, -1.56390667e-03]),
   'sampling_rate': 16000},
  {'path': 'train/9082789477530922497.wav',
   'array': array([0.        , 0.        , 0.        , ..., 0.00513917, 0.00303674,
          0.00273275]),
   'sampling_rate': 16000}],
 'text_indo': ['Ketua peneliti mengatakan bahwa diagnosis ini mungkin dapat menghasilkan deteksi dini kanker, tuberkulosis, HIV, dan malaria kepada pasien-pasien di negara berpenghasilan rendah, di mana tingkat kesembuhan dari penyakit-penyakit seperti kanker payudara bisa mencapai setengah dari negara-negara kaya.',
  'Ketua peneliti mengatakan bahwa diagnosis ini mungkin dapat menghasilkan deteksi di

In [31]:
# Identify duplicate en
sorted_dataset_en = fleurs_dataset_en.sort("id")
sorted_dataset_en[1:4]

{'id': [1, 1, 2],
 'text_en': ['On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.',
  'On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.',
  'Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.']}

### Convert to Pandas

In [32]:
sorted_dataset_en = fleurs_dataset_en.sort("id")

In [33]:
sorted_dataset_en["text_en"][3] ==  sorted_dataset_en["text_en"][5]

True

In [34]:
# Change Dataset format into pandas Dataframe
df_id = sorted_dataset_id.to_pandas()

In [35]:
df_id.shape

(3616, 3)

In [36]:
df_id.count()

id           3616
audio        3616
text_indo    3616
dtype: int64

In [37]:
df_id.head()Scenario

Unnamed: 0,id,audio,text_indo
0,1,{'bytes': b'RIFF2\x91\x14\x00WAVEfmt \x12\x00\...,Ilmuwan dari Stanford University School of Med...
1,2,{'bytes': b'RIFF2\x7f\x17\x00WAVEfmt \x12\x00\...,Ketua peneliti mengatakan bahwa diagnosis ini ...
2,2,{'bytes': b'RIFF2\n\x14\x00WAVEfmt \x12\x00\x0...,Ketua peneliti mengatakan bahwa diagnosis ini ...
3,2,{'bytes': b'RIFF2)\x13\x00WAVEfmt \x12\x00\x00...,Ketua peneliti mengatakan bahwa diagnosis ini ...
4,3,{'bytes': b'RIFF2*\x12\x00WAVEfmt \x12\x00\x00...,JAS 39C Gripen jatuh ke landasan pacu sekitar ...


In [38]:
# Change Dataset format into pandas Dataframe
df_en = sorted_dataset_en.to_pandas()
#drop the duplicate id since it's only text
df_en = df_en.drop_duplicates(subset=["id"], keep="first").reset_index(drop=True)

In [39]:
df_en.head()

Unnamed: 0,id,text_en
0,1,"On Monday, scientists from the Stanford Univer..."
1,2,Lead researchers say this may bring early dete...
2,3,The JAS 39C Gripen crashed onto a runway at ar...
3,4,The pilot was identified as Squadron Leader Di...
4,5,Local media reports an airport fire vehicle ro...


In [40]:
df_en.shape

(1976, 2)

### Merge the two Dataframe that has the same id

In [41]:
df_merge = pd.merge(df_id, df_en, how="left", on="id")

In [42]:
df_merge
df_merge.head()

Unnamed: 0,id,audio,text_indo,text_en
0,1,{'bytes': b'RIFF2\x91\x14\x00WAVEfmt \x12\x00\...,Ilmuwan dari Stanford University School of Med...,"On Monday, scientists from the Stanford Univer..."
1,2,{'bytes': b'RIFF2\x7f\x17\x00WAVEfmt \x12\x00\...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
2,2,{'bytes': b'RIFF2\n\x14\x00WAVEfmt \x12\x00\x0...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
3,2,{'bytes': b'RIFF2)\x13\x00WAVEfmt \x12\x00\x00...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
4,3,{'bytes': b'RIFF2*\x12\x00WAVEfmt \x12\x00\x00...,JAS 39C Gripen jatuh ke landasan pacu sekitar ...,The JAS 39C Gripen crashed onto a runway at ar...


In [51]:
# check whether there's a "None" value
df_merge[(df_merge["text_indo"] == None) | (df_merge["text_en"] == None)]

Unnamed: 0,id,audio,text_indo,text_en


# Convert back to Dataset Object

In [None]:
# Convert back to Dataset format
merged_dataset = Dataset.from_pandas(df_merge)

In [None]:
merged_dataset

In [None]:
# see the data format
merged_dataset[1100]["audio"]

In [55]:
# Re-cast the audio column from byte into Audio object
merged_dataset = merged_dataset.cast_column("audio", Audio(sampling_rate=16000))

In [56]:
merged_dataset[100]

{'id': 44,
 'audio': {'path': 'train/17063152554875437734.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00526178,
         -0.00335544, -0.00391346]),
  'sampling_rate': 16000},
 'text_indo': 'Fred saat ini memiliki kecepatan angin 105 mil per jam (165 km/jam) dan bergerak menuju barat laut.',
 'text_en': 'Fred currently has winds of 105 miles per hour (165 km/h) and is moving towards the northwest.'}

### Split into train and val

In [57]:
dataset = merged_dataset.train_test_split(test_size=0.2)

In [58]:
dataset["train"][0]

{'id': 1803,
 'audio': {'path': 'test/12396623553815830790.wav',
  'array': array([ 0.00000000e+00,  5.96046448e-08, -5.96046448e-08, ...,
          1.71566010e-03,  1.91479921e-03,  4.61918116e-03]),
  'sampling_rate': 16000},
 'text_indo': 'Kelembapan di tangan Anda akan bereaksi dengan lapisan luar, yang akan terasa lucu dan membentuk semacam cangkang.',
 'text_en': 'The moisture on your hands will react with the outer layers, which will feel funny and form a sort of shell.'}

In [59]:
dataset["test"][0]

{'id': 463,
 'audio': {'path': 'train/16282931550754983202.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00059396,
         -0.00049382, -0.00076425]),
  'sampling_rate': 16000},
 'text_indo': 'Hal ini dilakukan untuk memastikan gambar menutupi seluruh layar. Hal tersebut dinamakan overscan.',
 'text_en': 'This is made to ensure that the image covers the whole screen. That is called overscan.'}

In [60]:
dataset= DatasetDict({
    'train': dataset['train'],
    'validation': dataset['test']})

In [61]:
dataset["train"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 2892
})

In [62]:
dataset["validation"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 724
})

In [63]:
dataset.push_to_hub("cobrayyxx/indo-eng_speech_translation_datasets", private=False)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/579 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/579 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/362 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cobrayyxx/indo-eng_speech_translation_datasets/commit/827c9bf607da802e7a7aa327fd907bdffba6065f', commit_message='Upload dataset', commit_description='', oid='827c9bf607da802e7a7aa327fd907bdffba6065f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cobrayyxx/indo-eng_speech_translation_datasets', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cobrayyxx/indo-eng_speech_translation_datasets'), pr_revision=None, pr_num=None)

# add total duration of audio

In [78]:
def get_dataset_duration(audio_dataset):
    seconds=0
    for audio in tqdm(audio_dataset, total=len(audio_dataset)):
        audio_array = audio["array"]
        sampling_rate = audio["sampling_rate"]
        audio_duration = get_duration(y=audio_array, sr=sampling_rate)
        seconds+=audio_duration

    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes,60)
    dataset_duration = f"{hours:0.0f}:{minutes:0.0f}:{seconds:0.0f} "
    return dataset_duration

In [79]:
total_duration = get_dataset_duration(merged_dataset["audio"])

  0%|          | 0/3616 [00:00<?, ?it/s]

In [80]:
total_duration

'12:37:5 '

In [77]:
def get_dataset_duration(audio_dataset):
  seconds = 0
  for audio in tqdm(audio_dataset, total=len(audio_dataset)):
    audio_array = audio["array"]
    sampling_rate = audio["sampling_rate"]
    audio_duration = get_duration(y=audio_array, sr=sampling_rate)
    seconds += audio_duration

  minutes, seconds = divmod(seconds, 60)
  hours, minutes = divmod(minutes, 60)
  dataset_duration = f"{hours:0.0f}:{minutes:0.0f}:{seconds:0.0f}"

  return dataset_duration