In [1]:
from datasets import load_dataset, Dataset, Audio, DatasetDict
import pandas as pd
from tqdm import tqdm
from librosa import load, get_duration
from tqdm.notebook import tqdm


In [2]:
# Load the Indonesian ("id_id") FLEURS configuration with its train, validation
# and test splits concatenated into a single Dataset.
# trust_remote_code=True lets the dataset repository's loading script run locally.
fleurs_dataset_id = load_dataset(
    "google/fleurs",
    name="id_id",
    split="train+validation+test",
    trust_remote_code=True,
)


In [3]:
# Load the English ("en_us") FLEURS configuration with its train, validation
# and test splits concatenated into a single Dataset.
# trust_remote_code=True lets the dataset repository's loading script run locally.
fleurs_dataset_en = load_dataset(
    "google/fleurs",
    name="en_us",
    split="train+validation+test",
    trust_remote_code=True,
)

In [4]:
# Inspect one Indonesian record to see which fields FLEURS provides.
fleurs_dataset_id[2]

{'id': 89,
 'num_samples': 146880,
 'path': '/home/cobrayyxx/.cache/huggingface/datasets/downloads/extracted/843218dfe1b47ac56ebb3fd2fca946a8d26c2d01c4a617ad44ea01c01d0cc1c8/10014050057465037761.wav',
 'audio': {'path': 'train/10014050057465037761.wav',
  'array': array([0.        , 0.        , 0.        , ..., 0.01398671, 0.01056969,
         0.00827134]),
  'sampling_rate': 16000},
 'transcription': 'pihak berwajib tidak banyak memberi pernyataan resmi selain mengonfirmasi penahanan hari ini',
 'raw_transcription': 'Pihak berwajib tidak banyak memberi pernyataan resmi selain mengonfirmasi penahanan hari ini.',
 'gender': 1,
 'lang_id': 36,
 'language': 'Indonesian',
 'lang_group_id': 5}

In [5]:
# Inspect one English record; it has the same fields as the Indonesian one.
fleurs_dataset_en[0]

{'id': 903,
 'num_samples': 108800,
 'path': '/home/cobrayyxx/.cache/huggingface/datasets/downloads/extracted/aa5f35b1ec47eaf7b5b2b2300cb1a1b43ff29254f6dffd84be1b7defbaa4f070/10004088536354799741.wav',
 'audio': {'path': 'train/10004088536354799741.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -3.15904617e-06, -3.03983688e-06, -3.27825546e-06]),
  'sampling_rate': 16000},
 'transcription': 'a tornado is a spinning column of very low-pressure air which sucks the surrounding air inward and upward',
 'raw_transcription': 'A tornado is a spinning column of very low-pressure air, which sucks the surrounding air inward and upward.',
 'gender': 1,
 'lang_id': 19,
 'language': 'English',
 'lang_group_id': 0}

In [6]:
# Drop the columns that are not needed for the translation pairs.
# Indonesian keeps id, audio and raw_transcription; English keeps only id and
# raw_transcription, so its audio column is dropped as well.
id_unused_columns = ["num_samples", "path", "transcription", "gender", "lang_id", "language", "lang_group_id"]
en_unused_columns = ["num_samples", "path", "audio", "transcription", "gender", "lang_id", "language", "lang_group_id"]

fleurs_dataset_id = fleurs_dataset_id.remove_columns(id_unused_columns)
fleurs_dataset_en = fleurs_dataset_en.remove_columns(en_unused_columns)

In [7]:
# Rename "raw_transcription" (the cased, punctuated sentence) to a
# language-specific name so both texts can live side by side after the merge.
fleurs_dataset_id = fleurs_dataset_id.rename_column("raw_transcription", "text_indo")
fleurs_dataset_en = fleurs_dataset_en.rename_column("raw_transcription", "text_en")


In [8]:
# Confirm the remaining columns and the row count of each dataset.
print(fleurs_dataset_id)
print(fleurs_dataset_en)

Dataset({
    features: ['id', 'audio', 'text_indo'],
    num_rows: 3616
})
Dataset({
    features: ['id', 'text_en'],
    num_rows: 3643
})


In [9]:
# Count how many Indonesian rows have an id that also exists in the English set.
# The English ids are read once into a set, so each membership test is O(1)
# instead of re-reading the whole "id" column and scanning it for every row.
english_ids = set(fleurs_dataset_en["id"])
counter = sum(idx in english_ids for idx in fleurs_dataset_id["id"])
print(counter)

3561


In [10]:
# Identify duplicate ids: sort by id so rows sharing an id sit next to each other,
# then peek at rows 1-3. In the recorded output all three rows carry id 2 with
# different audio files, i.e. one sentence id can have several recordings.
sorted_dataset_id = fleurs_dataset_id.sort("id")
sorted_dataset_id[1:4]

{'id': [2, 2, 2],
 'audio': [{'path': 'train/3633734419227154.wav',
   'array': array([0.        , 0.        , 0.        , ..., 0.00149804, 0.00070363,
          0.00041729]),
   'sampling_rate': 16000},
  {'path': 'train/4977553378853742690.wav',
   'array': array([ 0.00000000e+00, -1.78813934e-07, -8.34465027e-07, ...,
          -1.28477812e-03, -1.85829401e-03, -1.56390667e-03]),
   'sampling_rate': 16000},
  {'path': 'train/9082789477530922497.wav',
   'array': array([0.        , 0.        , 0.        , ..., 0.00513917, 0.00303674,
          0.00273275]),
   'sampling_rate': 16000}],
 'text_indo': ['Ketua peneliti mengatakan bahwa diagnosis ini mungkin dapat menghasilkan deteksi dini kanker, tuberkulosis, HIV, dan malaria kepada pasien-pasien di negara berpenghasilan rendah, di mana tingkat kesembuhan dari penyakit-penyakit seperti kanker payudara bisa mencapai setengah dari negara-negara kaya.',
  'Ketua peneliti mengatakan bahwa diagnosis ini mungkin dapat menghasilkan deteksi di

In [11]:
# Identify duplicate ids in the English set the same way: sort by id and peek at
# rows 1-3. In the recorded output id 1 appears twice with identical text.
sorted_dataset_en = fleurs_dataset_en.sort("id")
sorted_dataset_en[1:4]

{'id': [1, 1, 2],
 'text_en': ['On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.',
  'On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.',
  'Lead researchers say this may bring early detection of cancer, tuberculosis, HIV and malaria to patients in low-income countries, where the survival rates for illnesses such as breast cancer can be half those of richer countries.']}

### Convert to Pandas

In [12]:
# NOTE(review): this repeats the assignment made in the previous cell
# (same source dataset, same sort key).
sorted_dataset_en = fleurs_dataset_en.sort("id")

In [13]:
# Spot-check that two rows of the sorted English set hold exactly the same sentence.
sorted_dataset_en["text_en"][3] ==  sorted_dataset_en["text_en"][5]

True

In [14]:
# Change Dataset format into a pandas DataFrame.
# In the frame the audio column holds dicts with the raw WAV bytes (see the
# df_id.head() output), not decoded sample arrays.
df_id = sorted_dataset_id.to_pandas()

In [15]:
# (rows, columns) of the Indonesian frame.
df_id.shape

(3616, 3)

In [16]:
# Non-null count per column; equal to the row count when nothing is missing.
df_id.count()

id           3616
audio        3616
text_indo    3616
dtype: int64

In [17]:
# Preview the first rows; repeated ids are still present on the Indonesian side.
df_id.head()

Unnamed: 0,id,audio,text_indo
0,1,{'bytes': b'RIFF2\x91\x14\x00WAVEfmt \x12\x00\...,Ilmuwan dari Stanford University School of Med...
1,2,{'bytes': b'RIFF2\x7f\x17\x00WAVEfmt \x12\x00\...,Ketua peneliti mengatakan bahwa diagnosis ini ...
2,2,{'bytes': b'RIFF2\n\x14\x00WAVEfmt \x12\x00\x0...,Ketua peneliti mengatakan bahwa diagnosis ini ...
3,2,{'bytes': b'RIFF2)\x13\x00WAVEfmt \x12\x00\x00...,Ketua peneliti mengatakan bahwa diagnosis ini ...
4,3,{'bytes': b'RIFF2*\x12\x00WAVEfmt \x12\x00\x00...,JAS 39C Gripen jatuh ke landasan pacu sekitar ...


In [18]:
# Build the English lookup table: convert to pandas, keep the first row for each
# id (the repeated rows only duplicate the text), and renumber the index.
df_en = (
    sorted_dataset_en.to_pandas()
    .drop_duplicates(subset=["id"], keep="first")
    .reset_index(drop=True)
)

In [19]:
# Preview the de-duplicated English frame: one row per id.
df_en.head()

Unnamed: 0,id,text_en
0,1,"On Monday, scientists from the Stanford Univer..."
1,2,Lead researchers say this may bring early dete...
2,3,The JAS 39C Gripen crashed onto a runway at ar...
3,4,The pilot was identified as Squadron Leader Di...
4,5,Local media reports an airport fire vehicle ro...


In [20]:
# (rows, columns) after dropping duplicate ids.
df_en.shape

(1976, 2)

### Merge the two DataFrames on their shared id

In [62]:
# Attach the English sentence to every Indonesian recording via the shared id.
# how="left" keeps Indonesian rows that have no English match (text_en is then
# missing) so they can be inspected before being dropped.
# validate="many_to_one" makes pandas raise a MergeError if df_en ever contains a
# repeated id, which would otherwise silently duplicate Indonesian rows.
df_merge = pd.merge(df_id, df_en, how="left", on="id", validate="many_to_one")

In [63]:
# Preview the first rows of the merged frame. (Only the last expression of a cell
# is displayed, so a bare `df_merge` line before it had no effect.)
df_merge.head()

Unnamed: 0,id,audio,text_indo,text_en
0,1,{'bytes': b'RIFF2\x91\x14\x00WAVEfmt \x12\x00\...,Ilmuwan dari Stanford University School of Med...,"On Monday, scientists from the Stanford Univer..."
1,2,{'bytes': b'RIFF2\x7f\x17\x00WAVEfmt \x12\x00\...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
2,2,{'bytes': b'RIFF2\n\x14\x00WAVEfmt \x12\x00\x0...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
3,2,{'bytes': b'RIFF2)\x13\x00WAVEfmt \x12\x00\x00...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
4,3,{'bytes': b'RIFF2*\x12\x00WAVEfmt \x12\x00\x00...,JAS 39C Gripen jatuh ke landasan pacu sekitar ...,The JAS 39C Gripen crashed onto a runway at ar...


In [64]:
# List every row in which at least one of the content columns is missing.
content_columns = ["text_indo", "text_en", "audio"]
df_merge[df_merge[content_columns].isna().any(axis=1)]

Unnamed: 0,id,audio,text_indo,text_en
109,49,{'bytes': b'RIFF2\x1f\x0e\x00WAVEfmt \x12\x00\...,Tuan Reid berhasil mengemudikan mobil A1 GP mi...,
110,49,{'bytes': b'RIFF2\x97\x0e\x00WAVEfmt \x12\x00\...,Tuan Reid berhasil mengemudikan mobil A1 GP mi...,
215,102,{'bytes': b'RIFF2\x9d\x08\x00WAVEfmt \x12\x00\...,"Lubang tangki masih bocor hingga Rabu sore, ke...",
216,102,{'bytes': b'RIFF2\xc6\x0c\x00WAVEfmt \x12\x00\...,"Lubang tangki masih bocor hingga Rabu sore, ke...",
459,249,{'bytes': b'RIFF2\x9c\t\x00WAVEfmt \x12\x00\x0...,"Sebuah losmen ambruk di Makkah, kota suci Isla...",
460,249,{'bytes': b'RIFF2q\x07\x00WAVEfmt \x12\x00\x00...,"Sebuah losmen ambruk di Makkah, kota suci Isla...",
490,267,{'bytes': b'RIFF2\xda\x07\x00WAVEfmt \x12\x00\...,Kontrol darat Rusia mengaktifkan jet-jet terse...,
491,267,{'bytes': b'RIFF2p\x08\x00WAVEfmt \x12\x00\x00...,Kontrol darat Rusia mengaktifkan jet-jet terse...,
532,291,{'bytes': b'RIFF2\xbb\x08\x00WAVEfmt \x12\x00\...,Kantor Meteorologi Islandia juga melaporkan ba...,
533,291,{'bytes': b'RIFF2x\x0f\x00WAVEfmt \x12\x00\x00...,Kantor Meteorologi Islandia juga melaporkan ba...,


In [65]:
# Drop every row that has a missing value in any column; per the check above these
# are the rows whose id has no English sentence. The surviving rows keep their
# original index labels (the index is not reset here).
df_merge = df_merge.dropna()

In [70]:
# Preview the frame with a fresh 0..n-1 index.
# NOTE: the result is only displayed, not assigned, so df_merge itself still
# carries the index labels it had before the dropna.
df_merge.reset_index(drop=True)

Unnamed: 0,id,audio,text_indo,text_en
0,1,{'bytes': b'RIFF2\x91\x14\x00WAVEfmt \x12\x00\...,Ilmuwan dari Stanford University School of Med...,"On Monday, scientists from the Stanford Univer..."
1,2,{'bytes': b'RIFF2\x7f\x17\x00WAVEfmt \x12\x00\...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
2,2,{'bytes': b'RIFF2\n\x14\x00WAVEfmt \x12\x00\x0...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
3,2,{'bytes': b'RIFF2)\x13\x00WAVEfmt \x12\x00\x00...,Ketua peneliti mengatakan bahwa diagnosis ini ...,Lead researchers say this may bring early dete...
4,3,{'bytes': b'RIFF2*\x12\x00WAVEfmt \x12\x00\x00...,JAS 39C Gripen jatuh ke landasan pacu sekitar ...,The JAS 39C Gripen crashed onto a runway at ar...
...,...,...,...,...
3556,2006,{'bytes': b'RIFF2\xf8\x07\x00WAVEfmt \x12\x00\...,Elemen seperti kalsium dan potasium adalah log...,Elements like calcium and potassium are consid...
3557,2007,{'bytes': b'RIFF2O\x0b\x00WAVEfmt \x12\x00\x00...,U.S. Corps of Engineers memperkirakan bahwa cu...,The U.S. Corps of Engineers estimated that 6 i...
3558,2008,{'bytes': b'RIFF2\xc1\x11\x00WAVEfmt \x12\x00\...,Subjek lain di dalam agenda di Bali antara lai...,Other subjects on the agenda in Bali include s...
3559,2009,{'bytes': b'RIFF2\x95\x10\x00WAVEfmt \x12\x00\...,Sumber nabati yang paling mudah diperoleh adal...,The most readily accessible plant resources wo...


In [71]:
# Re-run the missing-value check; an empty result means no incomplete rows remain.
content_columns = ["text_indo", "text_en", "audio"]
df_merge[df_merge[content_columns].isna().any(axis=1)]

Unnamed: 0,id,audio,text_indo,text_en


# Convert back to Dataset Object

In [72]:
# Convert back to Dataset format
# NOTE(review): df_merge still has its non-default index at this point, which
# appears to be why an extra "__index_level_0__" feature shows up in the result.
merged_dataset = Dataset.from_pandas(df_merge)

In [73]:
# Show the features and row count of the converted Dataset.
merged_dataset

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en', '__index_level_0__'],
    num_rows: 3561
})

In [83]:
# Drop the "__index_level_0__" column that the pandas -> Dataset conversion added
# (it is listed among the features shown above).
# Guarded so the cell can be re-run: remove_columns raises if the column is gone.
if "__index_level_0__" in merged_dataset.column_names:
    merged_dataset = merged_dataset.remove_columns(["__index_level_0__"])

In [84]:
# See the data format of one audio entry: a dict with the file path, the sample
# array and the sampling rate.
merged_dataset[1100]["audio"]

{'path': 'train/17968311064306361757.wav',
 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00020576,
        -0.00021666, -0.00022811]),
 'sampling_rate': 16000}

In [85]:
# Re-cast the audio column from bytes into an Audio feature at 16 kHz, the same
# sampling rate shown in the records above.
merged_dataset = merged_dataset.cast_column("audio", Audio(sampling_rate=16000))

### Split into train and validation

In [86]:
# Hold out 20% of the rows for validation. A fixed seed makes the shuffle, and
# therefore the published split, reproducible across kernel restarts.
# NOTE(review): rows are split individually, and one id can have several
# recordings, so the same sentence may end up in both splits — confirm whether
# that is acceptable for evaluation.
SPLIT_SEED = 42
dataset = merged_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [87]:
# Look at the first training record after the split.
dataset["train"][0]

{'id': 370,
 'audio': {'path': 'train/6166977075470649137.wav',
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -1.78813934e-07,  4.09364700e-04,  4.82201576e-05]),
  'sampling_rate': 16000},
 'text_indo': 'Beberapa ilmuwan mengira Triceratop memakan palem sagu, yang merupakan jenis tumbuhan yang umum di zaman Kapur.',
 'text_en': 'Some scientists think Triceratops ate cycads, which are a type of plant that was common in the Cretaceous.'}

In [88]:
# Look at the first held-out record after the split.
dataset["test"][0]

{'id': 586,
 'audio': {'path': 'train/4317314961397545836.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ...,  0.00035083,
          0.00042075, -0.00119245]),
  'sampling_rate': 16000},
 'text_indo': 'Namun, teleskop asli pertama dibuat di Eropa pada akhir abad ke-16.',
 'text_en': 'However, the first true telescopes were made in Europe in the late 16th century.'}

In [89]:
# Re-key the splits: keep "train" as is and expose the held-out "test" part
# under the name "validation".
split_renames = {"train": "train", "validation": "test"}
dataset = DatasetDict(
    {new_name: dataset[old_name] for new_name, old_name in split_renames.items()}
)

In [90]:
# Features and size of the training split.
dataset["train"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 2848
})

In [91]:
# Features and size of the validation split.
dataset["validation"]

Dataset({
    features: ['id', 'audio', 'text_indo', 'text_en'],
    num_rows: 713
})

In [92]:
# Publish both splits to the Hugging Face Hub as a public dataset repository.
HUB_REPO_ID = "cobrayyxx/FLEURS_INDO-ENG_Speech_Translation"
dataset.push_to_hub(HUB_REPO_ID, private=False)

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/570 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Map:   0%|          | 0/569 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/cobrayyxx/FLEURS_INDO-ENG_Speech_Translation/commit/65b9009a9368657186a50972f664474e3a3f2946', commit_message='Upload dataset', commit_description='', oid='65b9009a9368657186a50972f664474e3a3f2946', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/cobrayyxx/FLEURS_INDO-ENG_Speech_Translation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='cobrayyxx/FLEURS_INDO-ENG_Speech_Translation'), pr_revision=None, pr_num=None)

# Compute the total audio duration

In [93]:
def get_dataset_duration(audio_dataset):
    """Return the summed duration of all clips, formatted as ``H:MM:SS``.

    Args:
        audio_dataset: Sized iterable of audio entries. Each entry is indexed
            with ``"array"`` (the samples) and ``"sampling_rate"``.

    Returns:
        str: Total duration rounded to the nearest second, e.g. ``"12:24:28"``.
        Minutes and seconds are zero-padded; there is no trailing whitespace.
    """
    total_seconds = 0.0
    for audio in tqdm(audio_dataset, total=len(audio_dataset)):
        total_seconds += get_duration(y=audio["array"], sr=audio["sampling_rate"])

    # Round once on the grand total and split integers afterwards. Rounding only
    # the seconds field at format time could print "60" (e.g. 59.6 s -> "0:0:60").
    hours, remainder = divmod(int(round(total_seconds)), 60 * 60)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:d}:{minutes:02d}:{seconds:02d}"

In [94]:
# Sum the duration of every clip in the merged (pre-split) dataset.
total_duration = get_dataset_duration(merged_dataset["audio"])

  0%|          | 0/3561 [00:00<?, ?it/s]

In [95]:
# Total audio length as an hours:minutes:seconds string.
total_duration

'12:24:28 '

In [96]:
# NOTE: this redefines get_dataset_duration from an earlier cell; the later
# definition is the one in effect once this cell has run.
def get_dataset_duration(audio_dataset):
  """Return the summed duration of all clips, formatted as ``H:MM:SS``.

  Args:
    audio_dataset: Sized iterable of audio entries. Each entry is indexed
      with ``"array"`` (the samples) and ``"sampling_rate"``.

  Returns:
    str: Total duration rounded to the nearest second, e.g. ``"12:24:28"``.
    Minutes and seconds are zero-padded.
  """
  total_seconds = 0.0
  for audio in tqdm(audio_dataset, total=len(audio_dataset)):
    total_seconds += get_duration(y=audio["array"], sr=audio["sampling_rate"])

  # Round once on the grand total and split integers afterwards. Rounding only
  # the seconds field at format time could print "60" (e.g. 59.6 s -> "0:0:60").
  hours, remainder = divmod(int(round(total_seconds)), 60 * 60)
  minutes, seconds = divmod(remainder, 60)

  return f"{hours:d}:{minutes:02d}:{seconds:02d}"