# Prepare filelists for MADASR23 dataset


In [1]:
# Language code of the dataset.
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
language = "bn"

# Input locations: root directory of the audio files and the metadata table.
audio_dir = "/path/to/madasr23/bn"
metadata_file_path = "../filelists/metadata.csv"

# Prefix that stands in for `audio_dir` in the paths written to the filelists.
symlink = "DUMMY3"

# Number of utterances held out for the validation and test filelists.
n_val, n_test = 100, 500

In [2]:
import os
import glob
import pandas as pd

### Read dataset

Here `normalized_text` contains numbers in the form of words.

**Note**: you may need to replace all `"|"` with `" | "` in the file `metadata.csv` if you are using Windows.


In [3]:
# Load the pipe-delimited metadata table. The first line of the file holds the
# column names, and no column is used as the row index.
data = pd.read_csv(metadata_file_path, sep=r"|", header=0, index_col=False)
data.head()

Unnamed: 0,spkidx,spkid,txtid,uttid,text,phonemized_text,cleaned_text
0,0,16777288,629046,281474981563595,বাঁশের প্রায় দশ হাজার প্রজাতি হয় যেমন বাম্বুসা...,bˈãʃeɾ pɾˈajo dˈɔʃ hˈaɟaɾ pɾˈoɟatˌi hˈɔjo d͡ʒ...,2\t27\t136\t26\t150\t109\t30\t105\t5\t40\t105\...
1,0,16777288,629072,281474981405386,এক ধরনের পদ্ধতি যেটা দিয়ে শস্য থেকে খোসা ছাড়ান...,ˈek dʰˈɔɾɔnˌeɾ pˈɔddʰɔtˌi d͡ʒˈeʈa dˈie ʃˈɔssɔ ...,2\t136\t30\t35\t5\t29\t132\t136\t66\t105\t66\t...
2,0,16777288,629108,281474981563619,গুগলি বা ঝিনুকের মানে হল ওয়েস্টার আমরা খাই,ɡˈuɡɔlˌi bˈa ɟʰˈinukˌeɾ mˈane hˈɔl ˈoːeʃʈˌaɾ ˌ...,2\t79\t136\t45\t79\t66\t36\t137\t33\t5\t27\t13...
3,0,16777288,629112,281474981581650,বাগ আরশোলা ইত্যাদি সব পোকা গুলোর চাষ হয়,bˈaɡ ˈaɾɔʃˌola ˈitædˌi ʃˈɔb pˈoka ɡˈuloɾ t͡ʃˈa...,2\t27\t136\t26\t79\t5\t136\t26\t105\t66\t109\t...
4,0,16777288,629121,281474981582236,প্রাকৃতিক সব জিনিস গুলো দিয়া যখন চাষ করা হয়,pɾakɾˈitik ʃˈɔb ɟˈiniʃ ɡˈulo dˈia d͡ʒˈɔkʰɔn t͡...,2\t40\t105\t26\t35\t105\t136\t33\t44\t33\t35\t...


## Match audio files to metadata


In [38]:
# Walk `audio_dir` recursively and map every utterance id (the .wav file name
# up to its first ".") to the file's path re-rooted under `symlink`.
file_paths = {}
for root, dirs, files in os.walk(audio_dir):
    # Re-root through the relative path instead of a string replace: a plain
    # replace glues the prefix to the sub-directory when `audio_dir` ends with
    # a separator, and also rewrites later occurrences of the same substring.
    rel_root = os.path.relpath(root, audio_dir)
    root_save = symlink if rel_root == os.curdir else os.path.join(symlink, rel_root)
    for file in files:
        if file.endswith(".wav"):
            uttid = file.split(".")[0]
            file_paths[uttid] = os.path.join(root_save, file)

# Naming the columns up front keeps the frame well-formed (two columns, zero
# rows) when no .wav file was found, instead of failing on column assignment.
files_paths = pd.DataFrame(list(file_paths.items()), columns=["uttid", "path"])
# `uttid` must be int64 to match the dtype of `data["uttid"]` for the merge.
# This raises ValueError if a .wav file name is not purely numeric.
files_paths["uttid"] = files_paths["uttid"].astype("int64")

# Uncomment to display full paths without truncation:
# pd.set_option("display.max_colwidth", None)

files_paths.head()

Unnamed: 0,uttid,path
0,281474978136051,DUMMY3/split2/281474978136051.wav
1,281474979007026,DUMMY3/split2/281474979007026.wav
2,281474980681350,DUMMY3/split2/281474980681350.wav
3,281474980001437,DUMMY3/split2/281474980001437.wav
4,281474978326593,DUMMY3/split2/281474978326593.wav


In [39]:
# Show the column dtypes of both frames side by side; the `uttid` columns must
# share a dtype for the merge on that key to match rows.
types = files_paths.dtypes
print("files_paths types", types, sep="\n")
types = data.dtypes
print("\ndata types", types, sep="\n")

files_paths types
uttid     int64
path     object
dtype: object

data types
spkidx              int64
spkid               int64
txtid               int64
uttid               int64
text               object
phonemized_text    object
dtype: object


In [40]:
# Merge data and file paths
df = pd.merge(data, files_paths, on="uttid", how="left")
df.head()

Unnamed: 0,spkidx,spkid,txtid,uttid,text,phonemized_text,path
0,0,16777288,629046,281474981563595,বাঁশের প্রায় দশ হাজার প্রজাতি হয় যেমন বাম্বুসা ভেন্ট্রিকসা জায়ন্ট ব্যাম্বু ইত্যাদি,bˈãʃeɾ pɾˈajo dˈɔʃ hˈaɟaɾ pɾˈoɟatˌi hˈɔjo d͡ʒˈemɔn bˈambuʃˌa bʰˈenʈɾikˌɔʃa ɟˈajɔnʈˌɔ bˈæmbu ˈitædˌi,DUMMY3/split1/281474981563595.wav
1,0,16777288,629072,281474981405386,এক ধরনের পদ্ধতি যেটা দিয়ে শস্য থেকে খোসা ছাড়ানো হয়,ˈek dʰˈɔɾɔnˌeɾ pˈɔddʰɔtˌi d͡ʒˈeʈa dˈie ʃˈɔssɔ tʰˈeke kʰˈoʃa t͡ʃʰˈar.anˌo hˈɔjo,DUMMY3/split1/281474981405386.wav
2,0,16777288,629108,281474981563619,গুগলি বা ঝিনুকের মানে হল ওয়েস্টার আমরা খাই,ɡˈuɡɔlˌi bˈa ɟʰˈinukˌeɾ mˈane hˈɔl ˈoːeʃʈˌaɾ ˌamɔɾˌa kʰˈai,DUMMY3/split1/281474981563619.wav
3,0,16777288,629112,281474981581650,বাগ আরশোলা ইত্যাদি সব পোকা গুলোর চাষ হয়,bˈaɡ ˈaɾɔʃˌola ˈitædˌi ʃˈɔb pˈoka ɡˈuloɾ t͡ʃˈaʃ hˈɔjo,DUMMY3/split1/281474981581650.wav
4,0,16777288,629121,281474981582236,প্রাকৃতিক সব জিনিস গুলো দিয়া যখন চাষ করা হয়,pɾakɾˈitik ʃˈɔb ɟˈiniʃ ɡˈulo dˈia d͡ʒˈɔkʰɔn t͡ʃˈaʃ kˈɔɾa hˈɔjo,DUMMY3/split1/281474981582236.wav


In [41]:
# Spot-check that each utterance id is paired with the path of its own file.
# Iterating over `.head(10)` prints at most 10 rows and does not fail when the
# frame holds fewer rows than that.
for uttid, path in zip(df["uttid"].head(10), df["path"].head(10)):
    print(uttid, path)

281474981563595 DUMMY3/split1/281474981563595.wav
281474981405386 DUMMY3/split1/281474981405386.wav
281474981563619 DUMMY3/split1/281474981563619.wav
281474981581650 DUMMY3/split1/281474981581650.wav
281474981582236 DUMMY3/split1/281474981582236.wav
281474981406024 DUMMY3/split1/281474981406024.wav
281474981406039 DUMMY3/split1/281474981406039.wav
281474981405337 DUMMY3/split1/281474981405337.wav
281474981405777 DUMMY3/split1/281474981405777.wav
281474981581658 DUMMY3/split1/281474981581658.wav


### Show the first 5 rows with NaNs


In [42]:
# Rows with a missing value in any column (e.g. no audio file was matched).
df.loc[df.isna().any(axis="columns")].head()

Unnamed: 0,spkidx,spkid,txtid,uttid,text,phonemized_text,path


### Save train, val, test filelists


In [43]:
# Keep only the columns written to the filelists: audio path, speaker index
# and the cleaned text.
data_save = df[["path", "spkidx", "cleaned_text"]]

# The left merge leaves `path` as NaN for metadata rows without a matching
# .wav file; such rows must not end up in the filelists.
n_rows_before = len(data_save)
data_save = data_save.dropna()
if len(data_save) < n_rows_before:
    print(f"Dropped {n_rows_before - len(data_save)} rows with missing values")

# Fail early instead of silently writing an empty training filelist.
if n_val + n_test >= len(data_save):
    raise ValueError(
        f"n_val + n_test = {n_val + n_test} leaves no training data "
        f"(only {len(data_save)} usable rows)"
    )

# Shuffle with a fixed seed so that re-running the notebook reproduces the
# same train/val/test split.
split_seed = 42
data_save = data_save.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# First `n_val` rows -> validation, next `n_test` rows -> test, rest -> train.
data_val = data_save.iloc[:n_val]
data_test = data_save.iloc[n_val : n_val + n_test]
data_train = data_save.iloc[n_val + n_test :]

# One "path|spkidx|cleaned_text" line per utterance, without header or index.
data_train.to_csv("../filelists/train.txt", sep="|", index=False, header=False)
data_val.to_csv("../filelists/val.txt", sep="|", index=False, header=False)
data_test.to_csv("../filelists/test.txt", sep="|", index=False, header=False)