# Create a dataset


In [None]:
import pandas as pd

# Name of the dataset; also used as the stem of the output file list.
dataset_name = "madasr23"

# Transcript files (train and dev splits) that are merged into one dataset.
train_text_path = "/Users/daniilrobnikov/Developer/datasets/madasr23/corpus/bn/train/text"
dev_text_path = "/Users/daniilrobnikov/Developer/datasets/madasr23/corpus/bn/dev/text"
origin_file_path = [train_text_path, dev_text_path]

In [2]:
# Read and combine data from all files
import csv

# sep="\t" is used so that each whole line lands in a single column (column 0),
# which is split into separate fields in a later step.
# quoting=csv.QUOTE_NONE makes the parser treat double quotes in the
# transcripts as ordinary characters; with the default quoting an unbalanced
# quote would cause following lines to be merged into one field.
frames = [
    pd.read_csv(path, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    for path in origin_file_path
]
data = pd.concat(frames, ignore_index=True)
print("Number of lines:", len(data))
print(data.head())

# The data looks like this:
# sid_txtid_uttid text
# 16777288_629046_281474981563595 বাঁশের প্রায়

Number of lines: 581236
                                                   0
0  16777288_629046_281474981563595 বাঁশের প্রায় দ...
1  16777288_629072_281474981405386 এক ধরনের পদ্ধত...
2  16777288_629108_281474981563619 গুগলি বা ঝিনুক...
3  16777288_629112_281474981581650 বাগ আরশোলা ইত্...
4  16777288_629121_281474981582236 প্রাকৃতিক সব জ...


In [3]:
# Break each raw line into four columns: spkid, txtid, uttid and text.
# First peel off the two leading "_"-separated ids; the remainder still
# holds "<uttid> <text>".
id_fields = data[0].str.split("_", n=2, expand=True)

# Split the remainder on the first space only, so the text keeps its spaces.
utt_and_text = id_fields[2].str.split(" ", n=1, expand=True)

data = pd.concat([id_fields[0], id_fields[1], utt_and_text], axis=1)
data.columns = ["spkid", "txtid", "uttid", "text"]
print(data.head())

      spkid   txtid            uttid  \
0  16777288  629046  281474981563595   
1  16777288  629072  281474981405386   
2  16777288  629108  281474981563619   
3  16777288  629112  281474981581650   
4  16777288  629121  281474981582236   

                                                text  
0  বাঁশের প্রায় দশ হাজার প্রজাতি হয় যেমন বাম্বুসা...  
1  এক ধরনের পদ্ধতি যেটা দিয়ে শস্য থেকে খোসা ছাড়ান...  
2         গুগলি বা ঝিনুকের মানে হল ওয়েস্টার আমরা খাই  
3            বাগ আরশোলা ইত্যাদি সব পোকা গুলোর চাষ হয়  
4        প্রাকৃতিক সব জিনিস গুলো দিয়া যখন চাষ করা হয়  


### Map speaker IDs to indices in sorted order


In [4]:
# Assign every speaker id a contiguous integer index, ordered by sorted id.
# The ids are strings at this point, so the sort is lexicographic.
spkid_to_idx = {
    spkid: idx for idx, spkid in enumerate(sorted(data["spkid"].unique()))
}

# Build the index column under the name "spkidx" (it holds speaker indices,
# not phonemes) and prepend it to the existing columns.
spkidx = data["spkid"].map(spkid_to_idx).to_frame(name="spkidx")
data = pd.concat([spkidx, data], axis=1)
print(data.head())

{'16777288': 0, '16777289': 1, '16777290': 2, '16777291': 3, '16777292': 4, '16777527': 5, '16777555': 6, '16777564': 7, '16777584': 8, '16777597': 9, '16777601': 10, '16777713': 11, '16777717': 12, '16777718': 13, '16777721': 14, '16777722': 15, '16777723': 16, '16777724': 17, '16777726': 18, '16777729': 19, '16777731': 20, '16777733': 21, '16777734': 22, '16777735': 23, '16777737': 24, '16777738': 25, '16777746': 26, '16777747': 27, '16777748': 28, '16777751': 29, '16777756': 30, '16777758': 31, '16777763': 32, '16777765': 33, '16777778': 34, '16777782': 35, '16777785': 36, '16777786': 37, '16777788': 38, '16777796': 39, '16777803': 40, '16777807': 41, '16777814': 42, '16777819': 43, '16777822': 44, '16777828': 45, '16777829': 46, '16777830': 47, '16777831': 48, '16777832': 49, '16777833': 50, '16777835': 51, '16777837': 52, '16777838': 53, '16777839': 54, '16777840': 55, '16777841': 56, '16777842': 57, '16777843': 58, '16777844': 59, '16777845': 60, '16777851': 61, '16777853': 62, '

In [None]:
# Number of speakers
# Count distinct speaker ids directly from the "spkid" column; every id maps
# to exactly one index, so this equals the number of speaker indices.
print("Number of speakers:", len(data["spkid"].unique()))

### Save the data to a tab-separated .csv file


In [6]:
# Write the assembled table as a tab-separated file named after the dataset.
output_path = f"../filelists/{dataset_name}.csv"
data.to_csv(output_path, sep="\t")