# Create a dataset


In [1]:
# Root directory of the RESPIN challenge data and the corpus (language) to use.
dir_data = "/path/to/RESPIN_ASRU_Challenge_2023"
corpus = "bn"

# Transcript files of the train and dev splits, in that order.
origin_file_path = [
    f"{dir_data}/corpus/{corpus}/{split}/text" for split in ("train", "dev")
]

# Hyperparameter file that is loaded further down.
config = "../config.yaml"

In [2]:
import csv

import pandas as pd

from text.symbols import UNK_ID
from text import tokenizer, detokenizer
from utils.hparams import get_hparams_from_file

## Get hyperparameters from config file


In [3]:
# Load the hyperparameters from the config file path set in the first cell.
hps = get_hparams_from_file(config)
# Names of the text cleaners, in the order they are listed in the config.
text_cleaners = hps.data.text_cleaners
print(text_cleaners)

['phonemize_text', 'tokenize_text', 'add_bos_eos']


### Separate text cleaners by `phonemize_text` flag

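Used for faster text processing: `phonemize_text` is applied to the whole list of texts in one call, while the remaining cleaners are applied to one text at a time.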


In [4]:
def separate_text_cleaners(text_cleaners):
    """Group cleaner names so that every `phonemize_text` stands alone.

    Consecutive cleaners other than "phonemize_text" are collected into one
    group; each "phonemize_text" entry becomes its own single-element group.
    The relative order of the cleaners is preserved.

    Args:
        text_cleaners: iterable of cleaner names.

    Returns:
        A list of lists of cleaner names, e.g.
        ["a", "phonemize_text", "b", "c"] ->
        [["a"], ["phonemize_text"], ["b", "c"]].
    """
    phonemize_group = ["phonemize_text"]
    groups = []

    for name in text_cleaners:
        # Start a new group for every phonemize step, for the very first
        # cleaner, and for the first cleaner that follows a phonemize step.
        starts_new_group = (
            name == "phonemize_text"
            or not groups
            or groups[-1] == phonemize_group
        )
        if starts_new_group:
            groups.append([name])
        else:
            groups[-1].append(name)

    return groups


# Group the cleaners straight from the config list instead of from
# `text_cleaners` itself: re-running this cell would otherwise group the
# already-grouped list again and produce a wrongly nested result.
text_cleaners = separate_text_cleaners(hps.data.text_cleaners)
print(text_cleaners)

[['phonemize_text'], ['tokenize_text', 'add_bos_eos']]


## Read and combine data from all files

The data looks like this:

`sid` _ `txtid` _ `uttid` `text`

`16777288` _ `629046` _ `281474981563595` `"বাঁশের প্রায়"`


In [5]:
# Read every transcript file and stack them into one frame.
# With a tab separator each line ends up whole in a single column (0), as
# long as the line itself contains no tab; it is split into proper columns
# in a later cell.
# quoting=csv.QUOTE_NONE makes pandas treat quote characters inside the
# transcripts as ordinary text rather than as CSV field quoting.
frames = [
    pd.read_csv(path, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    for path in origin_file_path
]
data = pd.concat(frames, ignore_index=True)
print("Number of lines:", len(data))
data.head()

Number of lines: 581236


Unnamed: 0,0
0,16777288_629046_281474981563595 বাঁশের প্রায় দ...
1,16777288_629072_281474981405386 এক ধরনের পদ্ধত...
2,16777288_629108_281474981563619 গুগলি বা ঝিনুক...
3,16777288_629112_281474981581650 বাগ আরশোলা ইত্...
4,16777288_629121_281474981582236 প্রাকৃতিক সব জ...


### Convert the data to the format `sid` `txtid` `uttid` `text`, with each value in its own column


In [6]:
# Split off the first two "_"-separated fields of each raw line; the third
# part keeps everything after the second underscore.
id_parts = data[0].str.split("_", n=2, expand=True)

# Split that remainder on the first space only, so the text keeps its spaces.
utt_and_text = id_parts[2].str.split(" ", n=1, expand=True)

data = pd.concat([id_parts[0], id_parts[1], utt_and_text], axis=1)
data.columns = ["spkid", "txtid", "uttid", "text"]
data.head()

Unnamed: 0,spkid,txtid,uttid,text
0,16777288,629046,281474981563595,বাঁশের প্রায় দশ হাজার প্রজাতি হয় যেমন বাম্বুসা...
1,16777288,629072,281474981405386,এক ধরনের পদ্ধতি যেটা দিয়ে শস্য থেকে খোসা ছাড়ান...
2,16777288,629108,281474981563619,গুগলি বা ঝিনুকের মানে হল ওয়েস্টার আমরা খাই
3,16777288,629112,281474981581650,বাগ আরশোলা ইত্যাদি সব পোকা গুলোর চাষ হয়
4,16777288,629121,281474981582236,প্রাকৃতিক সব জিনিস গুলো দিয়া যখন চাষ করা হয়


### Map speaker IDs to indices in sorted order


In [7]:
# Give every distinct speaker id an index according to its position in the
# sorted list of ids.
speaker_ids = sorted(data["spkid"].unique())
spkid_to_idx = {spkid: idx for idx, spkid in enumerate(speaker_ids)}

# One-column frame holding the speaker index of every row.
spkidx = data["spkid"].map(spkid_to_idx).rename("spkidx").to_frame()

# Put the index column first. Any `spkidx` column left over from a previous
# run of this cell is dropped first, so re-running does not duplicate it.
data = pd.concat(
    [spkidx, data.drop(columns="spkidx", errors="ignore")], axis=1)
data.head()

Unnamed: 0,spkidx,spkid,txtid,uttid,text
0,0,16777288,629046,281474981563595,বাঁশের প্রায় দশ হাজার প্রজাতি হয় যেমন বাম্বুসা...
1,0,16777288,629072,281474981405386,এক ধরনের পদ্ধতি যেটা দিয়ে শস্য থেকে খোসা ছাড়ান...
2,0,16777288,629108,281474981563619,গুগলি বা ঝিনুকের মানে হল ওয়েস্টার আমরা খাই
3,0,16777288,629112,281474981581650,বাগ আরশোলা ইত্যাদি সব পোকা গুলোর চাষ হয়
4,0,16777288,629121,281474981582236,প্রাকৃতিক সব জিনিস গুলো দিয়া যখন চাষ করা হয়


In [8]:
# Summary of the combined dataset: distinct speaker indices and total rows.
num_speakers = len(data["spkidx"].unique())
num_lines = len(data)
print("Number of speakers:", num_speakers)
print("Number of lines:", num_lines)

Number of speakers: 2011
Number of lines: 581236


## Convert the text to tokens

This may take a while, so it is better to preprocess the text once and save the result to a file in advance.

**Note:** `phonemize_text` takes the longest time.


In [9]:
# Run the cleaner groups in order over all texts.
text_norm = data["text"].tolist()

for cleaners in text_cleaners:
    if "phonemize_text" not in cleaners:
        # Remaining cleaners are applied to one text at a time, replacing
        # each entry in place with the tokenizer's result.
        for idx, text in enumerate(text_norm):
            token_ids = tokenizer(text, cleaners, hps.data.language)
            # Stop as soon as a text produces the unknown-symbol id.
            assert UNK_ID not in token_ids, f"Found unknown symbol:\n{text}\n{detokenizer(token_ids)}"
            text_norm[idx] = token_ids
    else:
        # The phonemize group receives the whole list in a single call.
        text_norm = tokenizer(text_norm, cleaners, hps.data.language)
    print(f"Finished tokenizing with {cleaners}")

# Store each result as one tab-separated string in a new column.
text_norm = ["\t".join(str(token) for token in tokens) for tokens in text_norm]
data = data.assign(cleaned_text=text_norm)
data.head()

## Save the data to .csv file


In [None]:
# Write the table as a pipe-separated file without the index column.
# Tip: for a quick trial run, truncate first, e.g. `data = data.iloc[:100]`.
metadata_path = "../filelists/metadata.csv"
data.to_csv(metadata_path, sep="|", index=False)