# Prepare filelists for LJSpeech dataset


In [1]:
# Notebook settings; edit these before running the cells below.
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
# NOTE(review): the link above lists espeak-ng language identifiers; presumably it is the
# reference for the `language` value set in the config file -- confirm.
dir_data = "/path/to/LJSpeech-1.1"  # directory that contains the metadata CSV read below
config = "../config.yaml"  # hyperparameter file passed to get_hparams_from_file
symlink = "DUMMY1"  # prefix prepended to every audio file name in the filelists
n_val = 100  # number of rows written to the validation filelist
n_test = 500  # number of rows written to the test filelist

In [2]:
import csv

import pandas as pd

from text.symbols import UNK_ID
from text import tokenizer, detokenizer
from utils.hparams import get_hparams_from_file

## Get hyperparameters from config file


In [3]:
# Load the hyperparameters from the file named by `config`.
hps = get_hparams_from_file(config)
# Cleaner names as configured under `data.text_cleaners`.
text_cleaners = hps.data.text_cleaners
print(text_cleaners)

['phonemize_text', 'tokenize_text', 'add_bos_eos']


### Separate text cleaners by `phonemize_text` flag

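Used for faster text processing: `phonemize_text` is applied to the whole list of texts in a single call, while the remaining cleaners are applied one text at a time.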


In [4]:
def separate_text_cleaners(text_cleaners):
    """Group cleaner names so that every ``"phonemize_text"`` stands alone.

    Args:
        text_cleaners: iterable of cleaner names, in the order they are applied.

    Returns:
        A list of lists of cleaner names. Each ``"phonemize_text"`` entry forms
        its own single-element group; every maximal run of other cleaners
        between such entries forms one group. The original order is kept, and
        an empty input gives an empty list.
    """
    groups = []
    # True while the last group is a run of ordinary cleaners that can still grow.
    run_open = False

    for name in text_cleaners:
        is_phonemizer = name == "phonemize_text"
        if run_open and not is_phonemizer:
            groups[-1].append(name)
            continue
        # Either a phonemizer (always isolated) or the first cleaner of a new run.
        groups.append([name])
        run_open = not is_phonemizer

    return groups


# Always start from the configured flat list instead of from `text_cleaners` itself:
# feeding the already separated (nested) list back in on a cell re-run would wrap it
# in one more level of nesting and break the tokenization step.
text_cleaners = separate_text_cleaners(hps.data.text_cleaners)
print(text_cleaners)

[['phonemize_text'], ['tokenize_text', 'add_bos_eos']]


## Read dataset

Here `normalized_text` contains numbers in the form of words.

**Note**: the cell below reads `metadata_copy.csv`, a copy of `metadata.csv`; you may need to replace all `"|"` with `" | "` in that copy.


In [5]:
# Read the pipe-separated metadata file into a DataFrame.
# The file has no header row; the fourth column name, `cleaned_text`, has no matching
# field in the file, so it starts out empty and is filled in by the tokenization step.
data = pd.read_csv(
    f"{dir_data}/metadata_copy.csv",
    sep=r"|",
    header=None,
    names=["file", "text", "normalized_text", "cleaned_text"],
    index_col=False,
    # Transcripts may contain literal double quotes. With the default quoting rules a
    # field that starts with `"` is parsed as a quoted field and can swallow the
    # following separators and lines; QUOTE_NONE keeps every `"` as plain text.
    quoting=csv.QUOTE_NONE,
    # Prefix the file name with `symlink` and add the .wav extension; strip the
    # whitespace around the text fields.
    converters={"file": lambda x: f"{symlink}/{x.strip()}.wav", "text": str.strip, "normalized_text": str.strip},
)
data.head()

Unnamed: 0,file,text,normalized_text,cleaned_text
0,DUMMY1/LJ001-0001.wav,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ...",
1,DUMMY1/LJ001-0002.wav,in being comparatively modern.,in being comparatively modern.,
2,DUMMY1/LJ001-0003.wav,For although the Chinese took impressions from...,For although the Chinese took impressions from...,
3,DUMMY1/LJ001-0004.wav,"produced the block books, which were the immed...","produced the block books, which were the immed...",
4,DUMMY1/LJ001-0005.wav,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...,


## Convert the text to tokens

This may take a while, so it is better to preprocess the text and save it to a file in advance.

**Note**: `phonemize_text` takes the longest time.


In [6]:
# Run the cleaner groups in order over the normalized transcripts.
text_norm = data["normalized_text"].tolist()
for cleaners in text_cleaners:
    if "phonemize_text" not in cleaners:
        # Ordinary cleaners: process one text at a time and replace it in place.
        for idx in range(len(text_norm)):
            text = text_norm[idx]
            temp = tokenizer(text, cleaners, hps.data.language)
            # Stop at the first text whose result contains the unknown-symbol id.
            assert UNK_ID not in temp, f"Found unknown symbol:\n{text}\n{detokenizer(temp)}"
            text_norm[idx] = temp
    else:
        # The phonemizer group receives the whole list in a single call.
        text_norm = tokenizer(text_norm, cleaners, hps.data.language)
    print(f"Finished tokenizing with {cleaners}")

# Serialize every token sequence as a tab-separated string and store it in `cleaned_text`.
text_norm = ["\t".join(str(token) for token in text) for text in text_norm]
data = data.assign(cleaned_text=text_norm)
data.head()

Finished tokenizing with ['phonemize_text']
Finished tokenizing with ['tokenize_text', 'add_bos_eos']


Unnamed: 0,file,text,normalized_text,cleaned_text
0,DUMMY1/LJ001-0001.wav,"Printing, in the only sense with which we are ...","Printing, in the only sense with which we are ...",2\t40\t101\t136\t87\t38\t44\t87\t56\t8\t5\t136...
1,DUMMY1/LJ001-0002.wav,in being comparatively modern.,in being comparatively modern.,2\t136\t87\t38\t5\t27\t136\t33\t138\t87\t56\t5...
2,DUMMY1/LJ001-0003.wav,For although the Chinese took impressions from...,For although the Chinese took impressions from...,2\t31\t66\t138\t179\t101\t5\t66\t138\t36\t53\t...
3,DUMMY1/LJ001-0004.wav,"produced the block books, which were the immed...","produced the block books, which were the immed...",2\t40\t101\t71\t29\t136\t45\t138\t43\t44\t5\t5...
4,DUMMY1/LJ001-0005.wav,the invention of movable metal letters in the ...,the invention of movable metal letters in the ...,2\t53\t71\t5\t87\t38\t46\t136\t73\t38\t109\t71...


## Save train, val, test filelists


In [7]:
# Fixed seed so that the shuffle, and therefore the train/val/test split, is the same
# on every run of the notebook.
split_seed = 1234

# Only the audio path and the tokenized text are written to the filelists.
data = data[["file", "cleaned_text"]]
# Without enough rows the slices below would overlap short or leave the train set empty.
assert len(data) > n_val + n_test, (
    f"Need more than {n_val + n_test} rows for the requested val/test sizes, got {len(data)}"
)
data = data.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# After shuffling: first `n_val` rows -> val, next `n_test` rows -> test, the rest -> train.
data_train = data.iloc[n_val + n_test:]
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val: n_val + n_test]

# Pipe-separated, no index column and no header row.
data_train.to_csv("../filelists/train.txt", sep="|", index=False, header=False)
data_val.to_csv("../filelists/val.txt", sep="|", index=False, header=False)
data_test.to_csv("../filelists/test.txt", sep="|", index=False, header=False)