# Prepare filelists for LJSpeech dataset


In [None]:
# Notebook configuration. All cells below read these values.
# See: https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md
language = "en-us"  # language code handed to the phonemizer's espeak backend
dir_data = "/Users/daniilrobnikov/Developer/datasets/LJSpeech-1.1"  # dataset root; metadata.csv is read from here
symlink = "DUMMY1"  # name of the symlink created for dir_data; also the prefix of every audio path in the filelists
n_val = 100  # number of shuffled rows written to the validation filelist
n_test = 500  # number of shuffled rows written to the test filelist

In [None]:
import argparse
import csv
import logging
import os
import sys

import pandas as pd
from phonemizer import phonemize

### Read dataset

Here, the `normalized_text` column holds the transcript with numbers spelled out as words.

**Note**: you may need to replace all `"|"` with `" | "` in the file `metadata.csv` if you are using Windows.


In [None]:
# Load metadata.csv into a DataFrame with one row per utterance.
#
# The file is "|"-separated with no header row. The fourth column name,
# "phonemized_text", is declared here so the frame already has a slot for
# the phonemes computed in a later cell.
data = pd.read_csv(
    f"{dir_data}/metadata.csv",
    sep="|",
    header=None,
    names=["file", "text", "normalized_text", "phonemized_text"],
    index_col=False,
    # Transcripts contain literal double quotes. With pandas' default
    # quoting, a field that *starts* with '"' is parsed as a quoted field,
    # so an unbalanced quote can swallow the following rows. QUOTE_NONE
    # treats '"' as an ordinary character and keeps one row per line.
    quoting=csv.QUOTE_NONE,
    converters={
        # Turn the bare utterance id into "<symlink>/<id>.wav".
        "file": lambda x: f"{symlink}/{x.strip()}.wav",
        # Strip padding around the text fields (e.g. when " | " is used
        # as the separator in the file).
        "text": str.strip,
        "normalized_text": str.strip,
    },
)
data.head()

### Convert the text to phonemes

This may take a while: for example, phonemizing 500,000 lines of text takes about 30 minutes.


In [None]:
# Run the phonemizer over every normalized transcript and store the result
# in the "phonemized_text" column.
phonemize_kwargs = dict(
    backend="espeak",
    language=language,
    strip=True,
    preserve_punctuation=True,
    with_stress=True,
    tie=True,
    njobs=8,
)
phonemes = phonemize(data["normalized_text"], **phonemize_kwargs)

# `assign` returns a new frame; rebind `data` to it.
data = data.assign(phonemized_text=phonemes)
data.head()

## Save train, val, test filelists


In [None]:
# Keep only the two columns that make up a filelist line: "<wav path>|<phonemes>".
data = data[["file", "phonemized_text"]]
# Shuffle every row so the validation and test subsets are random samples.
data = data.sample(frac=1).reset_index(drop=True)

# After shuffling: the first n_val rows are validation, the next n_test rows
# are test, and everything after that is training data.
data_val = data.iloc[:n_val]
data_test = data.iloc[n_val:n_val + n_test]
data_train = data.iloc[n_val + n_test:]

# Make sure the output directory exists; to_csv does not create it.
filelist_dir = "../filelists"
os.makedirs(filelist_dir, exist_ok=True)

for split_name, split in (("train", data_train), ("val", data_val), ("test", data_test)):
    split.to_csv(
        f"{filelist_dir}/{split_name}.txt",
        sep="|",
        index=False,
        header=False,
        # Write fields verbatim. With the default quoting, any phoneme string
        # containing '"' would be wrapped in quotes with the inner quotes
        # doubled, which corrupts the line for readers that split on "|".
        quoting=csv.QUOTE_NONE,
    )

## Create symlink to the original data


In [None]:
# Locate the project root: the first sys.path entry ending in "NaturalSpeech".
root_dir = next((p for p in sys.path if p.endswith("NaturalSpeech")), None)
if root_dir is None:
    raise RuntimeError(
        "Could not find the project root: no sys.path entry ends with 'NaturalSpeech'"
    )
os.chdir(root_dir)

# Create <root_dir>/<symlink> -> dir_data so that the "<symlink>/<id>.wav"
# paths in the filelists resolve to the real audio files.
# os.symlink is used instead of `!ln -s` because the shell command breaks on
# paths containing spaces and, when the link already exists, creates a second
# link *inside* the dataset directory on every re-run.
if os.path.lexists(symlink):
    print(f"'{symlink}' already exists in {root_dir}; skipping symlink creation")
else:
    os.symlink(dir_data, symlink, target_is_directory=True)