# Split filelist file into train and test sets

Specify the split either as a train ratio or as a fixed number of samples to hold out for the test set.


In [1]:
# Move up one directory so that the relative paths used below
# (e.g. "filelists/...") resolve, then list the contents to confirm the location.
# NOTE: `%cd ..` is relative to the current directory, so re-running this cell
# without restarting the kernel moves up one more level each time.
%cd ..
%ls

/Users/daniilrobnikov/Developer/TTS/vits-bengali
LICENSE                         [34mpreprocess[m[m/
README.md                       preprocess.py
attentions.py                   requirements.txt
batch_inference.ipynb           [34mresources[m[m/
commons.py                      test-env.md
[34mconfigs[m[m/                        test-gpu_monotonic_align.ipynb
data_utils.py                   test-madasr23-links.txt
[34mfilelists[m[m/                      test-todo.txt
inference.ipynb                 test_torchaudio.ipynb
losses.py                       [34mtext[m[m/
mel_processing.py               train.py
models.py                       train_ms.py
modules.py                      transforms.py
monotonic_align.py              utils.py


In [2]:
# Load the data from the csv file
import os
import random

import numpy as np
import pandas as pd

# Seed every RNG the notebook relies on. `DataFrame.sample` draws from NumPy's
# global random state when no `random_state` is passed, so seeding only the
# stdlib `random` module would leave the train/val shuffle non-reproducible.
random.seed(42)
np.random.seed(42)

# Name of the dataset; used as the prefix of every file under "filelists/".
dataset_name = "madasr23"
data = pd.read_csv(f"filelists/{dataset_name}.csv")

In [3]:
# Support for DataFrames
def split_file_list(orig_data, train_ratio=None, test_samples=None, max_samples=None,
                    random_state=None):
    """Shuffle a DataFrame and split it into a train part and a test part.

    Args:
        orig_data: DataFrame to split. It is not modified; a shuffled copy
            with a fresh 0..n-1 index is split instead.
        train_ratio: Fraction of the (optionally truncated) rows assigned to
            the train set, between 0 and 1. Only used when `test_samples`
            is None.
        test_samples: Number of rows assigned to the test set. Takes
            precedence over `train_ratio` when both are given.
        max_samples: If given, only the first `max_samples` shuffled rows
            are kept before splitting.
        random_state: Forwarded to `DataFrame.sample`; pass an int to get
            the same shuffle on every call. The default (None) keeps the
            previous behavior of using the ambient random state.

    Returns:
        Tuple `(train_set, test_set)`: the leading rows of the shuffled data
        and the remaining trailing rows.

    Raises:
        ValueError: If neither `train_ratio` nor `test_samples` is given,
            if `test_samples` is negative, or if `train_ratio` is outside
            the range [0, 1].
    """
    if test_samples is None and train_ratio is None:
        raise ValueError("Either 'train_ratio' or 'test_samples' should be provided.")

    # Shuffle the data
    data = orig_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if max_samples is not None:
        data = data.iloc[:max_samples]

    if test_samples is not None:
        if test_samples < 0:
            raise ValueError("'test_samples' must be non-negative.")
        # Compute the split position from the front. Slicing with
        # `-test_samples` breaks for 0: `data[:-0]` is empty and `data[-0:]`
        # is the whole frame, i.e. the exact opposite of what was asked.
        train_set_size = max(len(data) - test_samples, 0)
    else:
        if not 0 <= train_ratio <= 1:
            raise ValueError("'train_ratio' must be between 0 and 1.")
        train_set_size = int(len(data) * train_ratio)

    train_set = data.iloc[:train_set_size]
    test_set = data.iloc[train_set_size:]
    return train_set, test_set


# Hold out the last 1240 shuffled rows as the validation set;
# all remaining rows form the training set.
train_data, val_data = split_file_list(data, test_samples=1240)

### Map speaker ids to speaker indexes


In [3]:
# Number the distinct speaker IDs with consecutive indices and keep
# lookup tables for both directions.
sids = data["spkid"].unique()
idx2sid = dict(enumerate(sids))
sid2idx = {sid: index for index, sid in idx2sid.items()}

In [4]:
# Save speaker id to speaker index mapping to .csv file.
# With orient="index" each dictionary key (speaker id) becomes a row label
# and its speaker index becomes the value in that row.
sid2idx_df = pd.DataFrame.from_dict(sid2idx, orient="index")
sid2idx_df.to_csv(f"filelists/{dataset_name}_sid2idx.csv")

## Save phonemes and text of train_data, val_data


In [None]:
# Closest common parent directory of the .wav files,
# e.g. /Users/usr/datasets/madasr23/bn
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the notebook elsewhere.
source_dir = "/gpfs/mariana/home/darobn/datasets/madasr23/bn"
# Output filelists for the train and validation splits
train_file_path = f"filelists/{dataset_name}_audio_sid_text_train_filelist.txt"
val_file_path = f"filelists/{dataset_name}_audio_sid_text_val_filelist.txt"
# Name substituted for `source_dir` in the written audio paths; the last cell
# of the notebook creates a symlink with this name that points at `source_dir`.
link_name = "DUMMY3"

In [None]:
def create_path_map(source_dir):
    """Index every ".wav" file below `source_dir` by its file name.

    Args:
        source_dir: Directory that is walked recursively.

    Returns:
        Dict mapping a file name (e.g. "utt1.wav") to the path of that file
        as produced by joining the walked directory with the name. When the
        same file name occurs in several directories, the one visited last
        wins.
    """
    return {
        name: os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(source_dir)
        for name in names
        if name.endswith(".wav")
    }


def save_file_list(data, out_file_path, source_dir, path_map, link_name, cleaned_text=False,
                   sid_map=None):
    """Write a "path|speaker_index|text" filelist, one line per row of `data`.

    Args:
        data: DataFrame with the columns `uttid`, `spkid`, and `text`
            (or `phonemes` when `cleaned_text` is True).
        out_file_path: Destination file. When `cleaned_text` is True the
            suffix ".cleaned" is appended to it.
        source_dir: Directory the paths in `path_map` live under; this
            leading part of each path is replaced by `link_name`.
        path_map: Dict mapping "<uttid>.wav" to the location of that file.
        link_name: Prefix written in place of `source_dir`.
        cleaned_text: Write the `phonemes` column instead of `text`.
        sid_map: Dict mapping a speaker id to its index. Defaults to the
            notebook-level `sid2idx` mapping.

    Raises:
        KeyError: If an utterance has no entry in `path_map` or a speaker
            id has no entry in the speaker mapping.
    """
    if sid_map is None:
        # Fall back to the mapping defined at notebook level.
        sid_map = sid2idx

    if cleaned_text:
        # Append the suffix rather than str.replace(".txt", ...): replace is a
        # no-op for paths without ".txt" (the cleaned list would then overwrite
        # the raw one) and it rewrites every ".txt" occurring in the path.
        out_file_path = f"{out_file_path}.cleaned"

    # The text column may contain non-ASCII characters; pin UTF-8 so the
    # output does not depend on the platform's default encoding.
    with open(out_file_path, "w", encoding="utf-8") as file:
        for row in data.itertuples():
            uttid = f"{row.uttid}.wav"
            # Re-root the path under `link_name`. Unlike str.replace, this only
            # touches the leading directory and works whether or not
            # `source_dir` ends with a path separator.
            rel_path = os.path.relpath(path_map[uttid], source_dir)
            path = os.path.join(link_name, rel_path)
            sid = sid_map[row.spkid]
            info = row.phonemes if cleaned_text else row.text

            file.write(f"{path}|{sid}|{info}\n")
            # Print every nth sample
            if row.Index % 2000 == 0:
                print(f"{row.Index}: {path}|{sid}|{info}")

    print(f"Saved to '{out_file_path}' ({len(data)} samples).")

In [5]:
path_map = create_path_map(source_dir)

# Write the raw-text filelists first, then the phoneme (".cleaned") ones;
# within each pass the train split is written before the validation split.
for use_cleaned in (False, True):
    for split_df, split_path in ((train_data, train_file_path), (val_data, val_file_path)):
        save_file_list(split_df, split_path, source_dir, path_map, link_name,
                       cleaned_text=use_cleaned)

Saved to 'filelists/madasr23dataset_audio_sid_text_train_filelist.txt' (579996 samples).
Saved to 'filelists/madasr23dataset_audio_sid_text_dev_filelist.txt' (1240 samples).
Saved to 'filelists/madasr23dataset_audio_sid_text_train_filelist.txt.cleaned' (579996 samples).
Saved to 'filelists/madasr23dataset_audio_sid_text_dev_filelist.txt.cleaned' (1240 samples).


### Create a symlink to the dataset


In [None]:
# Create (or refresh) the symlink `link_name` -> `source_dir`.
# Done in Python instead of `!ln -s` so that paths containing spaces work and
# so that re-running the cell is safe: with an existing link to a directory,
# `ln -s` would create a second link *inside* the dataset directory.
if os.path.islink(link_name):
    os.remove(link_name)
# Raises FileExistsError if `link_name` is a real file or directory, which is
# deliberately left untouched.
os.symlink(source_dir, link_name)