In [9]:
import os
import shutil
from multiprocessing import Pool

import librosa
import pandas as pd
import soundfile as sf

from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Full language name; selects the per-language sub-folder of the raw corpus.
language = 'Tamil'
# Short language code; selects the sub-folder of the converted dataset.
lang = 'ta'

# IndicTTS in LJSpeech format

In [None]:
# Raw corpus folder for the selected language (holds the male/ and female/ speaker folders read below).
data_dir = f"/nlsasfs/home/ai4bharat/manidl/ttsteam/datasets/Indic TTS Data/TTS_data_Phase2_to_be_copied/{language}"
# Output folder for the converted dataset (wavs/, metadata.csv and the resampled audio are written here).
data_dir_new = f"/nlsasfs/home/ai4bharat/manidl/ttsteam/datasets/indictts/{lang}"

In [None]:
# Create the output folder. exist_ok=True keeps the cell re-runnable: without it a
# second execution raises FileExistsError.
os.makedirs(data_dir_new, exist_ok=True)

In [None]:
# Copy the male speaker's recordings into the shared wavs/ folder.
# dirs_exist_ok=True (Python 3.8+) matches the female copy and lets this cell be
# re-run without failing because wavs/ already exists.
shutil.copytree(f'{data_dir}/male/mono/wav', f'{data_dir_new}/wavs', dirs_exist_ok=True)

In [None]:
# Merge the female speaker's recordings into the same wavs/ folder.
# dirs_exist_ok=True is needed because the male copy has already created that folder.
shutil.copytree(f'{data_dir}/female/mono/wav', f'{data_dir_new}/wavs', dirs_exist_ok=True)

In [None]:
# Parse the male speaker's transcript list into three columns:
# 0 = utterance id, 1 = text, 2 = speaker label.
metadata_male_fp = f"{data_dir}/male/mono/txt.done.data"
male_raw = pd.read_csv(metadata_male_fp, sep='"', usecols=[0, 1], header=None)
metadata_male = pd.DataFrame({
    # Drop the first two characters of the first field, then trim whitespace.
    0: male_raw[0].str[2:].str.strip(),
    1: male_raw[1].str.strip(),
    2: 'male',
})
print(metadata_male.shape)
metadata_male.head()

In [None]:
# Parse the female speaker's transcript list into three columns:
# 0 = utterance id, 1 = text, 2 = speaker label.
metadata_female_fp = f"{data_dir}/female/mono/txt.done.data"
female_raw = pd.read_csv(metadata_female_fp, sep='"', usecols=[0, 1], header=None)
metadata_female = pd.DataFrame({
    # Drop the first two characters of the first field, then trim whitespace.
    0: female_raw[0].str[2:].str.strip(),
    1: female_raw[1].str.strip(),
    2: 'female',
})
print(metadata_female.shape)
metadata_female.head()

In [None]:
# Stack both speakers into a single table with a fresh 0..n-1 row index.
metadata = pd.concat([metadata_male, metadata_female], ignore_index=True)
metadata

In [None]:
# Write the combined table as pipe-separated rows with no header line and no index column.
metadata.to_csv(f'{data_dir_new}/metadata.csv', sep='|', index=False, header=False)

## Resampling

In [None]:
# Folder for the resampled audio. exist_ok=True keeps the cell re-runnable instead of
# raising FileExistsError on a second execution.
os.makedirs(f'{data_dir_new}/wavs-20k', exist_ok=True)

In [None]:
def resample_file(func_args):
    """Load one audio file at the requested sample rate and write it to a new path.

    Args:
        func_args: 3-tuple ``(fp_src, fp_dst, output_sr)`` packed into a single
            argument so the function can be mapped over a list of jobs.
            ``fp_src`` is the file to read, ``fp_dst`` the file to write and
            ``output_sr`` the sample rate passed to ``librosa.load``.

    Returns:
        None. The result is the file written at ``fp_dst``.
    """
    src_path, dst_path, target_sr = func_args
    samples, rate = librosa.load(src_path, sr=target_sr)
    # Write with the rate librosa reports for the loaded samples.
    sf.write(dst_path, samples, rate)

In [None]:
# List the folder once so every source path is paired with the destination built from
# the same file name (os.listdir returns entries in arbitrary order); sorting makes the
# job order reproducible.
wav_names = sorted(os.listdir(f'{data_dir_new}/wavs'))
fps_src = [f'{data_dir_new}/wavs/{fn}' for fn in wav_names]
fps_dst = [f'{data_dir_new}/wavs-20k/{fn}' for fn in wav_names]
# Target sample rate for every file (note: the output folder is named "wavs-20k").
srs = [22050] * len(fps_src)

In [None]:
# One (source, destination, sample rate) job per file.
audio_files = list(zip(fps_src, fps_dst, srs))
with Pool(processes=64) as p:
    # imap_unordered yields a result as soon as any worker finishes a file, so wrapping
    # it in tqdm advances the bar once per completed file; the results themselves are
    # not needed because resample_file writes its output to disk.
    for _ in tqdm(p.imap_unordered(resample_file, audio_files), total=len(audio_files)):
        pass

In [None]:
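# Serial alternative to the Pool-based resampling above, kept for reference only.
# Unlike `resample_file`, it passes the 'PCM_24' subtype to `sf.write`.
# for fn in tqdm(os.listdir(f'{data_dir_new}/wavs')):
#     fp_src = f'{data_dir_new}/wavs/{fn}'
#     fp_dst =  f'{data_dir_new}/wavs-20k/{fn}'

#     y, sr = librosa.load(fp_src, sr=22050)
#     sf.write(fp_dst, y, sr, 'PCM_24')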

## Format data for eval

In [10]:
# Rebinds `data_dir`: from here on it is the converted dataset folder (the same path
# that was built as `data_dir_new` earlier), not the raw corpus folder.
data_dir = f"/nlsasfs/home/ai4bharat/manidl/ttsteam/datasets/indictts/{lang}"

In [11]:
# metadata.csv was written without a header row, so the column names are supplied here.
df = pd.read_csv(f'{data_dir}/metadata.csv', sep='|', names=['id', 'text', 'speaker'])
print(df.shape)
df.head()

(6960, 3)


Unnamed: 0,id,text,speaker
0,train_tamilmale_00001,அது தஞ்சாவூர்க் கோட்டைக்குள் பிரவேசிக்கவும் சக...,male
1,train_tamilmale_00002,அதற்குத் தகுந்தபடி ஏதாவது கொஞ்சம் பேசி வேஷம் ப...,male
2,train_tamilmale_00003,ஆனால் அவன் எதிர்பார்த்த சந்தர்ப்பம் ஒன்றும் கி...,male
3,train_tamilmale_00004,அப்படியும் பல்லக்கு கீழே வைக்கப்படவில்லை ஒரே ம...,male
4,train_tamilmale_00005,கோட்டைக்குள் பல்லக்குப் போய்விட்டால் அப்புறம் ...,male


In [12]:
# Hold out 1% of the rows as a test set. Stratifying on `speaker` keeps the speaker
# proportions the same in both splits; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.01, stratify=df['speaker'], random_state=0)
print(len(df_train), len(df_test))

6890 70


In [13]:
# Save both splits as pipe-separated files without the index column.
# NOTE(review): unlike metadata.csv (written with header=False), these files get a
# header row ("id|text|speaker") because `header` is left at its default — confirm
# that whatever consumes them expects or skips that first line.
df_train.to_csv(f'{data_dir}/metadata_train.csv', sep='|', index=False)
df_test.to_csv(f'{data_dir}/metadata_test.csv', sep='|', index=False)

In [14]:
# One folder of held-out audio per speaker. exist_ok=True keeps the cell re-runnable
# instead of raising FileExistsError on a second execution.
os.makedirs(f'{data_dir}/wavs-20k-test-male/', exist_ok=True)
os.makedirs(f'{data_dir}/wavs-20k-test-female/', exist_ok=True)

In [15]:
# Copy each held-out utterance from the resampled folder into its speaker's test folder.
# Only the `id` and `speaker` columns are needed, so iterate over those directly rather
# than building a Series per row with iterrows() and discarding the row label.
for utt_id, speaker in tqdm(zip(df_test['id'], df_test['speaker']), total=len(df_test)):
    src = f'{data_dir}/wavs-20k/{utt_id}.wav'
    dst = f'{data_dir}/wavs-20k-test-{speaker}/{utt_id}.wav'
    shutil.copyfile(src, dst)

  0%|          | 0/70 [00:00<?, ?it/s]