In [None]:
import os
import shutil
from multiprocessing import Pool

import librosa
import pandas as pd
import soundfile as sf

from tqdm.auto import tqdm

# Convert AI4Bharat Indic TTS data to LJSpeech format

In [None]:
# Shared dataset storage root; both the source corpus and the converted output live under it.
DATASETS_ROOT = "/nlsasfs/home/ai4bharat/manidl/ttsteam/datasets"
# Language to convert: folder name inside the source corpus and the short code
# used for the output folder. Change these two values to process another language.
LANG_NAME = "Tamil"
LANG_CODE = "ta"

# Source corpus directory for the selected language.
data_dir = f"{DATASETS_ROOT}/Indic TTS Data/TTS_data_Phase2_to_be_copied/{LANG_NAME}"
# Destination directory for the LJSpeech-style dataset.
data_dir_new = f"{DATASETS_ROOT}/ai4b-tts/{LANG_CODE}"

In [None]:
# Create the output dataset directory. exist_ok=True keeps the cell re-runnable
# (Restart & Run All) instead of raising FileExistsError on the second run.
os.makedirs(data_dir_new, exist_ok=True)

In [None]:
# Copy the male speaker's recordings into the shared wavs/ directory.
# dirs_exist_ok=True (already used for the female copy) lets this cell be re-run
# after wavs/ has been created instead of failing with FileExistsError.
shutil.copytree(f'{data_dir}/male/mono/wav', f'{data_dir_new}/wavs', dirs_exist_ok=True)

In [None]:
# Merge the female speaker's recordings into the same wavs/ directory as the male ones.
# NOTE(review): files with the same name in both speaker folders would overwrite
# each other here — presumably the file names are speaker-specific; confirm.
female_wav_dir = f'{data_dir}/female/mono/wav'
merged_wav_dir = f'{data_dir_new}/wavs'
shutil.copytree(female_wav_dir, merged_wav_dir, dirs_exist_ok=True)

In [None]:
# Transcript file for the male speaker.
metadata_male_fp = f"{data_dir}/male/mono/txt.done.data"
# Split every line on the double-quote character and keep the first two pieces:
# column 0 is whatever precedes the quoted text, column 1 is the quoted text itself.
metadata_male = pd.read_csv(metadata_male_fp, header=None, usecols=[0, 1], sep='"')
# Drop the first two characters of column 0, then trim surrounding whitespace.
# (presumably the dropped prefix is "( " from a festival-style line — confirm against the file)
male_ids = metadata_male[0].str.slice(2).str.strip()
male_texts = metadata_male[1].str.strip()
metadata_male[0] = male_ids
metadata_male[1] = male_texts
# Tag every row with the speaker label.
metadata_male[2] = 'male'
print(metadata_male.shape)
metadata_male.head()

In [None]:
# Transcript file for the female speaker.
metadata_female_fp = f"{data_dir}/female/mono/txt.done.data"
# Split every line on the double-quote character and keep the first two pieces:
# column 0 is whatever precedes the quoted text, column 1 is the quoted text itself.
metadata_female = pd.read_csv(metadata_female_fp, header=None, usecols=[0, 1], sep='"')
# Drop the first two characters of column 0, then trim surrounding whitespace.
# (presumably the dropped prefix is "( " from a festival-style line — confirm against the file)
female_ids = metadata_female[0].str.slice(2).str.strip()
female_texts = metadata_female[1].str.strip()
metadata_female[0] = female_ids
metadata_female[1] = female_texts
# Tag every row with the speaker label.
metadata_female[2] = 'female'
print(metadata_female.shape)
metadata_female.head()

In [None]:
# Stack both speakers' rows into one table with a fresh 0..n-1 index.
metadata = pd.concat([metadata_male, metadata_female], ignore_index=True)
metadata

In [None]:
# Write the combined table as a pipe-delimited file with no header row and no index column.
metadata_csv_fp = f'{data_dir_new}/metadata.csv'
metadata.to_csv(metadata_csv_fp, sep='|', header=False, index=False)

## Resampling

In [None]:
# Output directory for the resampled audio. exist_ok=True keeps the cell re-runnable.
# NOTE(review): the folder is named "wavs-20k" but the resampling cells below
# target 22050 Hz — confirm the name is intentional.
os.makedirs(f'{data_dir_new}/wavs-20k', exist_ok=True)

In [None]:
def resample_file(func_args):
    """Load one audio file at a requested sample rate and write it to a new path.

    Takes a single tuple so it can be mapped directly over a list of jobs
    (e.g. with ``Pool.imap_unordered``).

    Args:
        func_args: ``(fp_src, fp_dst, output_sr)`` — source path, destination
            path, and the sample rate passed to ``librosa.load`` as ``sr``.

    Returns:
        None. The result is the file written at ``fp_dst``.
    """
    src_path, dst_path, target_sr = func_args
    # librosa.load returns the samples together with the rate they were loaded at;
    # that returned rate is what gets written out.
    samples, loaded_sr = librosa.load(src_path, sr=target_sr)
    sf.write(dst_path, samples, loaded_sr)

In [None]:
# List the directory once so every source path is paired with the destination path
# built from the same file name; sorted() makes the job order deterministic.
wav_filenames = sorted(os.listdir(f'{data_dir_new}/wavs'))
fps_src = [f'{data_dir_new}/wavs/{fn}' for fn in wav_filenames]
fps_dst = [f'{data_dir_new}/wavs-20k/{fn}' for fn in wav_filenames]
# One target sample rate per file, so the three lists can be zipped into jobs.
srs = [22050] * len(fps_src)

In [None]:
# Bundle each (source path, destination path, sample rate) triple into one job.
audio_files = list(zip(fps_src, fps_dst, srs))
# imap_unordered yields as each file finishes, so the bar advances in completion
# order; the yielded values are discarded (resample_file returns None).
with Pool(processes=64) as pool, tqdm(total=len(audio_files)) as pbar:
    for _ in pool.imap_unordered(resample_file, audio_files):
        pbar.update()

In [None]:
# Serial (single-process) variant of the resampling step, kept commented out for reference.
# NOTE(review): this variant passes 'PCM_24' to sf.write, whereas resample_file calls
# sf.write without a subtype — confirm which output encoding is intended.
# for fn in tqdm(os.listdir(f'{data_dir_new}/wavs')):
#     fp_src = f'{data_dir_new}/wavs/{fn}'
#     fp_dst =  f'{data_dir_new}/wavs-20k/{fn}'

#     y, sr = librosa.load(fp_src, sr=22050)
#     sf.write(fp_dst, y, sr, 'PCM_24')