In [None]:
!pip install pydub

## Create data

In [None]:
import os
import glob
import shutil
import pandas as pd
import tqdm
import pydub

In [None]:
# Build the dataset folder: copy the transcript file and convert every source
# recording to a 22050 Hz, single-channel WAV named `sample-<n>.wav`.

# exist_ok=True lets this cell be re-run without raising FileExistsError.
os.makedirs('/content/dataset/wavs', exist_ok=True)

files = glob.glob('/content/drive/MyDrive/arquivos/texto*.wav')
shutil.copy2('/content/drive/MyDrive/arquivos/texts.csv', '/content/dataset/')

# Source files that could not be converted because their name does not follow
# the `<prefix>-<number>.wav` pattern.
invalid = []
for f in tqdm.tqdm(files):
    try:
        sound = pydub.AudioSegment.from_file(f)
        sound = sound.set_frame_rate(22050)
        sound = sound.set_channels(1)
        # Keep only the file name, then the part before the first dot.
        fname = os.path.basename(f).split('.')[0]
        # Exactly one '-' is expected; the part after it is the sample number.
        _, fname = fname.split('-')
        # int() normalises the number (e.g. drops leading zeros).
        fname = '/content/dataset/wavs/sample-{}.wav'.format(int(fname))
        sound.export(fname, format='wav')
    except (IndexError, ValueError):
        # A wrong number of '-' separated parts or a non-numeric suffix raises
        # ValueError (not IndexError); record the file and keep going instead
        # of aborting the whole conversion.
        invalid.append(f)
print(invalid)

In [None]:
# Build `metadata.csv`: keep only the transcript rows whose audio file was
# actually converted, and write them pipe-separated without header or index.

# `sep` is longer than one character, so pandas treats it as a regular
# expression, which only the python engine supports; naming the engine
# explicitly avoids the ParserWarning fallback.
df = pd.read_csv(
    '/content/drive/MyDrive/arquivos/texts.csv',
    sep='==',
    header=None,
    engine='python',
)
texts = df[0].to_list()

# Absolute paths keep this cell working even after a later `%cd` has changed
# the working directory, and match the paths used when the WAVs were written.
files = glob.glob('/content/dataset/wavs/*.wav')
# Make the paths relative to the dataset root (`wavs/<name>.wav`) so they can
# be compared with column 0 of the transcript file.
files = [f.replace('/content/dataset/', '') for f in files]

valid = [*(set(texts) & set(files))]

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise SettingWithCopyWarning.
df = df[df[0].isin(valid)].copy()

# Every remaining entry matches `wavs/*.wav`; drop the 5-character `wavs/`
# prefix and the 4-character `.wav` suffix, leaving the bare sample name.
df[0] = df[0].str[5:-4]

df.to_csv('/content/dataset/metadata.csv', sep='|', header=None, index=None)

In [None]:
# Archive the dataset directory and move the archive into the Drive `tts` folder.
# NOTE(review): `dataset` is a relative path, so this assumes the working
# directory is /content (where /content/dataset was created) — confirm.
!zip -r dataset.zip dataset
!mv dataset.zip /content/drive/MyDrive/tts/

## Configure Training

In [None]:
# Clone the `pt-br` branch of the TTS fork and make the checkout the working
# directory for the cells that follow.
# NOTE(review): re-running this cell in the same session will find an existing
# `TTS` directory and change directory again — restart the runtime before
# re-running.
!git clone https://github.com/flych3r/TTS.git -b pt-br
%cd TTS

In [None]:
# Install the package found in the current directory in editable mode.
# NOTE(review): the install command is issued twice; presumably a workaround
# for a first pass that does not complete cleanly — confirm it is still needed,
# otherwise the second line can be dropped.
!pip install -e .
!pip install -e .

In [None]:
# Summary statistics of the character length of every entry in column 1 of
# the metadata frame; the cells below read its 'min', '25%', '75%', 'max'
# and 'std' entries.
text_lengths = df[1].str.len()
df_stats = text_lengths.describe()
df_stats

In [None]:
# Lower length bound: one standard deviation below the first quartile, but
# never below the shortest observed length.
# NOTE(review): presumably a guide for choosing `min_seq_len` — confirm.
below_first_quartile = df_stats['25%'] - df_stats['std']
max(df_stats['min'], below_first_quartile)

In [None]:
# Upper length bound: one standard deviation above the third quartile, but
# never above the longest observed length.
# NOTE(review): presumably a guide for choosing `max_seq_len` — confirm.
above_third_quartile = df_stats['75%'] + df_stats['std']
min(df_stats['max'], above_third_quartile)

In [None]:
import ujson
from TTS.utils.io import load_config

# --- TTS model config -------------------------------------------------------
# Start from the template shipped in the cloned repository, override the
# settings specific to this dataset, and save the result to Drive.
CONFIG = load_config('/content/TTS/TTS/tts/configs/config.json')

# Top-level overrides, applied one by one in the order listed.
# NOTE(review): 'num_mels', 'mel_fmin', 'mel_fmax' and 'spec_gain' are written
# at the top level, while 'sample_rate' and 'stats_path' go under 'audio' —
# confirm the training code reads these four keys from the top level.
for key, value in (
    ('use_phonemes', True),
    ('phoneme_language', 'pt-br'),
    ('phoneme_cache_path', 'phoneme_cache/'),
    ('num_mels', 80),
    ('mel_fmin', 0.0),
    ('mel_fmax', 8000.0),
    ('spec_gain', 20.0),
    ('min_seq_len', 2),
    ('max_seq_len', 240),
    # NOTE(review): each row is presumably [start step, r, batch size] —
    # confirm against the TTS config documentation.
    ('gradual_training', [
        [0, 7, 16],
        [1, 5, 16],
        [50000, 3, 16],
        [130000, 2, 8],
        [290000, 1, 8],
    ]),
    ('epochs', 1000),
    ('test_delay_epochs', 10),
    ('output_path', '/content/drive/MyDrive/tts/'),
):
    CONFIG[key] = value

# Nested overrides: dataset location and audio settings.
CONFIG['datasets'][0]['path'] = '/content/dataset'
CONFIG['audio']['stats_path'] = '/content/drive/MyDrive/tts/scale_stats.npy'
CONFIG['audio']['sample_rate'] = 22050

with open('/content/drive/MyDrive/tts/config.json', 'w') as config_file:
    ujson.dump(CONFIG, config_file, indent=True)

# --- Vocoder config ---------------------------------------------------------
# Same procedure for the multiband MelGAN vocoder template.
VOCODER_CONFIG = load_config('/content/TTS/TTS/vocoder/configs/multiband_melgan_config.json')

# Audio settings first, matching the sample rate of the converted recordings.
VOCODER_CONFIG['audio']['sample_rate'] = 22050
VOCODER_CONFIG['audio']['stats_path'] = '/content/drive/MyDrive/tts/vocoder_scale_stats.npy'

# NOTE(review): the mel range and gain set here (50.0-7600.0, 1.0) differ from
# the TTS config above (0.0-8000.0, 20.0) — confirm this is intentional.
for key, value in (
    ('data_path', '/content/dataset/wavs'),
    ('mel_fmin', 50.0),
    ('mel_fmax', 7600.0),
    ('spec_gain', 1.0),
    ('epochs', 1000),
    ('test_delay_epochs', 10),
    ('output_path', '/content/drive/MyDrive/tts/'),
):
    VOCODER_CONFIG[key] = value

with open('/content/drive/MyDrive/tts/vocoder_config.json', 'w') as vocoder_config_file:
    ujson.dump(VOCODER_CONFIG, vocoder_config_file, indent=True)

In [None]:
# Run the repository's statistics script against the TTS config saved on Drive,
# passing the Drive `tts` folder as the output location.
# NOTE(review): the script path is relative, so this relies on the working
# directory being the TTS checkout (set by `%cd TTS`). The config's
# `stats_path` expects `scale_stats.npy` in that folder — confirm the script
# writes that file name.
!python TTS/bin/compute_statistics.py --config_path /content/drive/MyDrive/tts/config.json --out_path /content/drive/MyDrive/tts/