**Cloning repository**

In [None]:
# Mount Google Drive at /content/drive; the last cell of the notebook copies the
# results archive into a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select TensorFlow 1.x: the cells below use the TF1 session API
# (tf.Session, tf.train.Saver, tf.get_collection).
%tensorflow_version 1.x

In [None]:
# pydub is used further down to join, silence-trim and export the synthesized audio.
!pip install pydub

In [None]:
!git clone https://github.com/Edresson/TTS-Conv.git
import os
import time
os.chdir('TTS-Conv')

**Import modules**

In [None]:
from hyperparams import Hyperparams as hp
import numpy as np
import tensorflow as tf
from train import Graph
from utils import *
from scipy.io.wavfile import write
from tqdm import tqdm
from librosa import  display
from data_load import text_normalize,load_vocab
from IPython.display import Audio



**Download Weights**




In [None]:
!wget -c -q --show-progress -O ./saver-text.zip https://www.dropbox.com/s/oeafuy4yp7nqj5y/saver-text.zip?dl=0
!ls
!unzip saver-text.zip  

**Restore Model**

In [None]:
# Load graph
g = Graph(mode="synthesize"); print("Graph loaded")

sess = tf.Session()

sess.run(tf.global_variables_initializer())

# Restore parameters
var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel')
saver1 = tf.train.Saver(var_list=var_list)
saver1.restore(sess, os.path.join('saver-text','text2mel','saver'))
print("Text2Mel Restored!")

var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')
saver2 = tf.train.Saver(var_list=var_list)
saver2.restore(sess, os.path.join('saver-text','mel2linear','saver'))
print("SSRN Restored!")
%cd ..

**Synthesize**


In [None]:
import IPython
from pydub import AudioSegment
import io
import textwrap

def _synthesize_chunk(frase, char2idx):
    """Synthesize one chunk of text and return its audio as in-memory WAV buffers.

    Args:
        frase: Raw text of the chunk (not yet normalized).
        char2idx: Character-to-index mapping, as returned by ``load_vocab()``.

    Returns:
        A list of ``io.BytesIO`` objects, one per item of the SSRN output batch,
        each holding a WAV written at ``hp.sr``.

    Raises:
        ValueError: If the normalized chunk (including the EOS marker) is longer
            than ``hp.max_N``.
    """
    # Text normalization (per the original note, this removes invalid
    # characters); "E" is the end-of-sentence marker.
    frase = text_normalize(frase).strip() + "E"
    if len(frase) > hp.max_N:
        # Without this check numpy raises a less helpful broadcast ValueError below.
        raise ValueError(
            'Normalized chunk has {} characters but at most hp.max_N={} are supported'.format(
                len(frase), hp.max_N))

    # Encode the characters as indices, zero-padded up to hp.max_N.
    text = np.zeros((1, hp.max_N), np.int32)
    text[0, :len(frase)] = [char2idx[char] for char in frase]

    # Text2Mel, one frame per step: feed the frames produced so far, keep only
    # frame j of the prediction, and carry the attention position forward.
    mels = np.zeros((len(text), hp.max_T, hp.n_mels), np.float32)
    prev_max_attentions = np.zeros((len(text),), np.int32)
    for j in range(hp.max_T):
        _Y, _max_attentions = sess.run(
            [g.Y, g.max_attentions],
            {g.L: text, g.mels: mels, g.prev_max_attentions: prev_max_attentions})
        mels[:, j, :] = _Y[:, j, :]
        prev_max_attentions = _max_attentions[:, j]

    # SSRN: turn the mel frames into magnitudes.
    Z = sess.run(g.Z, {g.Y: mels})

    # Convert each magnitude to a waveform and keep it as an in-memory WAV.
    buffers = []
    for mag in Z:
        buffer = io.BytesIO()
        write(buffer, hp.sr, spectrogram2wav(mag))
        buffers.append(buffer)
    return buffers


def sysntesise_text(sentence, fp=None):
    """Synthesize ``sentence`` to speech, display it, and log the timing.

    The sentence is wrapped into chunks of up to 100 characters (words are never
    split, so a single longer word stays whole). Each chunk is synthesized
    separately; the chunk audios are concatenated and passed through
    ``process_audio``. One row ``fp, elapsed seconds, len(sentence), audio
    seconds`` is appended to ``times.txt``.

    Args:
        sentence: Text to synthesize.
        fp: Output path without extension. When truthy, the processed audio is
            exported to ``"<fp>.wav"``. When omitted, nothing is exported and the
            logged audio length is taken from the in-memory audio instead.

    Returns:
        The list of per-chunk ``AudioSegment`` objects, i.e. the audio before
        concatenation and ``process_audio``.

    Raises:
        ValueError: If ``sentence`` is empty or only whitespace, or if a chunk is
            too long for the model (see ``_synthesize_chunk``).
    """
    start = time.time()

    chunks = textwrap.wrap(sentence, width=100, break_long_words=False)
    if not chunks:
        # With no chunks there is no audio to concatenate: sum([]) would be the
        # int 0 and process_audio would fail on it.
        raise ValueError('sentence must contain at least one non-whitespace character')

    # The vocabulary does not depend on the chunk, so load it once per call.
    char2idx, _ = load_vocab()

    # note: hp.max_T can be changed depending on the phrase to be synthesized.
    # The default value is 210, which generates at most about 10 seconds of audio;
    # lowering it speeds up synthesis.
    hp.max_T = 210

    buffers = []
    for chunk in chunks:
        buffers.extend(_synthesize_chunk(chunk, char2idx))

    wavs = [AudioSegment.from_file_using_temporary_files(buffer) for buffer in buffers]
    wav = process_audio(sum(wavs))

    out_path = "{}.wav".format(fp) if fp else None
    if out_path:
        wav.export(out_path, format="wav")

    end = time.time()
    print(end - start, 'segundos')

    # Measure the exported file when there is one; otherwise fall back to the
    # in-memory duration instead of opening a file that was never written.
    if out_path:
        audio_length = get_audio_length(out_path)
    else:
        audio_length = wav.duration_seconds

    with open('times.txt', 'a') as f:
        print('{}, {}, {}, {}'.format(fp, end - start, len(sentence), audio_length), file=f)

    IPython.display.display(wav)
    print('*' * 80)
    return wavs

In [None]:
!unzip sentences.zip

In [None]:
import glob

# Map each sentence file in the current directory to its list of sentences.
# 'times.txt' is this notebook's own timing log, written to the same directory by
# the synthesis cells; it is skipped so a re-run does not treat it as input.
# Sorting makes the processing order reproducible.
sentences_dict = dict()
for fp in sorted(glob.glob('*.txt')):
    if fp == 'times.txt':
        continue
    with open(fp) as f:
        sentences = f.readlines()
    # NOTE(review): the last line of every file is dropped unconditionally,
    # presumably because the files end with a blank or incomplete line. Confirm
    # against the data, otherwise a real sentence is lost.
    sentences_dict[fp] = [sent.strip() for sent in sentences][:-1]

In [None]:
import pydub

def process_audio(audio, silence_thresh=-36, frame_rate=22500):
    """Remove silent stretches from ``audio`` and normalize its format.

    Args:
        audio: pydub ``AudioSegment`` to process.
        silence_thresh: Threshold passed to ``split_on_silence``.
        frame_rate: Frame rate of the returned audio.
            NOTE(review): the default 22500 is kept from the original code;
            22050 Hz is the more common rate, so confirm it against ``hp.sr``.

    Returns:
        The non-silent chunks joined together, with a sample width of 2 bytes,
        1 channel, and the requested frame rate. If no non-silent chunk is found
        the result is a zero-length segment.
    """
    # Import the function explicitly so this does not depend on the
    # `pydub.silence` submodule having been loaded by a plain `import pydub`.
    from pydub.silence import split_on_silence

    chunks = split_on_silence(audio, silence_thresh=silence_thresh)
    # sum([]) is the int 0, which has no audio methods; use an empty slice of the
    # input when everything was classified as silence.
    joined = sum(chunks) if chunks else audio[:0]
    return joined.set_sample_width(2).set_channels(1).set_frame_rate(frame_rate)

In [None]:
import wave
import contextlib

def get_audio_length(file_path):
    """Return the duration of an audio file in seconds.

    Args:
        file_path: Path to a ``.wav`` or ``.mp3`` file. The extension is matched
            case-insensitively.

    Returns:
        The duration in seconds as a float.

    Raises:
        Exception: If the extension is neither ``.wav`` nor ``.mp3``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.wav'):
        # The WAV header gives the frame count and rate, so no decoding is needed.
        with wave.open(file_path, 'rb') as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())
    if lowered.endswith('.mp3'):
        # The original referenced an `MP3` class that is never imported in this
        # notebook; pydub (already a dependency here) is used to decode instead.
        from pydub import AudioSegment
        return AudioSegment.from_file(file_path, format='mp3').duration_seconds
    raise Exception('Unsuported file format. File must be wav or mp3')

In [None]:
# test_sentences =[
#         # "O capital de uma empresa depende de sua produção",
#         # "Se não fosse ela tudo teria sido melhor, ou talvez não.",
#         # "A principal personagem no filme é uma gueixa",
#         # "Espere seu amigo em casa",
#         # "A juventude tinha que revolucionar a escola",
#         # "A cantora terá quatro meses para ensaiar seu canto",
# ]

# https://cartadeservicos.ce.gov.br/ConsultaCesec/pg_cs_servico.aspx

# Start a fresh timing log with its header row; sysntesise_text appends one row
# per synthesized sentence.
with open('times.txt', 'w') as f:
    print('fp, inf_time, sent_length, audio_length', file=f)

# Synthesize the first two sentences of every sentence file.
for file_name, sentences in sentences_dict.items():
    # Label = the part of the file name between the first '_' and the next '_'
    # or '.', upper-cased.
    label = file_name.split('_')[1].split('.')[0].upper()
    for i, frase in enumerate(sentences[:2]):
        wrapped_lines = textwrap.wrap(frase, width=80)
        print('\n'.join(wrapped_lines))
        sysntesise_text(frase, '{}_{}'.format(label, i))

In [None]:
# The notification is best-effort. The `notify` module is not installed by this
# notebook, and an ImportError here would stop a "Run all" before the cell that
# archives the results, so a missing module is reported and skipped instead.
try:
    from notify import send
except ImportError:
    print('notify module not available; skipping the completion notification')
else:
    send('Finished DCTTS run')

In [None]:
%%bash

# Collect the generated clips into ./audios. -p and -f keep the cell re-runnable:
# no error if the directory already exists or holds no .wav files yet.
mkdir -p audios
rm -f audios/*.wav
mv *.wav audios
# Remove any archive from a previous run first; `zip -r` would otherwise update
# it in place and keep stale entries.
rm -f audios_dctts_cpu.zip
zip -r audios_dctts_cpu.zip audios times.txt
# Copy the archive to the mounted Google Drive folder.
cp audios_dctts_cpu.zip '/content/drive/My Drive/TCC_data'