## Cloning the WaveRNN and TTS repositories

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import os

!git clone https://github.com/erogol/WaveRNN.git
os.chdir('WaveRNN')
!git checkout 12c8744
!pip install -r requirements.txt
#download weights
!rm saver-wavernn.zip
#!wget https://www.dropbox.com/s/jp5stdlbvfqw30u/checkpoint-wavernn-finetunnig-tts-portuguese-corpus-544400.zip?dl=0 -O saver-wavernn.zip
!wget https://www.dropbox.com/s/4a60kt3detcw3r6/checkpoint-wavernn-finetunnig-tts-portuguese-corpus-560900.zip?dl=0 -O saver-wavernn.zip
!ls  
!unzip saver-wavernn.zip 

os.chdir('..')
!git clone https://github.com/Edresson/TTS -b TTS-Portuguese
#os.chdir('TTS')
#!pip install -r requirements.txt
#os.chdir('..')

In [None]:
# Show the contents of the current working directory.
!ls

In [None]:
# Print the requirements file that ships with the TTS checkout.
!cat TTS/requirements.txt

In [None]:
# !python -m pip install -r TTS/requirements.txt
!pip install lws Unidecode tensorboardX Pillow phonemizer pydub
!apt-get install espeak

## Import modules

In [None]:
%load_ext autoreload
%autoreload 2
import os
import sys
import io
import torch 
import time
import numpy as np
from collections import OrderedDict

# Locate the two checkouts relative to the notebook's working directory: that is
# where the setup cell clones them and where later cells read 'TTS/config.json' and
# 'WaveRNN/...' from. Absolute paths keep the sys.path entries valid after a chdir.
TTS_PATH = os.path.abspath("TTS")
WAVERNN_PATH = os.path.abspath("WaveRNN")
# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
sys.path.append(WAVERNN_PATH) # set this if WaveRNN is not installed globally

import matplotlib.pyplot as plt
# pylab as plt
# %pylab inline

from matplotlib import rcParams 
# Default (width, height) for figures created later in the notebook.
rcParams["figure.figsize"] = (16,5)
# Appends the empty string to sys.path; presumably so packages located in the
# current working directory (the TTS/ and WaveRNN/ checkouts) resolve on import
# — TODO confirm this entry is still needed.
sys.path.append('')

import librosa
import librosa.display

from TTS.models.tacotron import Tacotron 
from TTS.layers import *
from TTS.utils.data import *
from TTS.utils.audio import AudioProcessor
from TTS.utils.generic_utils import load_config
from TTS.utils.text import text_to_sequence, phoneme_to_sequence, sequence_to_phoneme
from TTS.utils.text.symbols import symbols, phonemes

import IPython
from IPython.display import Audio
from TTS.utils import *


from TTS.utils.visual import visualize

from pydub import AudioSegment
# from matplotlib import pylab as plt

## Download Weights

In [None]:
#!wget -c -q --show-progress -O ./TTS-TL-saver.zip https://www.dropbox.com/s/b88peo1triqvvpe/checkpoint_255k-tts-portuguese-with-phonemes.zip?dl=0
#!wget -c -q --show-progress -O ./TTS-TL-saver.zip  https://www.dropbox.com/s/szrhtl75njx9ic6/TTS-checkpoint-phonemizer-wavernn-362600.zip?dl=0
!wget -c -q --show-progress -O ./TTS-TL-saver.zip https://www.dropbox.com/s/91etfwt4tvzjqyz/TTS-checkpoint-phonemizer-wavernn-381000.zip?dl=0
!ls
!rm config.json
!unzip TTS-TL-saver.zip
! mv checkpoint_381000.pth.tar checkpoint.pth.tar
#!wget -c -q --show-progress -O checkpoint_255200.pth.tar https://www.dropbox.com/s/phwp3bk64dlhx8u/checkpoint_255200.pth.tar?dl=0
#! mv checkpoint_255200.pth.tar checkpoint.pth.tar

## Inference Functions

In [None]:
def plot_alignment_with_text(alignment,text, info=None):
    """Draw an attention alignment as a heat map labelled with the input text.

    Args:
        alignment: 2-D attention matrix. It is transposed before plotting, so its
            first axis lands on the x axis ('Decoder timestep') and its second on
            the y axis ('Encoder timestep').
        text: input sequence; one y tick is placed per element and labelled with it.
        info: optional extra caption appended below the x-axis label.

    Returns:
        The matplotlib figure that holds the plot.
    """
    figure, axes = plt.subplots(figsize=(16, 10))
    heatmap = axes.imshow(
        alignment.T, aspect='auto', origin='lower', interpolation=None)
    figure.colorbar(heatmap, ax=axes)

    # The optional info string goes two lines below the axis label.
    caption = 'Decoder timestep' if info is None else 'Decoder timestep' + '\n\n' + info
    plt.xlabel(caption)
    plt.ylabel('Encoder timestep')

    # One tick per input symbol, labelled with the symbol itself.
    tick_labels = list(text)
    plt.yticks(range(len(text)), tick_labels)
    plt.tight_layout()
    return figure
  

from TTS.utils.synthesis import visualize

def synthesis(m, s, CONFIG, use_cuda, ap,language=None,WaveRNN=False):
    """Synthesize a waveform for the text ``s`` with the acoustic model ``m``.

    Args:
        m: acoustic model; ``m.forward(ids)`` must return
            ``(mel_spec, linear_spec, alignments, stop_tokens)``.
        s: input text.
        CONFIG: TTS configuration; ``text_cleaner``, ``use_phonemes`` and
            ``phoneme_language`` are read from it.
        use_cuda: when true, model inputs (and the vocoder input) are moved to the GPU.
        ap: audio processor used for Griffin-Lim inversion and endpoint trimming.
        language: phoneme language; defaults to ``CONFIG.phoneme_language``.
        WaveRNN: when true, vocode the mel spectrogram with the notebook-level
            ``wavernn`` model instead of Griffin-Lim.

    Returns:
        ``(wav, alignment, linear_spec, mel_spec, stop_tokens)``. ``alignment``,
        ``linear_spec`` and ``mel_spec`` are numpy arrays for the first (only)
        batch item; ``stop_tokens`` is returned as produced by the model.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]
    # print(phoneme_to_sequence(s, text_cleaner))
    # print(sequence_to_phoneme(phoneme_to_sequence(s, text_cleaner)))
    if CONFIG.use_phonemes:
        seq = np.asarray(
            phoneme_to_sequence(s, text_cleaner, language),
            dtype=np.int32)
    else:
        seq = np.asarray(text_to_sequence(s, text_cleaner), dtype=np.int32)
    # Add the batch dimension the model expects.
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        mel_spec, linear_spec, alignments, stop_tokens = m.forward(
            chars_var.long()
        )
    linear_spec = linear_spec[0].data.cpu().numpy()
    mel_spec = mel_spec[0].data.cpu().numpy()
    alignment = alignments[0].cpu().data.numpy()
    if not WaveRNN:
        wav = ap.inv_spectrogram(linear_spec.T)
        wav = wav[:ap.find_endpoint(wav)]
    else:
        vocoder_input = torch.FloatTensor(mel_spec.T).unsqueeze(0)
        # Follow use_cuda instead of calling .cuda() unconditionally, so the
        # WaveRNN path also works on a CPU-only runtime (the vocoder itself is
        # only moved to the GPU when use_cuda is set).
        if use_cuda:
            vocoder_input = vocoder_input.cuda()
        wav = wavernn.generate(vocoder_input, batched=True, target=11000, overlap=550)
    return wav, alignment, linear_spec, mel_spec, stop_tokens

def tts_griffin_lim(model, text, CONFIG, use_cuda, ap, figures=True,name='figure',language=None, display=True):
    """Synthesize ``text`` with the Griffin-Lim vocoder and optionally show the result.

    Args:
        model, text, CONFIG, use_cuda, ap, language: forwarded to ``synthesis``.
        figures: when true, plot the alignment and the spectrogram overview, and
            save the alignment figure as ``alig_<name>.png`` inside ``OUT_FOLDER``.
        name: suffix used in the saved figure's file name.
        display: when true, print the run time and render an audio player.

    Returns:
        ``(alignment, spectrogram, stop_tokens, waveform)``.
    """
    started_at = time.time()
    waveform, alignment, spectrogram, mel_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, language=language, WaveRNN=False)

    if display:
        elapsed = time.time() - started_at
        print(" >  Run-time with Griffin lim: {}".format(elapsed))
        print("Vocoder Griffin-Lim :")
        player = Audio(waveform, rate=ap.sample_rate)
        IPython.display.display(player)

    if figures:
        alignment_fig = plot_alignment_with_text(alignment, text)
        visualize(alignment, spectrogram, stop_tokens, text, 250, CONFIG, mel_spec)
        figure_path = os.path.join(OUT_FOLDER, 'alig_' + name + '.png')
        alignment_fig.savefig(figure_path)

    return alignment, spectrogram, stop_tokens, waveform

def tts_wave_rnn(model, text, CONFIG, use_cuda, audio_processor, figures=True,name='figure',language=None, display=True):
    """Synthesize ``text`` with the WaveRNN vocoder and optionally show the result.

    Args:
        model, text, CONFIG, use_cuda, language: forwarded to ``synthesis``.
        audio_processor: audio processor passed to ``synthesis``; its
            ``sample_rate`` is used for the audio player.
        figures: when true, plot the alignment and the spectrogram overview, and
            save the alignment figure as ``alig_<name>.png`` inside ``OUT_FOLDER``.
        name: suffix used in the saved figure's file name.
        display: when true, print the run time and render an audio player.

    Returns:
        ``(alignment, spectrogram, stop_tokens, waveform)``.
    """
    t_1 = time.time()
    waveform, alignment, spectrogram, mel_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, audio_processor, language=language, WaveRNN=True)
    if display:
        print(" >  Run-time with WaveRNN: {}".format(time.time() - t_1))
        # Label fixed: this audio comes from WaveRNN, not Griffin-Lim.
        print("Vocoder WaveRNN :")
        IPython.display.display(Audio(waveform, rate=audio_processor.sample_rate))
    if figures:
        fig = plot_alignment_with_text(alignment,text)
        visualize(alignment, spectrogram, stop_tokens,text,250, CONFIG,mel_spec)
        fig.savefig(os.path.join(OUT_FOLDER,'alig_'+name+'.png'))

    return alignment, spectrogram, stop_tokens, waveform
  
# Set constants

MODEL_PATH = 'checkpoint.pth.tar'
CONFIG_PATH =  'TTS/config.json'
OUT_FOLDER = 'samples/'
# exist_ok=True tolerates a folder left by an earlier run, while real failures
# (e.g. missing permissions) surface here instead of being swallowed by a bare except.
os.makedirs(OUT_FOLDER, exist_ok=True)

CONFIG = load_config(CONFIG_PATH)

use_cuda = torch.cuda.is_available()

## Restore TTS model

In [None]:
VOCODER_MODEL_PATH = "WaveRNN/saver.pth.tar"
VOCODER_CONFIG_PATH = "WaveRNN/config_16K.json"
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)

# Audio processors: ap2 follows the vocoder config, ap follows the TTS config.
ap2 = AudioProcessor(**VOCODER_CONFIG.audio)
ap = AudioProcessor(**CONFIG.audio)

num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
# NOTE(review): the download cell already renames checkpoint_381000.pth.tar to
# checkpoint.pth.tar, and no visible cell produces checkpoint_381200.pth.tar.
# The rename is kept for the case where such a file is present, but guarded so a
# missing file no longer prints a "cannot stat" error — confirm whether it can go.
if os.path.exists('checkpoint_381200.pth.tar'):
    os.replace('checkpoint_381200.pth.tar', 'checkpoint.pth.tar')
model= Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels, CONFIG.r, CONFIG.memory_size)

# load model state
# NOTE(review): torch.load unpickles the file, and this checkpoint is downloaded
# from an external URL — only load checkpoints from a source you trust.
if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    # Remap every tensor onto the CPU when no GPU is available.
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

# load the model
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

## Restore WaveRNN model

In [None]:
#from utils.generic_utils import load_config
from WaveRNN.models.wavernn import Model

bits = 10

wavernn = Model(
    rnn_dims=512,
    fc_dims=512,
    mode=VOCODER_CONFIG.mode,
    mulaw=VOCODER_CONFIG.mulaw,
    pad=VOCODER_CONFIG.pad,
    use_aux_net=VOCODER_CONFIG.use_aux_net,
    use_upsample_net=VOCODER_CONFIG.use_upsample_net,
    upsample_factors=VOCODER_CONFIG.upsample_factors,
    feat_dims=80,
    compute_dims=128,
    res_out_dims=128,
    res_blocks=10,
    hop_length=ap2.hop_length,
    sample_rate=ap2.sample_rate,
)
        
check = torch.load(VOCODER_MODEL_PATH, map_location=torch.device('cuda' if use_cuda else 'cpu'))
wavernn.load_state_dict(check['model'])
if use_cuda:
    wavernn.cuda()
wavernn.eval();
print(check['step'])

## Synthesize test sentences

In [None]:
import pydub

def process_audio(audio):
    """Strip silent stretches from ``audio`` and normalise its format.

    The audio is split wherever pydub detects silence below -36 dBFS, the
    non-silent chunks are concatenated, and the result is converted to 16-bit
    (sample width 2), mono, at a frame rate of 22500.

    Args:
        audio: a pydub ``AudioSegment``.

    Returns:
        The processed ``AudioSegment``. If no non-silent chunk is found, the
        input is kept whole (only the format conversion is applied).
    """
    chunks = list(pydub.silence.split_on_silence(audio, silence_thresh=-36))
    # sum([]) is the int 0, which has no set_sample_width(): fall back to the
    # untouched audio so a very quiet clip does not raise AttributeError.
    trimmed = sum(chunks) if chunks else audio
    # NOTE(review): 22500 is an unusual rate (22050 is the common one) — confirm
    # it is intentional before changing it.
    return trimmed.set_sample_width(2).set_channels(1).set_frame_rate(22500)

In [None]:
import wave
import contextlib

def get_audio_length(file_path):
    """Return the duration, in seconds, of a .wav or .mp3 file.

    Args:
        file_path: path to the audio file; the format is chosen from the
            extension, compared case-insensitively.

    Returns:
        Duration in seconds as a float.

    Raises:
        ValueError: if the extension is neither .wav nor .mp3.
    """
    lowered = file_path.lower()
    if lowered.endswith('.wav'):
        with contextlib.closing(wave.open(file_path, 'r')) as wav_file:
            frames = wav_file.getnframes()
            rate = wav_file.getframerate()
            return frames / float(rate)
    if lowered.endswith('.mp3'):
        # The original referenced an `MP3` class that is never imported in this
        # notebook (NameError). pydub is already a dependency here, so use it;
        # the import is local so the WAV path has no extra requirement.
        from pydub import AudioSegment
        return AudioSegment.from_file(file_path, format='mp3').duration_seconds
    raise ValueError('Unsupported file format. File must be wav or mp3')

In [None]:
!unzip sentences.zip

In [None]:
import glob

sentences_dict = dict()
# Only pick up files named like '<prefix>_<label>.txt': the synthesis loop derives
# its label from the part after the first underscore, and a plain '*.txt' would
# also match the 'times.txt' log that loop writes, breaking any re-run.
# sorted() makes the processing order deterministic.
for fp in sorted(glob.glob('*_*.txt')):
    with open(fp) as f:
        sentences = f.readlines()
    # NOTE(review): [:-1] drops the last line of every file — presumably a
    # trailing blank line; confirm it never discards a real sentence.
    sentences_dict[fp] = [sent.strip() for sent in sentences][:-1]

In [None]:
# test_sentences =[
#         # "O capital de uma empresa depende de sua produção",
#         # "Se não fosse ela tudo teria sido melhor, ou talvez não",
#         # "A principal personagem no filme é uma gueixa",
#         # "Espere seu amigo em casa",
#         # "A juventude tinha que revolucionar a escola",
#         # "A cantora terá quatro meses para ensaiar seu canto",
# ]

import textwrap
import io

# Vocoder switch: True -> WaveRNN, False -> Griffin-Lim.
wave_rnn = False
# Number of sentences synthesized per input file.
MAX_SENTENCES_PER_FILE = 2

# `br` is the maximum chunk width (in characters) fed to the model at once; the
# decoder step limit is set alongside it for each vocoder.
if wave_rnn:
    model.decoder.max_decoder_steps = 500
    br = 50
else:
    model.decoder.max_decoder_steps = 300
    br = 30

# Start a fresh timing log with its header line.
with open('times.txt', 'w') as f:
    print('fp, inf_time, sent_length, audio_length', file=f)

for n, sentences in sentences_dict.items():
    # '<prefix>_<label>.txt' -> 'LABEL'
    n = n.split('_')[1].split('.')[0].upper()
    for idx, sentence in enumerate(sentences[:MAX_SENTENCES_PER_FILE]):
        # Wrap once so the preview printed below shows exactly the chunks that
        # are synthesized (the original wrapped twice with different settings).
        frases = textwrap.wrap(sentence, width=br, break_long_words=False)
        if not frases:
            # Blank sentence: there is nothing to synthesize, and sum([]) below
            # would hand an int to process_audio.
            continue
        print('\n'.join(frases))
        print('#' * br)
        start = time.time()
        wavs = []
        for frase in frases:
            # Each chunk is rendered into an in-memory WAV file.
            f = io.BytesIO()
            if wave_rnn:
                align, spec, stop_tokens, wav = tts_wave_rnn(model, frase, CONFIG, use_cuda, ap2, figures=False, name=str(idx+1), display=False)
                ap2.save_wav(wav, f)
            else:
                align, spec, stop_tokens, wav = tts_griffin_lim(model, ' ' + frase + ' ', CONFIG, use_cuda, ap, figures=False, name=str(idx+1), display=False)
                ap.save_wav(wav, f)
            wavs.append(f)
        # Decode the chunks with pydub and join them into one segment.
        wavs = [AudioSegment.from_file_using_temporary_files(w) for w in wavs]
        wav = sum(wavs)
        wav = process_audio(wav)
        fp = '{}_{}'.format(n, idx)
        wav.export("{}.wav".format(fp), format="wav")
        end = time.time()
        print('\n', end - start, 'segundos')
        with open('times.txt', 'a') as f:
            print('{}, {}, {}, {}'.format(fp, end - start, len(sentence), get_audio_length('{}.wav'.format(fp))), file=f)
        IPython.display.display(wav)
        print('*' * 80)

In [None]:
# from notify import send
# send('Finished Griffin-Lin run')

In [None]:
# %%bash

# mkdir audios
# rm audios/*.wav
# mv *.wav audios
# zip -r audios_griffin_cpu.zip audios times.txt sentences.txt
# cp audios_griffin_gpu.zip '/content/drive/My Drive/TCC_data'