<a href="https://colab.research.google.com/github/deterministic-algorithms-lab/Speech-Explorations/blob/master/Mellotron/eval_single_speaker(en).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the pretrained checkpoints into ./weights (relative to the starting directory).
# -p keeps this cell re-runnable: plain mkdir errors out once the directory exists.
!mkdir -p weights
%cd weights
# URLs are quoted so the shell never treats the '?' as a glob character.
# WaveGlow vocoder checkpoint
!gdown "https://drive.google.com/uc?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF"
# LJS Mellotron model checkpoint
!gdown "https://drive.google.com/uc?id=1UwDARlUl8JvB2xSuyMFHFsIWELVpgQD4"
%cd ..

In [None]:
# Clone the project repo and move into its mellotron directory; later cells use
# paths relative to this directory (e.g. 'waveglow/' and '../sample_data/').
!git clone https://github.com/deterministic-algorithms-lab/Speech-Explorations
%cd Speech-Explorations/Mellotron/mellotron/
# WaveGlow is cloned inside the mellotron directory; the import cell appends 'waveglow/' to sys.path.
!git clone https://github.com/NVIDIA/waveglow

In [None]:
# Extra Python dependencies installed at runtime.
!pip install unidecode
!pip install soundfile
# Installed over https: GitHub no longer serves the unauthenticated git:// protocol,
# so a git+git:// URL fails to clone.
!pip install git+https://github.com/libindic/indic-trans.git
!pip install tensorboardX

In [None]:
# Colab magic that selects the TensorFlow version for this runtime.
# NOTE(review): the magic presumably only distinguishes major versions (1.x / 2.x),
# so "1.12.0" is likely interpreted as 1.x — confirm on the target runtime.
%tensorflow_version 1.12.0

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
sys.path.append('waveglow/')

from itertools import cycle
import numpy as np
import scipy as sp
from scipy.io.wavfile import write
import pandas as pd4
import librosa
import torch

from hparams import create_hparams
from model import Tacotron2, load_model
from waveglow.denoiser import Denoiser
from layers import TacotronSTFT
from data_utils import TextMelLoader, TextMelCollate
from text import cmudict, text_to_sequence
from mellotron_utils import get_data_from_musicxml

In [4]:
def panner(signal, angle):
    """Pan a signal between a left and a right channel.

    `angle` is in degrees. At 0 both channels carry sqrt(2)/2 of the signal;
    at +45 the signal sits entirely on the right, at -45 entirely on the left.

    Returns the first entry of the depth-stacked (left, right) pair, which for
    a 1-D signal of length N is an (N, 2) array.
    """
    theta = np.radians(angle)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    scale = np.sqrt(2)/2.0
    left_gain = scale * (cos_t - sin_t)
    right_gain = scale * (cos_t + sin_t)
    stereo = np.dstack((left_gain * signal, right_gain * signal))
    return stereo[0]

In [5]:
def plot_mel_f0_alignment(mel_source, mel_outputs_postnet, f0s, alignments, figsize=(16, 16)):
    """Plot source mel (optional), predicted mel, pitch contour and alignment as stacked panels.

    Args:
        mel_source: 2-D array shown as the "Source Mel" image, or None to omit that panel.
        mel_outputs_postnet: 2-D array shown as the "Predicted Mel" image.
        f0s: sequence of pitch values, drawn as a scatter plot against their index.
        alignments: 2-D array shown as the "Source rhythm" image.
        figsize: size passed to `plt.subplots`.

    Returns:
        None. The figure is created on the current pyplot state.
    """
    # Without a source mel the figure has one panel fewer and every remaining
    # panel moves up by one position.
    offset = int(mel_source is None)
    fig, axes = plt.subplots(4 - offset, 1, figsize=figsize)
    axes = axes.flatten()

    # Matplotlib only accepts 'upper' or 'lower' for `origin`; 'bottom' raises a
    # ValueError on current versions. 'lower' puts row 0 at the bottom of the image.
    image_kwargs = dict(aspect='auto', origin='lower', interpolation='none')

    if mel_source is not None:
        axes[0].imshow(mel_source, **image_kwargs)
        axes[0].set_title("Source Mel")

    axes[1 - offset].imshow(mel_outputs_postnet, **image_kwargs)
    axes[1 - offset].set_title("Predicted Mel")

    axes[2 - offset].scatter(range(len(f0s)), f0s, alpha=0.5, color='red', marker='.', s=1)
    axes[2 - offset].set_xlim(0, len(f0s))
    axes[2 - offset].set_title("Source pitch contour")

    axes[3 - offset].imshow(alignments, **image_kwargs)
    axes[3 - offset].set_title("Source rhythm")

    plt.tight_layout()

In [6]:
def load_mel(path):
    """Load an audio file and return its mel spectrogram as a CUDA tensor.

    Relies on the notebook-level `hparams` (target sampling rate) and `stft`
    (the TacotronSTFT instance) being defined before it is called.

    Args:
        path: path of the audio file to load.

    Returns:
        The output of `stft.mel_spectrogram` for the audio with a leading
        batch dimension of 1, moved to the GPU.

    Raises:
        ValueError: if the loaded sampling rate differs from `hparams.sampling_rate`.
    """
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    # Kept as a safety net; librosa is asked for the target rate above, so this
    # presumably never triggers. The message now reports the same value the check
    # compares against instead of reading it from a different object.
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, hparams.sampling_rate))
    # Add the batch dimension. The deprecated torch.autograd.Variable wrapper is
    # dropped: tensors do not require grad unless asked to.
    audio_norm = torch.from_numpy(audio).unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec.cuda()

In [None]:
# Start from the project's default hyperparameters, then override the text
# cleaners and speaker count for this single-speaker English evaluation.
hparams = create_hparams()
hparams.text_cleaners = ['english_cleaners']
hparams.n_speakers = 1

In [8]:
# STFT / mel front end configured entirely from hparams; load_mel() calls
# stft.mel_spectrogram() on the loaded waveform.
stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length,
                    hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
                    hparams.mel_fmax)

## Load Models

In [None]:
# Build the Mellotron model on the GPU in eval mode, then load the pretrained
# weights stored under the checkpoint's 'state_dict' key.
# NOTE(review): the absolute /content path assumes the Colab layout in which the
# first cell created weights/ — confirm before running elsewhere.
# NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
checkpoint_path = "/content/weights/mellotron_ljs.pt"
mellotron = load_model(hparams).cuda().eval()
mellotron.load_state_dict(torch.load(checkpoint_path)['state_dict'])

In [None]:
# The object stored under the checkpoint's 'model' key is used directly as the
# vocoder (moved to the GPU, eval mode) rather than loaded into a freshly built model.
# NOTE(review): unpickling it presumably needs 'waveglow/' on sys.path — confirm.
# NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
waveglow_path = '/content/weights/waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model'].cuda().eval()
# Denoiser built from the vocoder; applied to the vocoder output in the synthesis cells.
denoiser = Denoiser(waveglow).cuda().eval()

## Setup dataloaders

In [11]:
#To add language and speaker number
#!sed 's/$/|0/' ../sample_data/trans_audio.txt > ../sample_data/train_file1.txt
#!sed 's/$/|0/' ../sample_data/train_file1.txt > ../sample_data/train_file2.txt 

In [12]:
#To change path
#!sed 's+drive/My Drive/sample_data/+Speech-Explorations/Mellotron/sample_data/+' ../sample_data/trans_audio.txt > ../sample_data/trans_audio1.txt

In [12]:
# Write a one-line filelist for the dataloader. Fields are '|'-separated:
# audio path | transcript | 0 | 0, where the two trailing values are unpacked
# later as (sid, lang).
!echo "../sample_data/LJ037-0171.wav|The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,|0|0" > ../sample_data/single_speaker.txt

In [13]:
# Pronunciation dictionary handed to text_to_sequence() when the text is encoded.
arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
# Filelist written by the preceding cell.
audio_paths = '../sample_data/single_speaker.txt'
dataloader = TextMelLoader(audio_paths, hparams)
# NOTE(review): the argument is presumably the number of frames per decoder step — confirm.
datacollate = TextMelCollate(1)

## Load data

In [None]:
# Pick one entry of the filelist and prepare every input the synthesis cells need.
file_idx=0
audio_path, text, sid, lang = dataloader.audiopaths_and_text[file_idx]
mel = load_mel(audio_path)

#text = 'My name is Martha White.'
text_encoded = torch.LongTensor(text_to_sequence(text, hparams.text_cleaners, arpabet_dict))[None, :].cuda()

# Fetch the dataset item once and reuse it, so the pitch contour and the batch
# below come from the same item and the item is not computed twice.
sample = dataloader[file_idx]
pitch_contour = sample[3][None].cuda()
print(audio_path, text)

x, y = mellotron.parse_batch(datacollate([sample]))
with torch.no_grad():
    # get rhythm (alignment map) using tacotron 2
    mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = mellotron.forward(x)
    # Swap the first two dimensions of the alignment map for the no-attention inference call.
    rhythm = rhythm.permute(1, 0, 2)

In [None]:
# Optional: sharpen the alignment plots.
# The snippet below is wrapped in a string literal, so it does not execute as
# written; remove the triple quotes to apply a temperature-scaled softmax
# (temperature 0.1) over dim 2 of `rhythm`.
'''
sm = torch.nn.Softmax(dim=2)
print(rhythm.shape)
temperature=0.1
rhythm = sm(rhythm/temperature)
'''

In [None]:
# Play back the source recording for reference.
ipd.Audio(audio_path, rate=hparams.sampling_rate)

# Style Transfer 

### Rhythm and Pitch Contour

* Changing text and providing alignments as ```rhythm[:,:,:text_encoded.shape[1]]``` doesn't work here.

* The `mel` argument can be replaced by a number too.

* Need to try what happens when slight shifts are made in original ```rhythm```

In [None]:
# Synthesize with both the rhythm (alignment map) and the pitch contour of the source.
speaker_id = torch.LongTensor([0]).cuda()

with torch.no_grad():
    print(text_encoded.shape, mel.shape, pitch_contour.shape, rhythm.shape)
    # The whole alignment map is passed unsliced. The previous `rhythm[:,:,:))`
    # was missing its closing bracket and did not parse.
    mel_outputs, mel_outputs_postnet, gate_outputs, alignments = mellotron.inference_noattention(
        (text_encoded, mel, speaker_id, pitch_contour, rhythm))

# For the plot only, the alignment map is cropped to its first 100 entries along
# dim 0 and to the encoded text length along the last dim.
plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                      mel_outputs_postnet.data.cpu().numpy()[0],
                      pitch_contour.data.cpu().numpy()[0, 0],
                      rhythm[:100,:,:text_encoded.shape[1]].data.cpu().numpy()[:, 0].T)

In [None]:
# Vocode the predicted mel with WaveGlow, denoise the result, and take index 0
# along dim 1 of the denoiser output before playback.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
    audio = denoiser(audio, 0.01)[:, 0]
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)

### Only Pitch contour

* Seems to work on changed text, but not too well.
* Styles can be changed by writing numbers in place of ```mel```

In [None]:
# Synthesize using only the source pitch contour; the model computes its own alignments.
# The variable was previously misspelled `peaker_id`, so this cell only worked
# when `speaker_id` was still defined from an earlier cell.
speaker_id = torch.LongTensor([0]).cuda()

with torch.no_grad():
    mel_outputs, mel_outputs_postnet, gate_outputs, alignments = mellotron.inference(
        (text_encoded, mel, speaker_id, pitch_contour))

plot_mel_f0_alignment(x[2].data.cpu().numpy()[0],
                      mel_outputs_postnet.data.cpu().numpy()[0],
                      pitch_contour.data.cpu().numpy()[0, 0],
                      alignments.data.cpu().numpy()[0].T)

In [None]:
# Vocode the predicted mel with WaveGlow, denoise the result, and take index 0
# along dim 1 of the denoiser output before playback.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
    audio = denoiser(audio, 0.01)[:, 0]
ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)