In [None]:
# Copy the audio archive into the working directory and unpack it.
# NOTE(review): the source path looks like a mounted Google Drive folder — confirm the
# drive is mounted before this cell runs.
!cp '/content/drive/My Drive/TCC_data/audios_error.zip' .
!unzip audios_error.zip
# pysptk is imported below for the SWIPE pitch tracker and the mel-cepstrum routines.
!pip install pysptk
# NOTE(review): a later cell imports `fastdtw`, which is not installed here — confirm
# the runtime already provides it, otherwise a fresh "Run All" will stop at that import.
# Output folder referenced by the (currently commented-out) plt.savefig call.
!mkdir figs

In [None]:
import pysptk
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from IPython import display

In [None]:
# Use matplotlib's 'classic' style sheet for every figure in this notebook.
plt.style.use('classic')
# Render inline figures as JPEG at quality 94.
# NOTE(review): newer IPython releases appear to deprecate
# IPython.display.set_matplotlib_formats in favour of the matplotlib_inline
# package — confirm against the installed version.
display.set_matplotlib_formats('jpg', quality=94)

## Audio Features

In [None]:
# Shared analysis settings, read by the feature-extraction functions below.
frame_length = 1024  # samples per analysis frame (also used as the FFT size)
hop_length = 256     # samples between the starts of consecutive frames
n_mels = 80          # number of mel bands in the spectrogram

def spectrogram(filename, plot=True):
    """Load an audio file and compute its mel spectrogram in decibels.

    Leading and trailing audio more than 40 dB below the peak is trimmed before
    the analysis. The spectrogram uses the module-level `frame_length`,
    `hop_length` and `n_mels` settings.

    Parameters
    ----------
    filename : str
        Path of the audio file, loaded with `librosa.load` defaults.
    plot : bool, default True
        When true, draw the spectrogram with a dB colour bar on the current
        matplotlib axes.

    Returns
    -------
    S_DB : np.ndarray
        Mel power spectrogram converted to dB relative to its own maximum
        (`ref=np.max`), so the loudest bin is 0 dB.
    (song, sr) : tuple
        The trimmed waveform and its sampling rate.
    """
    y, sr = librosa.load(filename)

    song, _ = librosa.effects.trim(y, top_db=40)

    # Pass the signal by keyword: recent librosa releases make `y` keyword-only,
    # while older releases accept the keyword form as well.
    S = librosa.feature.melspectrogram(
        y=song, sr=sr, n_fft=frame_length, hop_length=hop_length, n_mels=n_mels
    )
    S_DB = librosa.power_to_db(S, ref=np.max)
    if plot:
        librosa.display.specshow(S_DB, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
    return S_DB, (song, sr)

In [None]:
# Settings forwarded to pysptk.mcep in f0_pitch_energy.
order = 25    # second positional argument of pysptk.mcep (mel-cepstrum order)
alpha = 0.41  # third positional argument of pysptk.mcep

def f0_pitch_energy(path):
    """Extract two SWIPE tracks and a frame-wise energy curve from an audio file.

    The file is loaded with `librosa.load` defaults and trimmed at 40 dB below
    the peak. Returns a tuple `(f0, pitch, energy)`:

    * `f0`     - `pysptk.swipe` output with `otype="f0"`;
    * `pitch`  - `pysptk.swipe` output with `otype="pitch"`;
    * `energy` - `pysptk.conversion.mc2e` applied to the mel-cepstrum of every
      Blackman-windowed frame that contains no exactly-zero sample.

    Both SWIPE calls search the 60-240 range and use the module-level
    `hop_length`; framing uses `frame_length` / `hop_length`, and the
    mel-cepstrum uses `order` / `alpha`.
    """
    signal, sr = librosa.load(path)
    signal, _ = librosa.effects.trim(signal, top_db=40)

    # Options shared by the two SWIPE calls; only the output type differs.
    swipe_options = dict(fs=sr, hopsize=hop_length, min=60, max=240)
    f0 = pysptk.swipe(signal.astype(np.float64), otype="f0", **swipe_options)
    pitch = pysptk.swipe(signal.astype(np.float64), otype="pitch", **swipe_options)

    # One row per frame after the transpose, so the window broadcasts across rows.
    windowed = librosa.util.frame(
        signal, frame_length=frame_length, hop_length=hop_length
    ).astype(np.float64).T
    windowed *= pysptk.blackman(frame_length)

    # Keep only the frames in which every sample is non-zero.
    fully_nonzero = (windowed != 0).all(axis=1)
    mel_cepstrum = pysptk.mcep(windowed[fully_nonzero], order, alpha)
    energy = pysptk.conversion.mc2e(mel_cepstrum)

    return f0, pitch, energy

## Metrics

In [None]:
from torch.nn import L1Loss
from torch import tensor

# Shared criterion instance (default reduction), reused on every call.
l1 = L1Loss()


def L1(yhat, y):
    """Return the L1 loss between `yhat` and `y` as a plain Python number.

    Both arguments are converted with `torch.tensor` before being passed to the
    module-level `l1` criterion.
    """
    prediction, target = tensor(yhat), tensor(y)
    return l1(prediction, target).tolist()

In [None]:
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

def DTW(yhat, y):
    """Return the FastDTW distance between `yhat` and `y`.

    Points are compared with the Euclidean distance; the warping path that
    `fastdtw` also returns is discarded.
    """
    return fastdtw(yhat, y, dist=euclidean)[0]

In [None]:
from sklearn.metrics import mean_squared_error

def RMSE(yhat, y):
    """Return the root of scikit-learn's mean squared error between `yhat` and `y`."""
    mse = mean_squared_error(yhat, y)
    return np.sqrt(mse)

In [None]:
from torch.nn import CosineSimilarity
from torch import tensor

# Shared similarity module with torch's default settings (similarity along dim 1).
cosine = CosineSimilarity()


def COSINE(yhat, y):
    """Return the mean cosine similarity between `yhat` and `y` as a Python number.

    Both inputs are converted with `torch.tensor`; the per-row similarities
    produced by the module-level `cosine` are averaged into a single value.
    """
    similarities = cosine(tensor(yhat), tensor(y))
    averaged = similarities.mean()
    return averaged.tolist()

## Comparing with sentences with errors

In [None]:
# The 20 evaluation sentences (Portuguese).
# NOTE(review): presumably entry k is the transcript of ground_truth/{k+1}.wav used by
# the evaluation loop below — confirm. The list is not read anywhere in this section.
# NOTE(review): "E bom te ver ..." and "Nova metas ..." look like typos ("É bom",
# "Novas metas"); left untouched because the strings are data — check the source corpus.
sentences = [
    "A inauguração da vila é quarta ou quinta feira",
    "Vote se você tiver o título de eleitor",
    "Hoje é fundamental encontrar a razão da existência humana",
    "A temperatura é mais amena a noite",
    "Em muitas cidades a população está diminuindo",
    "Nunca se deve ficar em cima do morro",
    "Para as pessoas estranhas, o panorama é desolador",
    "E bom te ver colhendo flores menino",
    "Eu finjo me banhar num lago ao amanhecer",
    "Sua sensibilidade mostrará o caminho",
    "A Amazônia é a reserva ecológica do globo",
    "O ministério mudou demais com a eleição",
    "Nova metas surgem na informática",
    "O capital de uma empresa depende de sua produção",
    "Se não fosse ela tudo teria sido melhor",
    "A principal personagem do filme é uma gueixa",
    "Espere seu amigo em casa",
    "A juventude tinha que revolucionar a escola",
    "A cantora terá quatro meses para ensaiar seu canto",
    "Esse tema foi falado no congresso"
]

In [None]:
def metric(y, yhat, pad_value=0.0):
    """Score `yhat` against the reference `y` with the mean cosine similarity.

    `yhat` is first brought to the shape of `y`:

    * if `y` has more columns than `yhat`, `yhat` is copied into the top-left
      corner of an array shaped like `y` and the rest is filled with
      `pad_value`;
    * otherwise `yhat` is cropped to at most `y.shape` rows and columns.

    Parameters
    ----------
    y : np.ndarray
        Reference 2-D array.
    yhat : np.ndarray
        Candidate 2-D array; must not have more rows than `y` when padding is
        needed.
    pad_value : float, default 0.0
        Fill value for the padded region. The default keeps the historical
        zero padding. For spectrograms produced by `spectrogram` (dB relative
        to the peak, `ref=np.max`) a value of 0 corresponds to peak level, not
        silence; pass e.g. `yhat.min()` to pad with the quietest level instead.

    Returns
    -------
    float
        The value returned by `COSINE(y, yhat)`.
    """
    if y.shape[1] > yhat.shape[1]:
        padded = np.full_like(y, pad_value)
        padded[:yhat.shape[0], :yhat.shape[1]] = yhat
        yhat = padded
    else:
        yhat = yhat[:y.shape[0], :y.shape[1]]

    return COSINE(y, yhat)

In [None]:
# Score accumulator: d[model][variant] collects one similarity value per matching
# synthesised clip. Every (model, variant) pair gets its own independent list.
d = {
    model_name: {variant: [] for variant in ('original', 'mix', 'swap')}
    for model_name in ('mozilla_tts', 'dctts')
}

In [None]:
import glob
import tqdm

# For each TTS model: compute spectrograms for the 20 ground-truth recordings and
# their synthesised variants, rank the variants of every sentence by similarity to
# the ground truth, store the scores in `d`, then plot and play each pair.
for model in ['dctts', 'mozilla_tts']:

    audios = {}

    for i in tqdm.tqdm(range(1, 21)):
        gt_path = 'ground_truth/{}.wav'.format(i)
        gt_spec, _ = spectrogram(gt_path, plot=False)

        clips = []
        # sorted() makes the processing order independent of the filesystem.
        for clip_path in sorted(glob.glob('{}/{}_*.wav'.format(model, i))):
            clip_spec, _ = spectrogram(clip_path, plot=False)
            clips.append((clip_path, clip_spec))

        # Build the (n_clips, 2) object table cell by cell. Calling np.array() on
        # the list of (path, spectrogram) tuples relies on ragged-array inference,
        # which recent NumPy versions reject, and collapses to a 1-D array when no
        # file matched, which would break the [idx, 0] indexing below.
        table = np.empty((len(clips), 2), dtype=object)
        for row, (clip_path, clip_spec) in enumerate(clips):
            table[row, 0] = clip_path
            table[row, 1] = clip_spec

        audios[i] = {'ground_truth': (gt_path, gt_spec), 'synthesised': table}

    for speech in audios:
        gt_path, gt_spec = audios[speech]['ground_truth']
        synthesised = audios[speech]['synthesised']

        # Similarity of every synthesised clip to the ground truth, best first.
        arr = np.array([metric(gt_spec, clip_spec) for clip_spec in synthesised[:, 1]], dtype=float)
        idx = np.argsort(-arr)
        ranked = [*zip(synthesised[idx, 0], arr[idx])]

        # The variant is recognised by a substring of the clip's path.
        for clip_path, score in ranked:
            for variant in ('original', 'mix', 'swap'):
                if variant in clip_path:
                    d[model][variant].append(score)

        display.display(display.Audio(gt_path))

        fpe_gt = f0_pitch_energy(gt_path)

        for clip_idx, entry in zip(idx, ranked):
            plt.figure(figsize=(20, 16))

            # hop_length must match the one used in spectrogram(); otherwise
            # specshow assumes its own default hop and mis-scales the time axis.
            plt.subplot(4, 2, 1)
            librosa.display.specshow(gt_spec, hop_length=hop_length, x_axis='time', y_axis='mel')
            plt.title(gt_path)

            plt.subplot(4, 2, 2)
            librosa.display.specshow(synthesised[clip_idx, 1], hop_length=hop_length, x_axis='time', y_axis='mel')
            plt.title(str(entry))

            fpe = f0_pitch_energy(entry[0])

            # Rows 2-4: ground truth on the left, synthesised clip on the right.
            for j, m in enumerate(['f0', 'pitch', 'energy']):
                plt.subplot(4, 2, 3 + 2 * j)
                plt.plot(fpe_gt[j])
                plt.subplot(4, 2, 4 + 2 * j)
                plt.plot(fpe[j])
                plt.title(m)

            plt.subplots_adjust(hspace=0.5)
            # plt.savefig('figs/{}.png'.format(entry[0].replace('/', '_')))
            plt.show()

            display.display(display.Audio(synthesised[clip_idx, 0]))

In [None]:
# !zip -r figs.zip figs
# !cp  figs.zip '/content/drive/My Drive/TCC_data/'

In [None]:
# Average the collected similarity scores for every (model, variant) pair.
scores = {
    model_name: {variant: np.mean(values) for variant, values in per_variant.items()}
    for model_name, per_variant in d.items()
}
scores