In [None]:
# Fetch the evaluation audio from Google Drive and prepare the workspace.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
!cp '/content/drive/My Drive/TCC_data/audios.zip' .
# -o overwrites without prompting, so re-running this cell cannot stall on
# unzip's interactive "replace?" question.
!unzip -o audios.zip
# fastdtw is imported by the DTW metric cell further down but was not installed
# anywhere in this notebook.
!pip install pysptk fastdtw
# -p keeps the cell re-runnable when the output directory already exists.
!mkdir -p audio_model_comparasion

In [None]:
import pysptk
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from IPython import display

In [None]:
# Apply matplotlib's 'classic' style sheet to every figure drawn after this cell.
plt.style.use('classic')
# Ask IPython to render inline figures as JPEG with quality 94.
# NOTE(review): recent IPython releases deprecate display.set_matplotlib_formats
# in favour of matplotlib_inline.backend_inline.set_matplotlib_formats — confirm
# against the installed IPython version.
display.set_matplotlib_formats('jpg', quality=94)

## Audio Features

In [None]:
frame_length = 1024  # FFT size (n_fft) for the mel spectrogram; also the analysis frame size used elsewhere
n_mels = 80          # number of mel bands
hop_length = 256     # samples between successive analysis frames


def spectrogram(filename, plot=True):
    """Load an audio file, trim it, and return its mel spectrogram in dB.

    Parameters
    ----------
    filename : str or path-like
        Audio file passed to ``librosa.load`` (loaded with librosa's default
        sample rate).
    plot : bool, default True
        When true, draw the spectrogram and a dB colorbar on the *current*
        matplotlib axes (the caller is expected to have selected the subplot).

    Returns
    -------
    S_DB : numpy.ndarray
        Mel power spectrogram converted to dB relative to its own maximum
        (``ref=np.max``).
    (song, sr) : tuple
        The trimmed waveform and its sample rate.
    """
    y, sr = librosa.load(filename)

    # Trim the signal edges using a 40 dB threshold.
    song, _ = librosa.effects.trim(y, top_db=40)

    # The audio must be passed as the keyword ``y=``: librosa >= 0.10 made the
    # arguments of melspectrogram keyword-only, and the keyword form also works
    # on older releases.
    S = librosa.feature.melspectrogram(
        y=song, sr=sr, n_fft=frame_length, hop_length=hop_length, n_mels=n_mels
    )
    S_DB = librosa.power_to_db(S, ref=np.max)

    if plot:
        librosa.display.specshow(
            S_DB, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel'
        )
        plt.colorbar(format='%+2.0f dB')
    return S_DB, (song, sr)

In [None]:
order = 25    # order argument handed to pysptk.mcep
alpha = 0.41  # alpha argument handed to pysptk.mcep


def f0_pitch_energy(path):
    """Extract SWIPE f0, SWIPE pitch and frame energy from one audio file.

    The file is loaded with ``librosa.load`` and trimmed with a 40 dB
    threshold. ``hop_length`` and ``frame_length`` are read from the
    notebook-level constants.

    Returns
    -------
    tuple
        ``(f0, pitch, energy)`` where ``f0`` and ``pitch`` are the outputs of
        ``pysptk.swipe`` for ``otype="f0"`` and ``otype="pitch"``, and
        ``energy`` is ``pysptk.conversion.mc2e`` applied to the mel-cepstrum
        of the windowed frames.
    """
    signal, sr = librosa.load(path)
    signal, _ = librosa.effects.trim(signal, top_db=40)

    def _swipe(otype):
        # A fresh float64 copy is passed on every call, search range 60-240.
        return pysptk.swipe(
            signal.astype(np.float64),
            fs=sr,
            hopsize=hop_length,
            min=60,
            max=240,
            otype=otype,
        )

    f0 = _swipe("f0")
    pitch = _swipe("pitch")

    # One row per analysis frame, each multiplied by a Blackman window.
    windowed = librosa.util.frame(
        signal, frame_length=frame_length, hop_length=hop_length
    )
    windowed = windowed.astype(np.float64).T
    windowed *= pysptk.blackman(frame_length)

    # Keep only frames with no exactly-zero sample.
    # NOTE(review): presumably guards mcep against degenerate frames — confirm.
    has_no_zero = (windowed != 0).all(axis=1)
    windowed = windowed[has_no_zero]

    mel_cepstrum = pysptk.mcep(windowed, order, alpha)
    energy = pysptk.conversion.mc2e(mel_cepstrum)

    return f0, pitch, energy

## Metrics

In [None]:
from torch.nn import L1Loss
from torch import tensor

l1 = L1Loss()


def L1(yhat, y):
    """Return the L1Loss (default settings) between two array-likes as a plain Python number.

    Both inputs are converted with ``torch.tensor`` and must have compatible
    shapes.
    """
    prediction, target = tensor(yhat), tensor(y)
    return l1(prediction, target).tolist()

In [None]:
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

def DTW(yhat, y):
    """Return the fastdtw distance between two sequences using Euclidean point distance.

    Only the distance is returned; the warping path computed by ``fastdtw``
    is discarded.

    NOTE(review): fastdtw appears to treat the first axis as the sequence axis,
    so 2-D inputs are aligned row by row — confirm that callers pass the
    intended orientation.
    """
    return fastdtw(yhat, y, dist=euclidean)[0]

In [None]:
from sklearn.metrics import mean_squared_error

def RMSE(yhat, y):
    """Return the square root of sklearn's ``mean_squared_error`` for the two inputs."""
    mse = mean_squared_error(yhat, y)
    return np.sqrt(mse)

In [None]:
from torch.nn import CosineSimilarity
from torch import tensor

cosine = CosineSimilarity()


def COSINE(yhat, y):
    """Return the mean cosine similarity between two array-likes as a plain Python number.

    Inputs are converted with ``torch.tensor``. ``CosineSimilarity`` is used
    with its default settings and the resulting similarities are averaged.
    """
    similarity = cosine(tensor(yhat), tensor(y))
    return similarity.mean().tolist()

## Comparing the same sentences against the Ground Truth

In [None]:
# https://edresson.github.io/TTS-Portuguese-Corpus/
#
# For each of the 20 evaluation sentences: play the ground-truth and synthesized
# audio, plot mel spectrograms (row 1) and f0 / pitch / energy curves (rows 2-4)
# on one 4x4 figure, and collect spectrogram-distance metrics per model.
models = ['dctts', 'tacotron', 'mozilla_tts']
metric_fns = {'L1': L1, 'DTW': DTW, 'RMSE': RMSE, 'COSINE': COSINE}

# eval[model][metric] -> list with one score per sentence.
# (The name shadows the builtin `eval`; it is kept because the summary cell
# reads this variable.)
eval = {model_name: {metric_name: [] for metric_name in metric_fns}
        for model_name in models}


def _pad_to_width(spec, width):
    """Return a zero-padded copy of a 2-D array with `width` columns."""
    # NOTE(review): the spectrograms are in dB relative to their maximum, so a
    # padding value of 0 is the loudest level rather than silence — confirm
    # that this is the intended padding value.
    return np.pad(spec, ((0, 0), (0, width - spec.shape[1])), mode='constant')


for i in range(1, 21):
    plt.figure(figsize=(22, 20))

    # Row 1, column 1: ground-truth spectrogram.
    plt.subplot(4, 4, 1)
    print('Ground Truth')
    display.display(display.Audio('ground_truth/{}.wav'.format(i)))
    g_spec, _ = spectrogram('ground_truth/{}.wav'.format(i))
    plt.title('Ground Truth')

    # Row 1, columns 2-4: one spectrogram per model, scored against ground truth.
    for col, m in enumerate(models, start=2):
        plt.subplot(4, 4, col)
        s_spec, s_sound = spectrogram('{}/{}.wav'.format(m, i))
        plt.title(m)

        # Pad copies of both spectrograms to the longer of the two. The
        # original ground truth is left untouched so that every model is
        # compared against the same reference, independent of model order.
        width = max(g_spec.shape[1], s_spec.shape[1])
        g_pad = _pad_to_width(g_spec, width)
        s_pad = _pad_to_width(s_spec, width)

        print(m)
        for n, metric in metric_fns.items():
            # Compute once and reuse for both the record and the printout.
            value = metric(g_pad, s_pad)
            eval[m][n].append(value)
            print(n, ':', value, end=',')
        print()
        display.display(display.Audio(data=s_sound[0], rate=s_sound[1]))

    # Rows 2-4: f0, pitch and energy for ground truth and every model.
    # Relative paths match the ones used for the spectrograms above.
    sources = [('ground truth', 'ground_truth')] + [(m, m) for m in models]
    features = [(label, f0_pitch_energy('{}/{}.wav'.format(folder, i)))
                for label, folder in sources]

    for row, n in enumerate(['f0', 'pitch', 'energy'], start=1):
        for col, (label, feats) in enumerate(features, start=1):
            plt.subplot(4, 4, 4 * row + col)
            plt.plot(feats[row - 1])
            plt.title('{} {}'.format(label, n))

    # `i` is still the sentence index here (it is no longer shadowed by the
    # feature loop), so each sentence gets its own file.
    plt.savefig('audio_model_comparasion/{}.png'.format(i))
    plt.show()

In [None]:
# Zip the directory of saved comparison figures and copy the archive to Google Drive.
# NOTE(review): assumes Drive is mounted at /content/drive — confirm.
!zip -r audio_model_comparasion.zip audio_model_comparasion
!cp  audio_model_comparasion.zip '/content/drive/My Drive/TCC_data/'

In [None]:
# Print, for every model, the mean of each collected metric, followed by a separator line.
for model_name, scores in eval.items():
    print(model_name)
    for metric_name, values in scores.items():
        average = np.mean(values)
        print(metric_name, average)
    print('*' * 80)