<a href="https://colab.research.google.com/github/franciscojferrari/AUTOVC/blob/speaker-encoder/Speaker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import tensorflow as tf
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM,Dense,Lambda,Masking
from tensorflow import keras
import sys
import keras.backend as K


In [None]:
def speaker_centroids(embeddings):
    """
    Average each speaker's utterance embeddings and L2-normalise the result.

    Inputs:
        embeddings: Embeddings from encoder, shape=(speakers_per_batch, utterances_per_speaker, embedding_size)

    Returns
        Speaker centroids of shape=(speakers_per_batch, 1, embedding_size).
    """
    # Mean over the utterance axis; keepdims leaves it as a singleton dimension.
    mean_embedding = tf.math.reduce_mean(embeddings, axis=1, keepdims=True)
    # Norm along the embedding axis; the epsilon avoids division by zero.
    norm = tf.norm(mean_embedding, axis=2, keepdims=True) + 1e-6
    return tf.identity(mean_embedding) / norm

In [None]:

def similarity_matrix(embeddings, speaker_centroids, utterance_centroids):
    """
    Dot-product similarity between every utterance embedding and every speaker centroid.

    For an utterance of speaker k compared against centroid j:
      * j != k: the similarity uses the speaker centroid ``speaker_centroids[j]``;
      * j == k: the similarity uses that utterance's own entry in
        ``utterance_centroids`` instead.

    Inputs:
        embeddings: Embeddings from encoder, shape=(speakers_per_batch, utterances_per_speaker, embedding_size)
        speaker_centroids: Speaker centroids of shape=(speakers_per_batch, 1, embedding_size).
        utterance_centroids: Utterance centroids of
            shape=(speakers_per_batch, utterances_per_speaker, embedding_size)
            (a singleton utterance axis is also accepted and broadcast).

    Returns
        Similarity matrix of shape=(speakers_per_batch, utterances_per_speaker, speakers_per_batch),
        where entry [k, i, j] compares utterance i of speaker k with centroid j.
    """
    embeddings = tf.convert_to_tensor(embeddings)
    speaker_centroids = tf.convert_to_tensor(speaker_centroids)
    utterance_centroids = tf.convert_to_tensor(utterance_centroids)

    # cross[k, i, j] = <embeddings[k, i], speaker_centroids[j, 0]>
    cross = tf.einsum("kid,jd->kij", embeddings, speaker_centroids[:, 0, :])

    # own[k, i] = <embeddings[k, i], utterance_centroids[k, i]>
    own = tf.reduce_sum(embeddings * utterance_centroids, axis=2)

    # True exactly where the centroid index equals the utterance's own speaker.
    # Using tf.shape keeps this valid when the speaker count is not static.
    num_speakers = tf.shape(embeddings)[0]
    same_speaker = tf.cast(tf.eye(num_speakers), tf.bool)[:, tf.newaxis, :]

    # Broadcast: (N, 1, N) mask, (N, M, 1) own-speaker values, (N, M, N) cross values.
    return tf.where(same_speaker, own[:, :, tf.newaxis], cross)


In [None]:
def utterance_centroids(embeddings):
    """
    Leave-one-out centroids: for every utterance, the L2-normalised mean of the
    *other* utterances of the same speaker.

    Inputs:
        embeddings: Embeddings from encoder, shape=(speakers_per_batch, utterances_per_speaker, embedding_size)

    Returns
        Utterance centroids of shape=(speakers_per_batch, utterances_per_speaker, embedding_size)
        (one centroid per utterance, i.e. the same shape as ``embeddings``).

    Raises
        ValueError: if there are fewer than two utterances per speaker, in which
            case the leave-one-out mean is undefined (it would be 0 / 0).
    """
    utterances_per_speaker = embeddings.shape[1]
    if utterances_per_speaker is not None and utterances_per_speaker < 2:
        raise ValueError(
            "utterance_centroids needs at least 2 utterances per speaker, got "
            f"{utterances_per_speaker}"
        )

    # Sum over all utterances of a speaker, then remove the utterance itself.
    speaker_sum = tf.math.reduce_sum(embeddings, axis=1, keepdims=True)
    exclusive_mean = (speaker_sum - embeddings) / (utterances_per_speaker - 1)

    # Normalise along the embedding axis; the epsilon avoids division by zero.
    norm = tf.norm(exclusive_mean, axis=2, keepdims=True) + 1e-6
    return exclusive_mean / norm

In [None]:
def calculate_loss(sim_matrix):
    """
    Softmax-style loss summed over every utterance in the batch.

    For utterance i of speaker k the per-utterance loss is
        -sim_matrix[k, i, k] + log(sum_j exp(sim_matrix[k, i, j]) + 1e-6)

    The computation uses TensorFlow ops only, so a ``tf.GradientTape`` can
    differentiate through it (converting to NumPy would cut the gradient).

    Inputs:
        sim_matrix: Scaled similarity matrix of
            shape=(speakers_per_batch, utterances_per_speaker, speakers_per_batch).

    Returns
        Scalar tensor holding the summed loss.
    """
    sim_matrix = tf.convert_to_tensor(sim_matrix)

    # Similarity of each utterance to its own speaker: move the utterance axis
    # to the front so the two speaker axes form a square matrix, take its
    # diagonal -> (utterances, speakers), then transpose back to (speakers, utterances).
    pos = tf.transpose(
        tf.linalg.diag_part(tf.transpose(sim_matrix, perm=[1, 0, 2]))
    )

    # Log of the summed exponentials over all centroids (epsilon kept inside the log).
    neg = tf.math.log(tf.reduce_sum(tf.exp(sim_matrix), axis=2) + 1e-6)

    return tf.reduce_sum(neg - pos)

In [None]:
class SpeechEmbedder(keras.layers.Layer):
    """
    Speaker encoder: Masking -> LSTM(768) -> LSTM(768) -> Dense(256, relu) -> L2 normalisation.

    Inputs are expected as (batch, time_dim, melfilters_dim); time steps whose
    features all equal -1.0 are masked out (use -1.0 as the padding value).
    """

    def __init__(self, time_dim=13, melfilters_dim=32):
        """
        Args:
            time_dim: Number of time steps per (padded) utterance.
            melfilters_dim: Number of features per time step.
        """
        super(SpeechEmbedder, self).__init__()
        self.model = Sequential()
        self.model.add(Masking(mask_value=-1.0,
                               input_shape=(time_dim, melfilters_dim)))
        # Only the first layer of a Sequential model needs an input shape.
        self.model.add(LSTM(768, return_sequences=True))
        self.model.add(LSTM(768))
        # TODO: check activation function
        self.model.add(Dense(256, activation='relu'))
        # The Dense output is (batch, 256), so axis=1 normalises each embedding row.
        self.model.add(Lambda(lambda x: K.l2_normalize(x, axis=1)))

    def call(self, inputs):
        """
        Embed a batch with ``predict`` and return the result as a NumPy array.

        NOTE: ``predict`` leaves TensorFlow, so no gradient can flow through the
        returned array. Use ``embed`` when the output feeds a loss that must be
        differentiated with respect to the encoder weights.
        """
        return self.model.predict(inputs)

    def embed(self, inputs, training=False):
        """
        Differentiable forward pass: returns the embeddings as a tensor of
        shape (batch, 256) so they can be used inside a ``tf.GradientTape``.
        """
        return self.model(inputs, training=training)

In [None]:
class GE2ELoss(keras.layers.Layer):
    """
    Loss layer with a learnable scale ``w`` and offset ``b`` applied to the
    similarity matrix before the loss is computed.
    """

    def __init__(self):
        super(GE2ELoss, self).__init__()
        self.w = tf.Variable(initial_value=10.0, trainable=True)
        self.b = tf.Variable(initial_value=-5.0, trainable=True)

    def call(self, inputs):
        """
        Args:
            inputs: Embeddings of shape
                (speakers_per_batch, utterances_per_speaker, embedding_size).

        Returns:
            The value produced by ``calculate_loss`` for the scaled similarity matrix.
        """
        # Constrain w > 0, to have larger similarity when cosine similarity is larger.
        # The clamp is written back into the variable: a clipped copy that is
        # never used would leave w unconstrained.
        self.w.assign(tf.maximum(self.w, 1e-6))
        centroids = speaker_centroids(inputs)
        ut_centroids = utterance_centroids(inputs)
        coss_sim = similarity_matrix(inputs, centroids, ut_centroids)
        sim_matrix = self.w * coss_sim + self.b
        return calculate_loss(sim_matrix)

Experiment Speech Embedder

In [None]:
# Speech Embedder experiment on synthetic data
number_of_speakers = 12
utterances_per_speaker = 10
data_points = number_of_speakers * utterances_per_speaker  # Total number of utterances
dimension = 13  # Feature dimension of each time step
time_steps = 3  # Number of time steps per utterance
# Seeded generator so the synthetic batch is identical on every run.
rng = np.random.default_rng(0)
data = rng.random((data_points, time_steps, dimension))
data.shape

(120, 3, 13)

In [None]:
# Build the encoder for (time_steps, dimension) inputs and embed the synthetic batch.
# `call` is invoked directly, so `prediction` is the NumPy array returned by
# `predict`: one 256-dimensional row per utterance.
model = SpeechEmbedder(time_steps, dimension)
prediction = model.call(data)
prediction.shape

(120, 256)

We need to reshape the predictions to (number of speakers, number of utterances per speaker, embedding size).

In [None]:
# Group the flat rows by speaker. This assumes the rows are ordered speaker by
# speaker; 256 is the width of the encoder's final Dense layer.
prediction = prediction.reshape((number_of_speakers,utterances_per_speaker,256))
prediction.shape

(12, 10, 256)

Testing speaker centroids

In [None]:
# One normalised centroid per speaker; the utterance axis is kept with size 1.
centroids = speaker_centroids(prediction)
centroids.shape

TensorShape([12, 1, 256])

Testing utterance centroids

In [None]:
# Each utterance gets its own centroid (computed from the speaker's sum minus
# that utterance), so the result has the same shape as `prediction` rather
# than a singleton utterance axis.
ut_centroids = utterance_centroids(prediction)
ut_centroids.shape

TensorShape([12, 10, 256])

In [None]:
# Similarity of every utterance embedding against every speaker centroid:
# (speakers, utterances per speaker, speakers).
coss_sim = similarity_matrix(prediction,centroids,ut_centroids)
coss_sim.shape

TensorShape([12, 10, 12])

Testing with class

In [None]:
# End-to-end check: GE2ELoss recomputes the centroids and the similarity
# matrix internally from the embeddings and returns the loss.
loss_model = GE2ELoss()
loss = loss_model.call(prediction)
loss

300.5792

## Mounting bucket

In [None]:
from google.colab import auth

In [None]:
# Interactive Google sign-in for this Colab session (presumably required
# before the bucket can be mounted below — verify).
auth.authenticate_user()

In [None]:
# Register the gcsfuse apt repository, trust its signing key, and install gcsfuse.
# NOTE(review): the repository is added over plain http and the key is piped
# straight from curl into apt-key — consider https and a pinned key.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  2537  100  2537    0     0  70472      0 --:--:-- --:--:-- --:--:-- 72485
OK
40 packages can be upgraded. Run 'apt list --upgradable' to see them.
gcsfuse is already the newest version (0.35.0).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.


In [None]:
!mkdir DataSet
!gcsfuse autovc_datasets DataSet

mkdir: cannot create directory ‘DataSet’: File exists
2021/04/27 14:06:27.525036 Using mount point: /content/DataSet
2021/04/27 14:06:27.551423 Opening GCS connection...
2021/04/27 14:06:28.477666 Mounting file system "autovc_datasets"...
2021/04/27 14:06:28.519213 File system has been successfully mounted.


## Read data

In [None]:
! pip install -q tensorflow-io

In [None]:
import DataProcessing as dp
import yaml
import importlib
from pathlib import Path
import utils as utils

from utils import parse_spectrograms
import matplotlib.pyplot as plt
import tensorflow as tf
import random

In [None]:
# Pick up edits made to the project modules without restarting the kernel.
# Reloading `utils` does not refresh names bound earlier with
# `from utils import ...`; those still point at the pre-reload objects.
importlib.reload(dp)
importlib.reload(utils)

<module 'utils' from '/content/utils.py'>

In [None]:
# Read the YAML configuration (safe loader) and let DataReader find and load the datasets.
config = yaml.load(Path("config.yml").read_text(), Loader=yaml.SafeLoader)
data_reader = dp.DataReader(config)
data_reader.find_data_sets()
data_reader.load_datasets()
# Dataset stored under the key "103" (presumably a speaker id — TODO confirm).
test_data = data_reader.datasets["103"]
# Only the second of the two records survives the loop. Both `i` and `example`
# remain defined afterwards, and `i` is read again by the next cell.
for i in test_data.take(2):
  example = parse_spectrograms(i)
  
# NOTE(review): a figure is opened here but the imshow call at the end of the
# cell is commented out, so nothing is drawn on it.
plt.figure(figsize=(15,4))
# `example` is indexed by key here, so this cell needs a `parse_spectrograms`
# that returns a mapping with a "mel_spectrogram" entry.
spect = tf.math.log(example["mel_spectrogram"]).numpy()
spect.shape
#plt.imshow(spect, aspect="auto")

{'dataset': {'train-clean-100': 'librispeech/downloads/extracted/TAR_GZ.openslr.org_resource_12_train-clean-1001N3R1aarMDBm8Ulx12juQyeKXyoKpD3HFrDmTsu79uI.tar.gz/LibriSpeech/train-clean-100', 'train-clean-360': 'librispeech/downloads/extracted/TAR_GZ.openslr.org_resource_12_train-clean-360FGpWSWIX6WwUM0oWDfl__-3W4KBOZrnFrw1Avjx5Ls8.tar.gz.incomplete_419af91646134b46a332a6d1f38e0261/LibriSpeech/train-clean-360', 'train-clean-500': 'librispeech/downloads/extracted/TAR_GZ.openslr.org_resource_12_train-other-5003bIvJ_luwWNkXVMhVVnfaqNlFfJuAd1weYGINQrcttI.tar.gz.incomplete_4fa7adaee06441e9b95cf97e93617ee2/LibriSpeech/train-other-500', 'dev-clean': 'librispeech/downloads/extracted/TAR_GZ.openslr.org_resources_12_dev-cleandvh9CQZQYX_KDKyPiLlBbg6_gDUKy5ezQ6hfqQNyirM.tar.gz/LibriSpeech/dev-clean', 'test-clean': 'librispeech/downloads/extracted/TAR_GZ.openslr.org_resources_12_test-cleanOf3lJeWWctxtFVGRmxR49yRDipWqVfh0tXa-IZZ-bCM.tar.gz/LibriSpeech/test-clean', 'vctk': 'vctk/mic1/1.0.0'}, 'write_

(872, 128)

<Figure size 1080x288 with 0 Axes>

In [None]:
# Raw value of the "subset" field of the last record left over from the loop
# in the previous cell (relies on the leaked loop variable `i`).
i["subset"].numpy()

b'\x08\x07\x12\x00B\x05train'

In [None]:
def parse_spectrograms(example):
    """Deserialize the first element of `example` back into a float32 tensor.

    Note: defining this function rebinds the name `parse_spectrograms` that was
    imported from `utils` earlier in the notebook.
    """
    serialized = example.numpy()[0]
    return tf.io.parse_tensor(serialized, out_type=tf.float32)

def create_batches(datasets, number_speakers, number_utterances):
    """
    Draw a random batch of spectrograms: `number_speakers` speakers and, for
    each of them, up to `number_utterances` shuffled utterances.

    Args:
        datasets: Mapping from speaker key to a dataset that supports
            `.shuffle(...).batch(...)` and whose batches expose a
            "mel_spectrogram" entry of serialized tensors.
        number_speakers: How many distinct speakers to sample.
        number_utterances: Batch size requested per speaker.

    Returns:
        Ragged tensor stacking all parsed spectrograms along axis 0, grouped
        speaker by speaker in the order the speakers were sampled.

    NOTE(review): a speaker with fewer than `number_utterances` utterances
    contributes a smaller group, which would break a later reshape to
    (speakers, utterances, ...) — confirm the data guarantees enough utterances.
    """
    # random.sample needs a sequence; passing a dict keys view is deprecated
    # since Python 3.9 and raises TypeError from 3.11 on.
    list_speakers = random.sample(list(datasets), number_speakers)
    spectrograms = []
    for speaker in list_speakers:
        print(speaker)
        utterance_batches = (
            datasets[speaker].shuffle(buffer_size=100).batch(number_utterances)
        )
        # Only the first shuffled batch of each speaker is used.
        first_batch = next(iter(utterance_batches))
        spectrograms.extend(
            parse_spectrograms(serialized)
            for serialized in first_batch["mel_spectrogram"]
        )
    return tf.ragged.stack(spectrograms, axis=0)

1069
5105
6295
260


In [None]:
# NOTE(review): no earlier cell assigns `batch`; this only works after the
# batch-creation cell further down has been executed (hidden kernel state).
batch.shape

TensorShape([12, None, None])

# Testing with real data

In [None]:

number_of_speakers = 4
utterances_per_speaker = 3

batch = create_batches(data_reader.datasets, number_of_speakers, utterances_per_speaker)
# Pad every spectrogram at the end with -1.0 (the encoder's mask value) up to
# the longest one in the batch. Padding the ragged tensor directly avoids going
# through a ragged NumPy object array, which NumPy deprecates.
padded_inputs = batch.to_tensor(default_value=-1.0).numpy()


data_points = padded_inputs.shape[0]  # Number of utterances in the batch
dimension = padded_inputs.shape[2]    # Feature dimension of each time step
time_steps = padded_inputs.shape[1]   # Number of time steps

model = SpeechEmbedder(time_steps, dimension)
prediction = model.call(padded_inputs)


251
2035
6313
1272


  return np.array(rows)


(12, 256)

In [None]:
# Group the flat rows by speaker. This relies on the batch being built speaker
# by speaker; 256 is the width of the encoder's final Dense layer.
prediction = prediction.reshape((number_of_speakers,utterances_per_speaker,256))
prediction.shape


(4, 3, 256)

In [None]:
# Loss of the untrained encoder on the real-data batch, using a freshly
# initialised GE2ELoss (its w and b start from their initial values).
loss_model = GE2ELoss()
loss = loss_model.call(prediction)


In [None]:
# Display the loss computed in the previous cell.
loss

19.466358

# Training the model

In [None]:
def grad(model, embedding, loss_model=None):
    """
    Compute the loss for `embedding` and its gradients with respect to
    `model.trainable_variables`.

    Args:
        model: Object exposing `trainable_variables`.
        embedding: Embeddings of shape
            (speakers_per_batch, utterances_per_speaker, embedding_size).
        loss_model: Loss layer to evaluate. Pass the same instance on every
            step so that its own variables persist between steps; when omitted
            a fresh `GE2ELoss` is created for this call.

    Returns:
        Tuple `(loss_value, gradients)`, with one gradient entry per variable
        in `model.trainable_variables`.

    NOTE(review): only operations executed inside the `with` block are
    recorded. `embedding` is computed by the caller before the tape is opened,
    so the tape holds no link between it and the variables of `model`, and the
    gradients returned for those variables are None. To train the encoder, its
    forward pass must run under the tape using TensorFlow ops.
    """
    if loss_model is None:
        loss_model = GE2ELoss()
    with tf.GradientTape() as tape:
        loss_value = loss_model.call(embedding)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    return loss_value, gradients

In [None]:
epochs = 200
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
for i in range(epochs):
  