# Steps in data preparation

1. Load the audio using librosa
2. Get the duration using librosa.get_duration
3. Calculate each frame width in ms
4. Split the audio on VAD (Below 20db is silence)
5. For each split calculate mel (180 frames) 
6. np.transpose the data Ex: (1,40,180) to (180,1,40)

In [19]:
# All imports
import os, sys, logging
import datetime, json
import time, shutil, pickle
import librosa
import librosa.display
import pysrt
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import subprocess

from utils import normalize, loss_cal, optim
from tensorflow.contrib import rnn
from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage.filters import gaussian_filter
from configuration import get_config

config = get_config()
log_file = os.path.abspath("full-pipeline.logs")
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

%matplotlib inline

Log path: /datadrive2/dalon/diarization-experiments/diarization-experiments/Notebooks/full-pipeline.logs


# All configurations below:

In [21]:
videoid = 'GkOn86EtdNQ' #'YdU7fUXDLpI' #'zPFptdATk_s'# 'cKAnHAHBonM' # 'e-Pjs7UyC8I'


random_state = 222 # random seed
config.N = 64 # Number of speakers per batch
config.M = 10 # Number of utterences per speaker
config.iteration = 50000000 # Number of iterations to run
config.lr = 1e-3
config.hidden = 768 # hidden state dimension of lstm
config.proj = 256 # projection dimension of lstm
config.tisv_frame_min = 50

# config.restore = True
config.model_num = 46
logging.info(f'N={config.N}, M={config.M}')
logging.info(f'Model restore: {config.restore}, Model number: {config.model_num}')

# Configurations

#_____________ Parameters to tune on dev set _______________________
# VAD param
# Changing to 25, which will give slightly better intervals, 20 gives very short intervals
vad_threshold = 25 # threshold for voice activity detection

# Segment param
acceptable_shortseg_dur = 0.2 # in second
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# model parameters
model_path = '/datadrive2/dalon/diarization-experiments/diarization-experiments/models/model.ckpt-46' # model save path
dataset_path = '/datadrive2/dalon/diarization-experiments/diarization-experiments/audio/'
save_dir_path = '/datadrive2/dalon/diarization-experiments/diarization-experiments/embeddings'
os.makedirs(save_dir_path, exist_ok=True)
audio_file_name = f'{videoid}.wav'
output_cluster_path = os.path.join(save_dir_path, f'{videoid}.json')

# Data Preparation

I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
embedding file will have a shape of (2, 256)

In [3]:
tf.reset_default_graph()
batch_size = 2 # Fixing to 2 since we take 2 for each interval #utter_batch.shape[1]
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
batch = tf.concat([verif,], axis=1)

# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last ouput is the embedded d-vector
    embedded = normalize(embedded)                    # normalize

print("embedded size: ", embedded.shape)

config_tensorflow = tf.ConfigProto(
        device_count = {'GPU': 0}
    )

saver = tf.train.Saver(var_list=tf.global_variables())

Instructions for updating:
keep_dims is deprecated, use keepdims instead
embedded size:  (2, 256)


In [4]:
# Each embedding saved file will have (2, 256)
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, model_path)
    logging.info("loading audio")
    audio_path = os.path.join(dataset_path, audio_file_name)
#     audio_file_number = audio_file_name.split('.')[0].split('_')[1]
    utter, sr = librosa.core.load(audio_path, sr=config.sr)        # load audio
    utter_min_len = (config.tisv_frame_min * config.hop + config.window) * sr    # lower bound of utterance length
    # Get the duration
    duration = librosa.get_duration(utter, sr)
    # Duration of each window
    duration_per_frame = (duration / utter.shape[0])
    logging.info(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s')
    tisv_frame_duration_s = utter_min_len * duration_per_frame
    intervals = librosa.effects.split(utter, top_db=vad_threshold)         # voice activity detection

    all_data = []
    logging.info('Converting intervals to embeddings')
    selected_intervals_idx = []
    for idx, current_interval in enumerate(intervals):
        if (current_interval[1]-current_interval[0]) > utter_min_len:
            # Save these selected intervals, as shorter ones are ignored
            selected_intervals_idx.append(idx)
            utterances_spec = []
            utter_part = utter[current_interval[0]:current_interval[1]]         # save first and last 160 frames of spectrogram.
            S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                                  win_length=int(config.window * sr), hop_length=int(config.hop * sr))
            S = np.abs(S) ** 2
            mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
            S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances
            utterances_spec.append(S[:, :config.tisv_frame])
            utterances_spec.append(S[:, -config.tisv_frame:])
            utterances_spec = np.array(utterances_spec)
            utter_batch = np.transpose(utterances_spec, axes=(2,0,1))     # transpose [frames, batch, n_mels]

            data = sess.run(embedded, feed_dict={verif:utter_batch})
            all_data.extend(data)


INFO:tensorflow:Restoring parameters from /datadrive2/dalon/diarization-experiments/diarization-experiments/models/model.ckpt-46


In [5]:
data = np.array(all_data)

# Spectral clustering

In [6]:
# cossine similarity
similarity = np.dot(data, data.T)

# squared magnitude of preference vectors (number of occurrences) (diagonals are ai*ai)
square_mag = np.diag(similarity)

# inverse squared magnitude
inv_square_mag = 1 / square_mag


# if it doesn't occur, set it's inverse magnitude to zero (instead of inf)
inv_square_mag[np.isinf(inv_square_mag)] = 0

# inverse of the magnitude
inv_mag = np.sqrt(inv_square_mag)

# cosine similarity (elementwise multiply by inverse magnitudes)
cosine = similarity * inv_mag
A =  cosine.T * inv_mag

# Fill the diagonals with very large negative value
np.fill_diagonal(A, -1000)
# Fill the diagonals with the max of each row
np.fill_diagonal(A, A.max(axis=1))

# final step in cossine sim
A = (1-A)/2

In [7]:
# Gaussian blur
sigma = 0.5 # we will select sigma as 0.5
A_gau = gaussian_filter(A, sigma)

# Thresholding using multiplier = 0.01
threshold_multiplier = 0.01
A_thresh = A_gau * threshold_multiplier

# Symmetrization
A_sym = np.maximum(A_thresh, A_thresh.T)

# Diffusion
A_diffusion = A_sym * A_sym.T

# Row-wise matrix Normalization
Row_max = A_diffusion.max(axis=1).reshape(1, A_diffusion.shape[0])
A_norm = A_diffusion / Row_max.T

# Eigen decomposition
eigval, eigvec = np.linalg.eig(A_norm)
# Since eigen values cannot be negative for Positive semi definite matrix, the numpy returns negative values, converting it to positive
eigval = np.abs(eigval)
# reordering eigen values
sorted_eigval_idx = np.argsort(eigval)[::-1]
sorted_eigval = np.sort(eigval)[::-1]

# For division according to the equation
eigval_shifted = np.roll(sorted_eigval, -1)
# Thresholding eigen values because we don't need very low eigan values due to errors
eigval_thresh = 0.1
sorted_eigval = sorted_eigval[sorted_eigval > eigval_thresh]
eigval_shifted = eigval_shifted[:sorted_eigval.shape[0]]

# Don't take the first value for calculations, if first value is large, following equation will return k=1, and we want more than one clusters
# Get the argmax of the division, since its 0 indexed, add 1
k = np.argmax(sorted_eigval[1:]/eigval_shifted[1:]) + 2
print(f'Number of Eigen vectors to pick: {k}')

# Get the indexes of eigen vectors
idexes = sorted_eigval_idx[:k]
A_eigvec = eigvec[:, idexes]


Number of Eigen vectors to pick: 2


In [8]:
A_eigvec[:1]

array([[-0.03644929, -0.01817526]], dtype=float32)

In [9]:
A_eigvec = A_eigvec.astype('float32')

In [10]:
embeddings_path = os.path.join(save_dir_path, f'{videoid}.spectral.csv')
np.savetxt(embeddings_path, A_eigvec, delimiter='\t')

# K-Means offline clustering
Like in many diarization systems, we integrated the K-Means clustering algorithm with our system. Specifically, we use K-Means++ for initialization. To determine the number of speakers $k$,  we  use  the  “elbow”  of  the  derivatives  of  conditional  Mean Squared Cosine Distances 1 (MSCD) between each embedding to its cluster centroid: <br>
$k = arg max_{\substack{k \geq 1}} MSCD(k)$ <br>
We define cosine distance as $d(x, y) =(1−cos(x, y))/2$

In [11]:
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans

In [12]:
number_of_clusters = 2

A_eigvec_norm = sk_normalize(A_eigvec) # l2 normalized
kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=random_state)
kmeans.fit(A_eigvec)
labels = kmeans.labels_

In [13]:
len(labels)

2626

In [14]:
labels[:10]

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

# Create a diarization files

In [15]:
# Use selected intervals

In [16]:
len(selected_intervals_idx)

1313

In [17]:
intervals[selected_intervals_idx[0]]

array([  512, 34816])

In [26]:
json_data = []
# with open(output_cluster_path, 'w') as f:
for idx, i in enumerate(selected_intervals_idx):
    start = str(datetime.timedelta(seconds = intervals[i][0] * duration_per_frame))
    end = str(datetime.timedelta(seconds = intervals[i][1] * duration_per_frame))
    speaker = labels[idx*2]
    if labels[idx*2] != labels[(idx*2)+1]:
        speaker = 'OL' # possible overlap
    json_data.append({
        'start': start,
        'end': end,
        'speaker': str(speaker)
    })

In [27]:
with open(output_cluster_path, 'w') as f:
    json.dump(json_data, f, indent=4)