# Steps in data preparation

1. Load the audio using librosa
2. Get the duration using librosa.get_duration
3. Calculate each frame width in ms
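4. Split the audio into voiced intervals with VAD (`librosa.effects.split`; anything more than `vad_threshold` dB below the peak is treated as silence — 25 dB in this notebook, 20 dB gave very short intervals)
5. For each voiced interval calculate the log-mel spectrogram and keep `config.tisv_frame` frames (e.g. 180 frames)
6. np.transpose the data to time-major order, Ex: (1,40,180) to (180,1,40), i.e. (batch, n_mels, frames) to (frames, batch, n_mels)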

In [75]:
# All imports
import os, sys, logging
import datetime
import time, shutil, pickle
import librosa
import librosa.display
import pysrt
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import subprocess

from utils import normalize, loss_cal, optim
from tensorflow.contrib import rnn
from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage.filters import gaussian_filter
from configuration import get_config

# Project-wide hyperparameter object; individual fields are overridden in the configuration cell.
config = get_config()
# All log records of this notebook go to a file next to the notebook (absolute path printed below).
log_file = os.path.abspath("data-generation-for-uisrnn.logs")
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

%matplotlib inline

Log path: /datadrive/dalon/diarization-experiments/Notebooks/data-generation-for-uisrnn.logs


# All configurations below:

In [71]:
# Overrides applied on top of the defaults returned by get_config().
random_state = 222 # random seed
config.N = 64 # Number of speakers per batch
config.M = 10 # Number of utterances per speaker
config.iteration = 50000000 # Number of iterations to run
config.lr = 1e-3
config.hidden = 768 # hidden state dimension of lstm
config.proj = 256 # projection dimension of lstm
# config.restore = True
config.model_num = 46
logging.info(f'N={config.N}, M={config.M}')
# NOTE(review): config.restore is read here although the override above is commented out;
# this presumably relies on get_config() providing a default — confirm.
logging.info(f'Model restore: {config.restore}, Model number: {config.model_num}')

# Configurations

#_____________ Parameters to tune on dev set _______________________
# VAD param
# Raised from 20 to 25: 20 produced very short intervals, 25 gives slightly better ones.
vad_threshold = 25 # threshold for voice activity detection (passed as top_db to librosa.effects.split)

# Segment param
acceptable_shortseg_dur = 0.2 # in seconds
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# model parameters
model_path = '/datadrive/dalon/models/m-64-10-768-256/Check_Point/model.ckpt-46' # checkpoint restored by the embedding cells
dataset_path = '/datadrive/dalon/diarizer-dataset/VCTK-Corpus/wav48/' # one sub-folder per speaker
save_dir_path = '/datadrive/dalon/diarizer-dataset/VCTK-Corpus/embeddings' # output directory for the .npy embeddings
# Create the output directory up front; exist_ok makes the cell safe to re-run.
os.makedirs(save_dir_path, exist_ok=True)

# Data Preparation

Only two embeddings are saved per voiced interval of an audio file: one computed from the first `tisv_frame` frames of the interval and one from the last `tisv_frame` frames. Each `.npy`
embedding file therefore has a shape of (2, 256).

In [72]:
# Build the inference graph of the d-vector (speaker embedding) LSTM.
# The graph is rebuilt from scratch so the cell can be re-run in the same kernel.
tf.reset_default_graph()
batch_size = 2 # Fixed to 2: every interval contributes its first and last window (was utter_batch.shape[1])
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
# Concatenating a single tensor is a no-op; `batch` is just `verif`.
batch = tf.concat([verif,], axis=1)

# embedding lstm: config.num_layer stacked cells (3 by default according to the original note — confirm in configuration.py)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last output (final time step) is the embedded d-vector
    embedded = normalize(embedded)                    # normalize (project helper from utils)

print("embedded size: ", embedded.shape)

# Hide all GPUs from this session so inference runs on CPU.
config_tensorflow = tf.ConfigProto(
        device_count = {'GPU': 0}
    )

# Saver over every global variable of the graph; used to restore the trained checkpoint.
saver = tf.train.Saver(var_list=tf.global_variables())

Instructions for updating:
keep_dims is deprecated, use keepdims instead
embedded size:  (2, 256)


In [None]:
# Embed every VCTK utterance with the restored d-vector model.
# For each voiced interval of each utterance one file
# `vctk-<speaker>-<utterance number>-<interval index>.npy` is written; it holds the
# embeddings of the first and the last `config.tisv_frame` spectrogram frames of that
# interval, i.e. an array of shape (2, 256).
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, model_path)

    for speakerid in os.listdir(dataset_path):
        folder = os.path.join(dataset_path, speakerid)
        if not os.path.isdir(folder):
            # Stray files next to the speaker folders would make os.listdir(folder) fail.
            continue
        for audio_file_name in os.listdir(folder):
            audio_path = os.path.join(folder, audio_file_name)
            # Expected name layout: <speaker>_<number>.<ext>
            name_parts = audio_file_name.split('.')[0].split('_')
            if len(name_parts) < 2:
                logging.warning(f'Skipping file with unexpected name: {audio_path}')
                continue
            audio_file_number = name_parts[1]
            utter, sr = librosa.core.load(audio_path, sr=config.sr)        # load audio
            if utter.shape[0] == 0:
                logging.warning(f'Skipping empty audio file: {audio_path}')
                continue
            utter_min_len = (config.tisv_frame * config.hop + config.window) * sr    # lower bound of utterance length (samples)
            # Keyword arguments work on both old and new librosa releases.
            duration = librosa.get_duration(y=utter, sr=sr)
            # Duration of a single sample in seconds
            duration_per_frame = (duration / utter.shape[0])
            # Kept as a notebook-level name: the SRT annotation cell reads it.
            tisv_frame_duration_s = utter_min_len * duration_per_frame
            intervals = librosa.effects.split(utter, top_db=vad_threshold)         # voice activity detection

            # Loop-invariant STFT / mel parameters: computed once per file instead of once per interval.
            win_length = int(config.window * sr)
            hop_length = int(config.hop * sr)
            mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

            for idx, current_interval in enumerate(intervals):
                utter_part = utter[current_interval[0]:current_interval[1]]
                S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                                      win_length=win_length, hop_length=hop_length)
                S = np.abs(S) ** 2
                S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of the interval
                # Batch of two windows: first and last `tisv_frame` frames of the interval.
                utterances_spec = np.array([S[:, :config.tisv_frame],
                                            S[:, -config.tisv_frame:]])
                utter_batch = np.transpose(utterances_spec, axes=(2, 0, 1))     # transpose [frames, batch, n_mels]

                data = sess.run(embedded, feed_dict={verif: utter_batch})
                save_embedding_path = os.path.join(save_dir_path, f'vctk-{speakerid}-{audio_file_number}-{idx}.npy')
                np.save(save_embedding_path, data)

INFO:tensorflow:Restoring parameters from /datadrive/dalon/models/m-64-10-768-256/Check_Point/model.ckpt-46


## structuring the dataset

In [10]:
from collections import defaultdict
# Group the saved embedding files by speaker id.
# File names follow `vctk-<speaker>-<utterance>-<interval>.npy`, so field 1 is the speaker.
list_of_embedding_path = sorted(os.listdir(save_dir_path))
print(f'Total number of files {len(list_of_embedding_path)}')

embedding_dict = defaultdict(list)

for file in list_of_embedding_path:
    embedding_dict[file.split('-')[1]].append(os.path.join(save_dir_path, file))

# Drop speakers with fewer than 5 embedding files.
# Iterate over a snapshot of the keys: popping from a dict while iterating it
# raises "RuntimeError: dictionary changed size during iteration".
for key in list(embedding_dict):
    if len(embedding_dict[key]) < 5:
        embedding_dict.pop(key)
        print(f'Poped {key}')

def shuffle_two(dict_of_two, train_sequence_path, train_ids):
    """Interleave the items of two speakers in randomly sized runs.

    `dict_of_two` maps exactly two labels to lists of items. Alternating between
    the first and the second label, a run of 1..4 items (np.random.randint's upper
    bound of 5 is exclusive) is moved from the front of that label's list to
    `train_sequence_path`, and the label is repeated that many times in `train_ids`.
    Alternation stops as soon as either list is empty or a drawn run is longer than
    what is left; whatever remains is then appended, first label first.

    Both output lists are extended in place, and the consumed items are deleted
    from the lists stored in `dict_of_two`. Nothing is returned.
    """
    max_to_pick = 5
    key1, key2 = dict_of_two.keys()
    ran_short = False
    while dict_of_two[key1] and dict_of_two[key2] and not ran_short:
        for label in (key1, key2):
            pool = dict_of_two[label]
            run_length = np.random.randint(1, max_to_pick)
            if run_length > len(pool):
                # Not enough items left for this run: stop alternating.
                ran_short = True
                break
            train_sequence_path.extend(pool[:run_length])
            train_ids.extend([label] * run_length)
            del pool[:run_length]
    # Flush the leftovers of both labels, keeping their original order.
    for label in (key1, key2):
        remaining = dict_of_two[label]
        train_sequence_path.extend(remaining)
        train_ids.extend([label] * len(remaining))

Total number of files 93417


In [11]:
# Build one long training sequence by consuming the speakers two at a time and
# interleaving each pair with shuffle_two. A single speaker left over at the end
# (odd speaker count) stays in embedding_dict and is not used.
train_sequence_path, train_ids = [], []
while len(embedding_dict) >= 2:
    pair_keys = list(embedding_dict)[:2]
    first2pairs = {speaker: embedding_dict[speaker] for speaker in pair_keys}
    shuffle_two(first2pairs, train_sequence_path, train_ids)
    # Both speakers of the pair are done: remove them from embedding_dict
    for speaker in pair_keys:
        embedding_dict.pop(speaker)
print(len(embedding_dict), len(train_sequence_path), len(train_ids))

0 93417 93417


In [12]:
# train_sequence_path[81234:81239], train_ids[81234:81239]

In [13]:
# Load every embedding file in sequence order and flatten it into per-embedding rows.
# Each row gets the speaker label of the file it came from; the number of labels is
# taken from the loaded array instead of assuming exactly two embeddings per file.
assert len(train_sequence_path) == len(train_ids), 'paths and labels must be aligned'
train_sequence = []
train_cluster_id = []
for embedding_path, speaker_label in zip(train_sequence_path, train_ids):
    embeddings = np.load(embedding_path)
    train_sequence.extend(embeddings.tolist())
    train_cluster_id.extend([speaker_label] * len(embeddings))

In [14]:
len(train_sequence), len(train_cluster_id)

(186834, 186834)

In [15]:
np.savez('/datadrive/dalon/diarizer-dataset/VCTK-Corpus/vctk_training_data.npz',
         train_sequence=train_sequence, train_cluster_id=train_cluster_id)

# Required datastructure for UIS-RNN training


In [110]:
import numpy as np

In [197]:
# train_data = np.load('/datadrive/dalon/diarizer-dataset/VCTK-Corpus/vctk_training_data.npz') #'/datadrive/dalon/uis-rnn/data/training_data.npz')
train_data = np.load('/datadrive/dalon/uis-rnn/data/training_data.npz')
test_data = np.load('/datadrive/dalon/uis-rnn/data/testing_data.npz')

In [198]:
type(train_data)

numpy.lib.npyio.NpzFile

In [173]:
train_data.files

['train_sequence', 'train_cluster_id']

In [54]:
for keys in train_data:
    print(keys, type(train_data[keys]))

train_sequence <class 'numpy.ndarray'>
train_cluster_id <class 'numpy.ndarray'>


In [55]:
train_sequence = train_data['train_sequence']
train_cluster_id = train_data['train_cluster_id']

In [56]:
train_sequence[0].shape

(256,)

In [58]:
train_cluster_id[20:25]

array(['0_0', '0_0', '0_0', '0_0', '0_0'], dtype='<U5')

#### datastructure
data = {'train_sequence': ndarray of embeddings,
        'train_cluster_id': ndarray of labels}

In [64]:
# Analyse test data

In [213]:
# test_data = np.load(test_data_path)

In [214]:
test_data.files

['test_sequences', 'test_cluster_ids']

In [196]:
for (test_sequence, test_cluster_id) in zip(test_data['test_sequences'], test_data['test_cluster_ids']):
    print(not isinstance(test_sequence, np.ndarray))
    print(test_sequence.dtype != float)
    print((not isinstance(test_sequence, np.ndarray) or
        test_sequence.dtype != float))

False
False
False
False
False
False


In [176]:
test_sequence.dtype

dtype('float32')

In [133]:
# test_cluster_ids is an array of lists; each list consists of the labels of one test sequence

In [134]:
# test_sequences is an array of arrays; each inner array consists of the audio embeddings of one test sequence

In [158]:
test_data['test_sequences'][0][0][0]

0.09571809014298133

In [155]:
test_data['test_sequences'][0][0][0]

0.005444454

In [215]:
print(*map(len, test_data['test_cluster_ids']))

125 125 125 125 42 125 125 66


# Prepare test dataset

In [88]:
videoid = 'zPFptdATk_s'
save_utter_label_interval = f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_5min.b'
save_test_data = f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_test.npz' # it will have embeddings and data

In [89]:
# Load the pickled dict for this video; later cells read its 'utter', 'intervals'
# and 'labels_list' entries.
# NOTE(review): pickle.load executes arbitrary code from the file — only load files
# produced by this project.
with open(save_utter_label_interval, 'rb') as f:
    _tmp = pickle.load(f)

In [90]:
# Embed the pre-computed intervals of one test video.
# Every interval contributes two rows to `embeddings`: the d-vectors of its first and
# of its last `config.tisv_frame` spectrogram frames.
embeddings = []
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, model_path)

    utter = _tmp['utter']
    sr = config.sr
    utter_min_len = (config.tisv_frame * config.hop + config.window) * sr    # lower bound of utterance length (samples)
    # Keyword arguments work on both old and new librosa releases.
    duration = librosa.get_duration(y=utter, sr=sr)
    # Duration of a single sample in seconds
    duration_per_frame = (duration / utter.shape[0])
    # Kept as a notebook-level name: the SRT annotation cell reads it.
    tisv_frame_duration_s = utter_min_len * duration_per_frame
    intervals = _tmp['intervals']

    # Loop-invariant STFT / mel parameters, hoisted out of the interval loop.
    win_length = int(config.window * sr)
    hop_length = int(config.hop * sr)
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

    for current_interval in intervals:
        utter_part = utter[current_interval[0]:current_interval[1]]
        S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                              win_length=win_length, hop_length=hop_length)
        S = np.abs(S) ** 2
        S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of the interval
        # Batch of two windows: first and last `tisv_frame` frames of the interval.
        utterances_spec = np.array([S[:, :config.tisv_frame],
                                    S[:, -config.tisv_frame:]])
        utter_batch = np.transpose(utterances_spec, axes=(2, 0, 1))     # transpose [frames, batch, n_mels]
        data = sess.run(embedded, feed_dict={verif: utter_batch})
        embeddings.extend(data)

INFO:tensorflow:Restoring parameters from /datadrive/dalon/models/m-64-10-768-256/Check_Point/model.ckpt-46


In [91]:
embeddings = np.array(embeddings)
embeddings.shape

(542, 256)

In [92]:
# Each interval produced two embeddings (first and last window), so every label is
# repeated twice to stay aligned with the rows of `embeddings`.
test_cluster_ids = [label for label in _tmp['labels_list'] for _ in range(2)]

In [93]:
len(test_cluster_ids)

542

In [94]:
np.savez(save_test_data,
         test_sequences=embeddings,
         test_cluster_ids=test_cluster_ids)

# Create single npz file for testing

In [206]:
# Per-video test archives that get merged into a single testing file.
videoids = ['zPFptdATk_s', 'VqF96Um0HQw']
save_test_files = []
for videoid in videoids:
    save_test_files += [f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_test.npz']

In [207]:
test_sequences = []
test_cluster_ids = []

In [208]:
512//125

4

In [209]:
# convert these into smaller chunks
def split_to_chunks(test_sequences, test_cluster_ids, ids, seqs, max_size=125):
    """Split one long test sequence into consecutive chunks of at most `max_size` items.

    Args:
        test_sequences: list that receives the chunks of `seqs` (extended in place).
        test_cluster_ids: list that receives the matching chunks of `ids` (extended in place).
        ids: labels of the sequence; expected to have the same length as `seqs`.
        seqs: embeddings of the sequence (any sliceable container, e.g. an ndarray).
        max_size: maximum number of items per chunk (default 125).

    Raises:
        ValueError: if `max_size` is smaller than 1.

    Only non-empty chunks are appended: an input whose length is an exact multiple
    of `max_size` (or an empty input) no longer produces a trailing empty chunk.
    """
    if max_size < 1:
        raise ValueError('max_size must be a positive integer')
    for start in range(0, len(ids), max_size):
        stop = start + max_size
        test_sequences.append(seqs[start:stop])
        test_cluster_ids.append(ids[start:stop])

In [210]:
# Merge the per-video archives: sequences longer than 150 embeddings are cut into
# chunks by split_to_chunks, shorter ones are appended as a single sequence.
# Embeddings are converted to float64 in both cases.
for file in save_test_files:
    _tmp = np.load(file)
    cluster_ids = list(_tmp['test_cluster_ids'])
    if len(cluster_ids) <= 150:
        test_sequences.append(np.float64(_tmp['test_sequences']))
        test_cluster_ids.append(cluster_ids)
        continue
    split_to_chunks(test_sequences,
                    test_cluster_ids,
                    cluster_ids,
                    np.float64(_tmp['test_sequences']))

In [211]:
# Convert the lists of chunks to ndarrays so they can be stored with np.savez.
# NOTE(review): when the chunks have different lengths this builds a ragged array;
# recent NumPy releases refuse that unless dtype=object is passed explicitly — confirm
# against the installed NumPy version.
test_sequences = np.array(test_sequences)
test_cluster_ids = np.array(test_cluster_ids)

In [212]:
test_data_path = '/datadrive/dalon/uis-rnn/data/testing_data_custom_2vid.npz'
np.savez(test_data_path,
         test_sequences=test_sequences,
         test_cluster_ids=test_cluster_ids)

## VCTK data prep for UIS-RNN

In [25]:
import os
from glob import glob
from collections import defaultdict

In [22]:

# path to vctk dataset
# There are sub folders for each speaker with wav file inside them
dataset_path = '/datadrive/dalon/diarizer-dataset/VCTK-Corpus/wav48/'


In [27]:
# Map 'vctk-<speaker id>' to the list of that speaker's utterance paths.
speakers_desc = defaultdict(list)
for speakerid in os.listdir(dataset_path):
    speaker_dir = os.path.join(dataset_path, speakerid)
    wav_paths = [os.path.join(speaker_dir, wav_name) for wav_name in os.listdir(speaker_dir)]
    # Empty speaker folders create no entry.
    if wav_paths:
        speakers_desc[f'vctk-{speakerid}'].extend(wav_paths)

In [30]:
# Number of utterances per speaker, in speakers_desc order.
utter_desc = np.array([len(utter_paths) for utter_paths in speakers_desc.values()])

In [31]:
utter_desc.min(), utter_desc.max(), utter_desc.mean()

(172, 503, 405.89908256880733)

# check training dataset

In [216]:
data_path = '/datadrive/dalon/diarizer-dataset/VCTK-Corpus/vctk_training_data.npz'

In [217]:
data = np.load(data_path)

In [222]:
data.files

['train_sequence', 'train_cluster_id']

In [223]:
data['train_cluster_id'].shape

(186834,)

# Spectral clustering

In [17]:
# Build a pairwise cosine-distance matrix `A` from the embeddings in `data`.
# NOTE(review): `data` is presumably a 2-D array with one embedding per row; the last
# visible assignment to `data` is an NpzFile, so this cell relies on earlier kernel
# state — confirm.

# Gram matrix of raw dot products between all pairs of rows
similarity = np.dot(data, data.T)

# squared magnitude of each row vector (the diagonal holds ai*ai)
square_mag = np.diag(similarity)

# inverse squared magnitude
inv_square_mag = 1 / square_mag

# a zero-magnitude row gives inf here; set its inverse magnitude to zero instead
inv_square_mag[np.isinf(inv_square_mag)] = 0

# inverse of the magnitude
inv_mag = np.sqrt(inv_square_mag)

# cosine similarity: scale columns, then (via the transpose) rows, by the inverse magnitudes
cosine = similarity * inv_mag
A =  cosine.T * inv_mag

# Replace each diagonal entry by the largest off-diagonal value of its row:
# first push the diagonal far below any cosine value ...
np.fill_diagonal(A, -1000)
# ... then overwrite it with the row maximum, which now ignores the diagonal
np.fill_diagonal(A, A.max(axis=1))

# map cosine similarity to the cosine distance d = (1 - cos) / 2
# NOTE(review): after this step larger values mean LESS similar; confirm that the
# following refinement steps expect a distance rather than an affinity.
A = (1-A)/2

In [18]:
# Refine the matrix `A`, eigen-decompose it and pick the leading eigenvectors
# using the largest ratio between consecutive eigenvalues (eigen-gap).

# Gaussian blur
sigma = 0.5 # we will select sigma as 0.5
A_gau = gaussian_filter(A, sigma)

# Thresholding using multiplier = 0.01
# NOTE(review): this scales EVERY entry by 0.01; no per-row threshold is applied.
# Confirm whether only the small entries of each row were meant to be scaled.
threshold_multiplier = 0.01
A_thresh = A_gau * threshold_multiplier

# Symmetrization: elementwise maximum of the matrix and its transpose
A_sym = np.maximum(A_thresh, A_thresh.T)

# Diffusion
# NOTE(review): `*` on ndarrays is an elementwise product; if a matrix product
# (A_sym @ A_sym.T) was intended, this line differs from it — confirm.
A_diffusion = A_sym * A_sym.T

# Row-wise matrix Normalization: divide every row by its own maximum
# (Row_max is shaped (1, n); its transpose is the (n, 1) column used for broadcasting)
Row_max = A_diffusion.max(axis=1).reshape(1, A_diffusion.shape[0])
A_norm = A_diffusion / Row_max.T

# Eigen decomposition (general solver; results may be complex for a non-symmetric matrix)
eigval, eigvec = np.linalg.eig(A_norm)
# numpy may return negative (or complex) eigenvalues; keep only their magnitude
eigval = np.abs(eigval)
# indices and values of the eigenvalues in descending order
sorted_eigval_idx = np.argsort(eigval)[::-1]
sorted_eigval = np.sort(eigval)[::-1]

# Shift left by one so that eigval_shifted[i] is the eigenvalue following sorted_eigval[i]
# (the last position wraps around to the first eigenvalue)
eigval_shifted = np.roll(sorted_eigval, -1)
# Discard eigenvalues at or below the threshold; very small ones are attributed to numerical error
eigval_thresh = 0.1
sorted_eigval = sorted_eigval[sorted_eigval > eigval_thresh]
eigval_shifted = eigval_shifted[:sorted_eigval.shape[0]]

# Skip the first eigenvalue: if it is large the ratio below would return k=1, and more than one cluster is wanted.
# argmax is 0-indexed and the first ratio is skipped, hence the +2.
# NOTE(review): if at most one eigenvalue exceeds eigval_thresh the sliced arrays are
# empty and np.argmax raises ValueError.
k = np.argmax(sorted_eigval[1:]/eigval_shifted[1:]) + 2
print(f'Number of Eigen vectors to pick: {k}')

# Columns of eigvec that belong to the k largest eigenvalues
idexes = sorted_eigval_idx[:k]
A_eigvec = eigvec[:, idexes]


Number of Eigen vectors to pick: 4


In [19]:
np.savetxt(embeddings_path, A_eigvec, delimiter='\t')

# K-Means offline clustering
Like in many diarization systems, we integrated the K-Means clustering algorithm with our system. Specifically, we use K-Means++ for initialization. To determine the number of speakers $\tilde{k}$, we use the "elbow" of the derivatives of conditional Mean Squared Cosine Distances (MSCD) between each embedding and its cluster centroid: <br>
$\tilde{k} = \arg\max_{k \geq 1} \mathrm{MSCD}'(k)$ <br>
We define cosine distance as $d(x, y) = (1 - \cos(x, y))/2$

In [20]:
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans

In [21]:
# Cluster the spectral embeddings (rows of A_eigvec) into speakers with K-Means++.
number_of_clusters = 2

# L2-normalize every row so that K-Means compares directions rather than magnitudes.
A_eigvec_norm = sk_normalize(A_eigvec) # l2 normalized
kmeans = KMeans(n_clusters=number_of_clusters, init='k-means++', random_state=random_state)
# Fit on the normalized rows; previously the normalized matrix was computed but the
# raw A_eigvec was passed to fit, leaving the normalization without effect.
kmeans.fit(A_eigvec_norm)
labels = kmeans.labels_

# Add speakers to the srt file

In [22]:
# len(intervals_gt_s), len(labels)

In [23]:
# for index, c in enumerate(labels):
#     print(f'{datetime.timedelta(seconds=intervals_gt_s[index][0])}=={datetime.timedelta(seconds=intervals_gt_s[index][1])}->{c}')

In [24]:
# intervals_gt_s.index([635, 636])

In [25]:
# Prefix every subtitle with the cluster labels of the voiced intervals it overlaps
# and write the annotated subtitles to `save_srt_path`.
subs = pysrt.open(srt_path, encoding="utf-8")


def convert_to_s(st):
    """Convert a subtitle timestamp to whole seconds (milliseconds are ignored)."""
    return (st.hours * 60 * 60) + (st.minutes * 60) + st.seconds


def get_start_and_end(sub):
    """Return the (start, end) of a subtitle in whole seconds."""
    return convert_to_s(sub.start), convert_to_s(sub.end)


for sub in subs:
    start, end = get_start_and_end(sub)
    speakers = []
    for idx, interval in enumerate(intervals_gt_s):
        # The interval is relevant when the subtitle starts or ends inside it,
        # or when the subtitle fully covers it.
        overlaps = (interval[0] <= start <= interval[1]
                    or interval[0] <= end <= interval[1]
                    or (start <= interval[0] and interval[1] <= end))
        if not overlaps:
            continue
        speakers.append(labels[idx])
        # A gap of at least one embedding window before the next interval is
        # recorded as -1 in the speaker list.
        has_next = idx < len(intervals_gt_s) - 1
        if has_next and intervals_gt_s[idx + 1][0] - interval[1] >= tisv_frame_duration_s:
            speakers.append(-1)

    if speakers:
        sp_list = np.unique(speakers)
        speaker_dist = 'Speakers: ' + ', '.join(f'{sp}' for sp in sp_list)
        sub.text = f'{speaker_dist} S:{speakers} T:{end - start} {sub.text}'
subs.save(save_srt_path, encoding='utf-8')
print(save_srt_path)

/datadrive/dalon/diarizer-dataset/srts/outputs/Co4FdrBZgQs--160-0.025-0.01-20-0.2.en.srt


In [26]:
# start <= 635 and 636 <= end

In [27]:
# 635 <= end <= 636

In [28]:
# interval[0], interval[1],start, end

In [29]:
# speakers = []
# for idx, interval in enumerate(intervsals_gt_s):
#     if interval[0] <= start <= interval[1] or interval[0] <= end <= interval[1]:
#         speakers.append(labels[idx])
#     if idx == 120: break
#     print(speakers, interval[0], interval[1],start, end)

In [30]:
# count = 0
# for i,sub in enumerate(subs):
#     if not sub.text.startswith("Speakers"):
#         print(sub.index)
#         count += 1
# print(count, i + 1)