# Steps in data preparation

1. Load the audio using librosa
2. Get the duration using librosa.get_duration
3. Calculate each frame width in ms
4. Split the audio on VAD (Below 20db is silence)
5. For each split calculate mel (180 frames) 
6. np.transpose the data Ex: (1,40,180) to (180,1,40)

In [6]:
# All imports
import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import datetime

import tensorflow as tf
import time
from utils import normalize, loss_cal, optim
from tensorflow.contrib import rnn

%matplotlib inline

Namespace(M=5, N=4, beta1=0.5, beta2=0.9, comment='', hidden=128, hop=0.01, iteration=100000, loss='softmax', lr=0.01, model_num=6, model_path='./tisv_model', nfft=512, noise_filenum=16, noise_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/noise', num_layer=3, optim='sgd', proj=64, restore=False, sr=8000, tdsv=False, tdsv_frame=80, test_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test', tisv_frame=180, train=False, train_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/train', window=0.025)


In [7]:
# Configurations
tisv_frame = 100 # max frame number of utterances of tdsv
hop = 0.01 # 10ms This is frame level precision we will get
window = 0.025 # 25ms
sr = 8000 # sampling rate
nfft = 512 # ft kernel size

In [8]:
# audio_path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/zPFptdATk_s_2min.wav'
# audio_path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/zPFptdATk_s.wav'
audio_path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/e9TC12UQ8og.wav'
#audio_path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/wav48/p259/p259_008.wav'

In [9]:
utter_min_len = (tisv_frame * hop + window) * sr    # lower bound of utterance length

In [10]:
utter, sr = librosa.core.load(audio_path, sr)        # load audio

In [11]:
# Get the duration
duration = librosa.get_duration(utter, sr)
# Duration of each window
duration_per_frame = (duration / utter.shape[0])

In [12]:
print(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s')

Duration: 2353.15675
Duration per frame: 0.000125s
Min length of utterance: 1.025s


In [8]:
tisv_frame_duration_s = utter_min_len * duration_per_frame

In [9]:
intervals = librosa.effects.split(utter, top_db=20)         # voice activity detection

In [10]:
intervals_in_s = [[round(block[0] * duration_per_frame, 3), round(block[1] * duration_per_frame, 3)] for block in intervals]

In [11]:
utter[2050], utter[2049], utter[2048], tisv_frame_duration_s

(0.004544247, 0.0031394924, 0.0032979664, 1.025)

In [12]:
# plt.figure()
# plt.subplot(3, 1, 1)
# for interval in intervals:
#     librosa.display.waveplot(utter[interval[0]:interval[1]], sr=sr)
#     plt.show()
# plt.title('Monophonic')

In [13]:
intervals_in_s[:10]

[[6.592, 9.216],
 [10.048, 10.368],
 [10.944, 11.264],
 [11.968, 12.48],
 [14.016, 14.72],
 [15.168, 15.744],
 [17.984, 18.24],
 [18.496, 19.584],
 [19.712, 24.832],
 [25.024, 25.536]]

In [14]:
utterances_spec = []
intervals_gt_s = []
for index, interval in enumerate(intervals):
    if (interval[1]-interval[0]) > utter_min_len:           # If partial utterance is sufficient long,
#         intervals_gt_s.append(intervals_in_s[index])
        utter_part = utter[interval[0]:interval[1]]         # save first and last 180 frames of spectrogram.
        S = librosa.core.stft(y=utter_part, n_fft=nfft,
                              win_length=int(window * sr), hop_length=int(hop * sr))
        S = np.abs(S) ** 2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=nfft, n_mels=40)
        S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances
        
        prev_tisv_frame = 0
        prev_start = intervals_in_s[index][0]
        for i in range(1, S.shape[1]//tisv_frame + 1):
#             print(f'{i},{prev_tisv_frame},{tisv_frame * i}')
            utterances_spec.append(S[:, prev_tisv_frame:tisv_frame * i])
            intervals_gt_s.append([prev_start, prev_start + tisv_frame_duration_s])
#             print(prev_start, prev_start + tisv_frame_duration_s)
            prev_start = prev_start + tisv_frame_duration_s
            prev_tisv_frame = tisv_frame * i
#         utterances_spec.append(S[:, :tisv_frame])    # first 180 frames of partial utterance
#         utterances_spec.append(S[:, -tisv_frame:])   # last 180 frames of partial utterance
#     else:
#         print("skipping interval", interval)
    
utterances_spec = np.array(utterances_spec)
print(utterances_spec.shape)

(426, 40, 100)


# trying to get frames for every interval

In [15]:
# utter_start = 0
# utter_num = M

# utter_batch = []

# utters = utterances_spec        # load utterance spectrogram of selected speaker

# utter_batch.append(utters[utter_start: utter_start+utter_num])

# utter_batch = np.concatenate(utter_batch, axis=0)     # utterance batch [batch(NM), n_mels, frames]

# utter_batch = utter_batch[:,:,:160]               # for test session, fixed length slicing of input batch

utter_batch = np.transpose(utterances_spec, axes=(2,0,1))     # transpose [frames, batch, n_mels]

In [16]:
utter_batch.shape

(100, 426, 40)

In [17]:
hidden = 128
proj = 64
num_layer =3
path = '/datadrive2/dalon/diarization-experiments/Speaker_Verification/tisv_model_4-5-25thSep-bck'
model_num = 5

tf.reset_default_graph()
batch_size = utter_batch.shape[1]

# draw graph
# enroll is ground truth 
# verif is the actual output
#enroll = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32) # enrollment batch (time x batch x n_mel)
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
batch = tf.concat([verif,], axis=1)

# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=hidden, num_proj=proj) for i in range(num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last ouput is the embedded d-vector
    embedded = normalize(embedded)                    # normalize

print("embedded size: ", embedded.shape)

# enrollment embedded vectors (speaker model)
# enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:batch_size, :], shape= [config.N, config.M, -1]), axis=1))
# verification embedded vectors
# verif_embed = embedded[config.N*config.M:, :]

# similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

saver = tf.train.Saver(var_list=tf.global_variables())
with tf.Session() as sess:
    tf.global_variables_initializer().run()

    # load model
    print("model path :", path)
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
    ckpt_list = ckpt.all_model_checkpoint_paths
    loaded = 0
    for model in ckpt_list:
        if model_num == int(model[-1]):    # find ckpt file which matches configuration model number
            print("ckpt file is loaded !", model)
            loaded = 1
            saver.restore(sess, model)  # restore variables from selected ckpt file
            break

    if loaded == 0:
        raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")


    # return similarity matrix after enrollment and verification
    time1 = time.time() # for check inference time
#     if config.tdsv:
#         S = sess.run(similarity_matrix, feed_dict={enroll:random_batch(shuffle=False, noise_filenum=1),
#                                                    verif:random_batch(shuffle=False, noise_filenum=2)})
#     else:
    S = sess.run(embedded, feed_dict={verif:utter_batch})
    
    time2 = time.time()

#     print(S)    # print similarity matrix


Instructions for updating:
keep_dims is deprecated, use keepdims instead
embedded size:  (426, 64)
model path : /datadrive2/dalon/diarization-experiments/Speaker_Verification/tisv_model_4-5-25thSep-bck
ckpt file is loaded ! /datadrive2/dalon/diarization-experiments/Speaker_Verification/tisv_model_4-5-25thSep-bck/Check_Point/model.ckpt-5
INFO:tensorflow:Restoring parameters from /datadrive2/dalon/diarization-experiments/Speaker_Verification/tisv_model_4-5-25thSep-bck/Check_Point/model.ckpt-5


In [18]:
S.shape

(426, 64)

# clustering

In [39]:
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering, AgglomerativeClustering, DBSCAN

In [40]:
# k_means = AgglomerativeClustering().fit(S)

In [41]:
k_means = DBSCAN(eps=3, min_samples=2).fit(S)

In [31]:
# k_means = MiniBatchKMeans(n_clusters=2,
#         random_state=0,
#         batch_size=6).fit(S[11:])

In [32]:
# k_means = SpectralClustering(n_clusters=2,
#        assign_labels="discretize",
#        random_state=0).fit(S)

In [33]:
# k_means = KMeans(n_clusters = 2)
# k_means.fit(S)

In [42]:
cls = k_means.labels_ 

In [43]:
cls.shape

(426,)

In [44]:
# cls

In [45]:
len(intervals_gt_s)

426

In [46]:
for index, c in enumerate(cls):
    print(f'{datetime.timedelta(seconds=intervals_gt_s[index][0])}=={datetime.timedelta(seconds=intervals_gt_s[index][1])}->{c}')

0:00:06.592000==0:00:07.617000->0
0:00:07.617000==0:00:08.642000->0
0:00:18.496000==0:00:19.521000->0
0:00:19.712000==0:00:20.737000->0
0:00:20.737000==0:00:21.762000->0
0:00:21.762000==0:00:22.787000->0
0:00:22.787000==0:00:23.812000->0
0:00:23.812000==0:00:24.837000->0
0:00:26.752000==0:00:27.777000->0
0:00:27.777000==0:00:28.802000->0
0:00:31.616000==0:00:32.641000->0
0:00:32.641000==0:00:33.666000->0
0:00:33.666000==0:00:34.691000->0
0:00:35.584000==0:00:36.609000->0
0:00:36.609000==0:00:37.634000->0
0:00:40==0:00:41.025000->0
0:00:41.025000==0:00:42.050000->0
0:00:42.944000==0:00:43.969000->0
0:00:43.969000==0:00:44.994000->0
0:00:46.080000==0:00:47.105000->0
0:00:49.280000==0:00:50.305000->0
0:00:58.496000==0:00:59.521000->0
0:01:05.536000==0:01:06.561000->0
0:01:06.561000==0:01:07.586000->0
0:01:07.586000==0:01:08.611000->0
0:01:09.440000==0:01:10.465000->0
0:01:14.624000==0:01:15.649000->0
0:01:15.649000==0:01:16.674000->0
0:01:16.928000==0:01:17.953000->0
0:01:18.656000==0:01:

In [28]:
# import IPython
# IPython.display.Audio(audio_path)

In [24]:
# just test config
# Configurations
tisv_frame = 2 # max frame number of utterances of tdsv
hop = 0.12 # 10ms This is frame level precision we will get
window = 0.24 # 25ms
sr = 8000 # sampling rate
nfft = 512 # ft kernel size
duration_per_frame = 0.000125 # in s
utter_min_len = (tisv_frame * hop + window) * sr    # lower bound of utterance length
print(f'Min length of utterance: {utter_min_len * duration_per_frame}s')

Min length of utterance: 0.48s
