<a href="https://colab.research.google.com/github/cs16b108/IB_MDDP/blob/master/IB_Diarization_14122020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install webrtcvad
!pip install hmmlearn==0.2.3
!pip install python_speech_features

In [3]:
import os
from os.path import isfile, isdir, join
from pathlib import Path
from sklearn import mixture
import math
import numpy as np
import random
from scipy.stats import multivariate_normal
from matplotlib import pyplot as plt
from scipy.spatial import distance
import scipy.io.wavfile as wav
from python_speech_features import mfcc
import pickle
from hmmlearn import hmm
import time
import copy

In [32]:
#File name and directory path
fileName = 'ES2002c'
drivePath = '/content/drive/My Drive/IB_Diarization/'

In [33]:
#######################################################
############# Read WAV and perform VAD ################
# Already done once and files stored on google drive ##
#######################################################

import collections
import contextlib
import sys
import wave
import webrtcvad


def read_wave(path):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate


def write_wave(path, audio, sample_rate):
    """Writes a .wav file.
    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames,vuv_frames):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    index=-1
    voiced_frames = []
    for frame in frames:

        index+=1
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.

                id = 0 #we must start actually with num_padding_frames-1 and do index-- 
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                    vuv_frames[index-id]=1
                    id+=1

                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            vuv_frames[index] = 1
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


In [None]:
def main():
    # if len(args) != 2:
    #     sys.stderr.write(
    #         'Usage: silenceremove.py <aggressiveness> <path to wav file>\n')
    #     sys.exit(1)
    # path = '/content/amicorpus/'
    path = join(drivePath, 'temp/'+fileName)
    dir_list = sorted(os.listdir(path))
    cnt = 0
    for d in dir_list:
      dir_name = join(path,d)
      if not isdir(dir_name):
        # filePath = join(path, d, 'audio', d+'.Mix-Headset.wav')
        filePath = join(path, d)
        # try:
        # if 1:
        audio, sample_rate = read_wave(filePath)
        vad = webrtcvad.Vad(int(1))
        frame_duration_ms = 30
        frames = frame_generator(30, audio, sample_rate)
        frames = list(frames)

        nof_frames = 1+(len(audio)-1)/int(sample_rate * (frame_duration_ms / 1000.0) * 2)
        nof_frames = int(nof_frames)
        vuv_frames = np.zeros((nof_frames,)).astype('int')
        segments = vad_collector(sample_rate, 30, 300, vad, frames,vuv_frames)
        print(len(frames)," s", sample_rate,len(audio))

        # Segmenting the Voice audio and save it in list as bytes
        concataudio = [segment for segment in segments]

        joinedaudio = b"".join(concataudio)
        np.save(join(drivePath,'save_files', fileName+'_vuv.npy'),vuv_frames)
        # writePath = join('/content/drive/My Drive/IB_Diarization/temp/conv', d)
        # Path(writePath).mkdir(parents=True, exist_ok=True)
        # write_wave(writePath, joinedaudio, sample_rate)
        cnt += 1
        if(cnt == 2):
          break
        # except Exception as inst:
          # print("Skipping: ", filePath, inst)
    print("Converted: ",cnt)
if __name__ == '__main__':
    main()

In [36]:
vuv_frames = np.load(join(drivePath,'save_files', fileName+'_vuv.npy'))

In [None]:
####################################
### Actual Code Starts from Here ###
####################################

In [72]:
#Define path to wav files created after VAD

path = join(drivePath, 'amicorpus_non_silence/')

overlap = 0.01 #10 ms window shift
fullPath = join(path,fileName+'/audio/'+fileName+'.Mix-Headset.wav')
(rate,sig) = wav.read(fullPath)
mfcc_feat = mfcc(sig, rate, numcep = 19, nfilt = 26, winlen=0.03, winstep=overlap)

In [70]:
#Also read the original wav file
path = join(drivePath, 'temp/'+fileName)
fullPath = join(path,fileName+'.Mix-Headset.wav')
(rate,sig) = wav.read(fullPath)
mfcc_feat_original = mfcc(sig, rate, numcep = 19, nfilt = 26, winlen=0.03, winstep=overlap)

In [39]:
##n is total num of frames and d is num of features per frame
n, d = mfcc_feat.shape

In [41]:
# overlap = 0.01 #10 ms window shift
init_cluster_time = 2500 #2.5sec
init_cluster_len = math.ceil(init_cluster_time/(overlap*1000))

In [42]:
N = math.ceil(n/init_cluster_len)
num_of_clusters = N

In [43]:
print(N,n,init_cluster_len, (n/init_cluster_len) )

879 219619 250 878.476


In [None]:
# class GMM:
#     def __init__(self, num_of_clusters):
#         self.num_of_clusters = num_of_clusters
#         self.log_likelihood =[]
#         self.LL_diff = []
#         # self.num_of_speakers = num_of_speakers

#     def gaussian_prob(self, x, mean, sigma):
#         d = x.shape[0]
#         p = ((2*math.pi)**(-d/2))*(np.linalg.det(sigma)**(-0.5))*np.exp(-0.5*(x-mean).reshape(d,1).T.dot(np.linalg.inv(sigma)).dot((x-mean).reshape(d,1)))
#         return p

#     def k_means(self, X):
#         n = X.shape[0]
#         d = X.shape[1]
#         itr = 0
#         #self.centroid = np.zeros((self.num_of_clusters, d), dtype = 'float64')
#         self.centroids = X[random.sample(range(n), self.num_of_clusters)]
#         self.cluster_assigned = np.zeros(n, dtype = int)
#         error = 0.0
#         while True:
#             print("Now at itr - ", itr)
#             # print("Centroids - ", self.centroids)
#             for i in range(n):
#                 f_vec = X[i]
#                 dist = np.sqrt(np.sum((f_vec-self.centroids)**2, 1))
#                 # print("Dist Shape is - ", dist.shape)
#                 self.cluster_assigned[i] = np.argmin(dist)
#             new_error = np.sum(np.sqrt(np.sum((X - self.centroids[self.cluster_assigned])**2, 1)))
#             if(itr>0):
#                 print("Error Difference is - ", np.abs(error-new_error))
#             new_centroids = np.zeros((self.num_of_clusters, d), dtype = 'float64')
#             count_of_elements = np.zeros(self.num_of_clusters, dtype = int)
#             for i in range(n):
#                 c_ind = self.cluster_assigned[i]
#                 new_centroids[c_ind] += X[i]
#                 count_of_elements[c_ind] += 1
#             new_centroids = new_centroids/count_of_elements[:,None]
#             if np.abs(new_error-error)<10 or np.array_equal(self.centroids, new_centroids) or itr>=5:
#                 print("Breaking at itr - ", itr)
#                 break
#             else:
#                 self.centroids = np.copy(new_centroids)
#             itr += 1
#             error = new_error

#     def EM_GMM_INBUILT(self, X):
#         N = X.shape[0]
#         d = X.shape[1]
#         from sklearn.mixture import GaussianMixture as GMM
#         g = GMM(n_components=64, covariance_type = 'full', max_iter = 1)
#         g.fit(X)
#         print("Created")

#     def EM_GMM(self, X):
#         N = X.shape[0]
#         d = X.shape[1]
#         self.cov_mat = np.zeros((self.num_of_clusters, d, d), dtype = 'float64')
#         self.gamma = np.zeros((N,self.num_of_clusters), dtype = 'float64')
#         likelihood = np.zeros((N,self.num_of_clusters), dtype = 'float64')
#         self.pi_prob = np.zeros(self.num_of_clusters, dtype = 'float64')
#         self.Nk = np.zeros(self.num_of_clusters, dtype = 'float64')
#         for k in range(self.num_of_clusters):
#             indices = (np.argwhere(self.cluster_assigned==k)).ravel()
#             X_k = X[indices]
#             X_k_centered = X_k - self.centroids[k]
#             self.Nk[k] = X_k.shape[0]
#             # print("Xk ",X_k.shape)
#             # print("Xkc ",X_k_centered.shape)
#             # print("cov mat ",self.cov_mat[k])
#             self.cov_mat[k] = (1/self.Nk[k])*(X_k_centered.T.dot(X_k_centered))
#         # print(self.Nk)
#         self.pi_prob = self.Nk/N
#         print("EM Begins")
#         itr = 1
#         prev_log_likelihood = 0.0
        
#         while True:
#             #####################################
#             ############   E Step   #############
#             #####################################
#             for k in range(self.num_of_clusters):
#                 #self.gamma[i,k] = self.gaussian_prob(X[i], self.centroids[k], self.cov_mat[k])
#                 self.cov_mat[k] += 1e-6*np.identity(d)
#                 likelihood[:,k] =  multivariate_normal.pdf(X, self.centroids[k], self.cov_mat[k]).ravel()
#                 # print("Done ", k)
#             # log_likelihood = np.sum(np.sum((likelihood*self.pi_prob), axis = 1))

#             # for i in range(N):
#             #     print("Done ",i)
             
#             self.gamma = likelihood*self.pi_prob
#             self.gamma = self.gamma/(np.sum(self.gamma, axis = 1)[:,None])
#             # print("E done")

#             #####################################
#             ############   M Step   #############
#             #####################################
#             self.Nk = np.sum(self.gamma, axis = 0)
#             self.pi_prob = self.Nk/N
#             for k in range(self.num_of_clusters):
#                 self.centroids[k] = (1/self.Nk[k])*np.sum((X*self.gamma[:,k][:,np.newaxis]), axis = 0)
#                 X_centered = X - self.centroids[k]
#                 self.cov_mat[k] = (1/self.Nk[k])*((X_centered*self.gamma[:,k][:,np.newaxis]).T.dot(X_centered))
#             # print("M done")

#             #####################################
#             ########   Log Likelihood   #########
#             #####################################
#             new_log_likelihood = np.sum(np.log(np.sum((likelihood*self.pi_prob), axis = 1)))
#             self.log_likelihood.append(new_log_likelihood)
#             diff_LL = np.abs(new_log_likelihood-prev_log_likelihood)
#             self.LL_diff.append(diff_LL)
#             print("Itr = ", itr, " Current LL is - ",new_log_likelihood)
#             print("Change In LL is - ",diff_LL)
#             if(diff_LL<100 or itr>=10):
#                 print("EM Finished at iteration - ", itr)
#                 break
#             itr += 1
#             prev_log_likelihood = new_log_likelihood

# ug = GMM(num_of_clusters)
# ug.k_means(mfcc_feat)
# ug.EM_GMM(mfcc_feat)

In [44]:
def fitUnimodal(C):
  means = []
  covMatrices = []
  for c in C:
    means.append(np.mean(c, axis = 0))
    covMatrices.append(np.cov(c.T))
  return means, covMatrices

In [45]:
def calc_prob(x, GaussianMeans, GaussianCovMatrices):
  p = 0.0
  D = x.shape[0]
  numOfClusters = len(GaussianMeans)
  for i in range(D):
    s = x[i]
    for k in range(numOfClusters):
    #self.gamma[i,k] = self.gaussian_prob(X[i], self.centroids[k], self.cov_mat[k])
      cov_matrix = 1e-6*np.identity(d) + GaussianCovMatrices[k]
      # cov_matrix = 
      p =  p + ug.pi_prob[k]*multivariate_normal.pdf(s, ug.centroids[k], cov_matrix)
  p = p/D
  return p

In [46]:
def calcYgivenX(x, GaussianMeans, GaussianCovMatrices, i):
  p = 0.0
  numOfClusters = len(GaussianMeans)
  w = 1.0/numOfClusters
  D = x.shape[0]
  probMat = np.zeros((D, num_of_clusters), dtype = float)
  for i in range(num_of_clusters):
    probMat[:,i] = multivariate_normal(x, GaussianMeans[i], GaussianCovMatrices[i])
  p = 0.0
  self.gamma = self.gamma/(np.sum(self.gamma, axis = 1)[:,None]) 
  return p

In [47]:
########################
##### IB Algorithm #####
########################

#Init Variables
C = np.array_split(mfcc_feat, num_of_clusters)
GaussianMeans, GaussianCovMatrices = fitUnimodal(C)
ClusterMapping = dict(zip(range(num_of_clusters), [[i] for i in range(num_of_clusters)]))
beta = 10.0

In [29]:
# probC = []
# for i in range(N):
#   p = 0.0
#   D = C[i].shape[0]
#   for j in range(D):
#     s = C[i][j]
#     p += multivariate_normal.pdf(s, GaussianMeans[i], GaussianCovMatrices[i])
#   p = p/D 
#   probC.append(p)

In [48]:
probC = (1.0/N)*np.ones(N)
probX = probC.copy()

In [49]:
probYgivenC = []
probCgivenX = []
for i in range(N):
  temp1 = []
  temp2 = []
  x = C[i]
  w = 1.0/N
  D = x.shape[0]
  probMat = np.zeros((D, N), dtype = float)
  for j in range(N):
    probMat[:,j] = multivariate_normal.pdf(x, GaussianMeans[j], GaussianCovMatrices[j]).ravel()
  probMat = probMat/(np.sum(probMat, axis = 1)[:,None])
  temp1 = np.mean(probMat, axis = 0)
  for j in range(N):
    # p = probMat[i,j]/(np.sum(probMat[i,j], axis = 1)[:,None])
    # p = calcYgivenX(x, GaussianMeans, GaussianCovMatrices, i)
    # temp1.append(p)
    if j == i:
      temp2.append(1.0)
    else:
      temp2.append(0.0)
    # print("Done2 ",j)
  probYgivenC.append(temp1)
  probCgivenX.append(temp2)
  if i%100 == 0:
    print("Done ",i)

# # prob_cond_y_c = np.zeros((N, N), dtype = float)
# # prob_cond_c_x = np.zeros((N, N), dtype = float)
# del_F = np.zeros((N, N), dtype = float)
# for i in range(N):
#   prob_c(i) = calc_prob(C[i], ug)
#   for j in range(N):
#     prob_cond_y_c[j][i] = calc_cond_prob(j, C[i], ug)
#     if(j == i):
#       prob_cond_c_x[j][i] = 1



#Main Algo


Done  0
Done  100
Done  200
Done  300
Done  400
Done  500
Done  600
Done  700
Done  800


In [51]:
del_F = np.zeros((N, N), dtype = float)
del_F[:,:] = np.inf
probXgivenC = ((np.array(probCgivenX)*np.array(probX)).T/probC).T
for i in range(N):
  for j in range(i+1, N): 
    temp1 = distance.jensenshannon(np.array(probYgivenC)[:,i], np.array(probYgivenC)[:,j]) 
    temp2 = distance.jensenshannon(probXgivenC[i], probXgivenC[j]) 
    dij = temp1 - (1/beta)*temp2
    del_F[i][j] = (probC[i] + probC[j])*dij
    # del_F[i][j] = cal_objective_diff(C[i], C[j])
  if i%100 == 0:
    print("Done ",i)

Done  0
Done  100
Done  200
Done  300
Done  400
Done  500
Done  600
Done  700
Done  800


In [52]:
import pickle
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_probYgivenC.sav'
pickle.dump(probYgivenC, open(file_name, 'wb'))
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_probCgivenX.sav'
pickle.dump(probCgivenX, open(file_name, 'wb'))
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_del_F.sav'
pickle.dump(del_F, open(file_name, 'wb'))

In [231]:
import pickle
probC = (1.0/N)*np.ones(N)
probX = probC.copy()
ClusterMapping = dict(zip(range(N), [[i] for i in range(N)]))
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_probYgivenC.sav'
probYgivenC = pickle.load(open(file_name, 'rb'))
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_probCgivenX.sav'
# pickle.dump(probCgivenY, open(file_name, 'wb'))
probCgivenX = pickle.load(open(file_name, 'rb'))
file_name = '/content/drive/My Drive/IB_Diarization/'+fileName+'_del_F.sav'
del_F = pickle.load(open(file_name, 'rb'))

In [53]:
num_of_speakers = 4
hmm_gmm_cluster_num = 5

In [54]:
# Nif num_of_clusters == 4:
bestClusterMapping = copy.deepcopy(ClusterMapping)

In [55]:
#IB ALgo
import time
startTime = time.time()
num_of_clusters = N
# print("Yaha")

INFyc = 0.0
probY = 1.0/N
for y in range(N):
  for c in range(num_of_clusters):
    INFyc += probC[c]*probYgivenC[y][c]*np.log(probYgivenC[y][c]/probY)

INFyx = INFyc
NMI = INFyc/INFyx
threshold = 0.5
while num_of_clusters > 1:
  # print("Here")
  # i, j = np.argwhere(del_F == np.min(del_F)).ravel()
  mIdx = np.argmin(del_F)
  i = mIdx//N
  j = mIdx%N
  # print(i,j)
  probCr = probC[i] + probC[j]
  del_F[:,j] = np.inf
  del_F[j,:] = np.inf
  # probC.pop(j)
  ClusterMapping[i] += ClusterMapping[j]
  ClusterMapping[j] = []
  probYgivenC[i] = (probYgivenC[i]*probC[i] + probYgivenC[j]*probC[j])/probCr
  probC[i] = probCr
  probCgivenX[i] = [0 for idx in probCgivenX[i]]
  for idx in ClusterMapping[i]:
    probCgivenX[i][idx] = 1
  probXgivenC = ((np.array(probCgivenX)*np.array(probX)).T/probC).T
  for idx in range(0, i):
    if del_F[idx,i] == np.inf:
      continue
    temp1 = distance.jensenshannon(np.array(probYgivenC)[:,idx], np.array(probYgivenC)[:,i]) 
    temp2 = distance.jensenshannon(probXgivenC[idx], probXgivenC[i]) 
    dij = temp1 - (1/beta)*temp2
    del_F[idx][i] = (probC[idx] + probC[i])*dij
  for idx in range(i+1, N):
    if del_F[i, idx] == np.inf:
      continue 
    temp1 = distance.jensenshannon(np.array(probYgivenC)[:,i], np.array(probYgivenC)[:,idx]) 
    temp2 = distance.jensenshannon(probXgivenC[i], probXgivenC[idx]) 
    dij = temp1 - (1/beta)*temp2
    del_F[i][idx] = (probC[i] + probC[idx])*dij
  num_of_clusters = num_of_clusters-1
  
  if num_of_clusters == num_of_speakers:
    print("Deep Copying dict:")
    bestClusterMapping = copy.deepcopy(ClusterMapping)

  INFyc = 0.0
  for y in range(N):
    for c in range(N):
      if ClusterMapping[c]:
        INFyc += probC[c]*probYgivenC[y][c]*np.log(probYgivenC[y][c]/probY)
  NMI = INFyc/INFyx
  if num_of_clusters%50 == 0 or num_of_clusters<=10:
    # print(i,j)
    # cnt = 0
    # for idx in range(N):
    #   if len(ClusterMapping[idx]) !=0:
    #     cnt += len(ClusterMapping[idx])
    #     if num_of_clusters<10:
    #       print(idx, len(ClusterMapping[idx]))
    # print("")
    # for idx in range(N):
    #   if len(bestClusterMapping[idx]) !=0:
    #     if num_of_clusters<6:
    #       print(idx, len(bestClusterMapping[idx]))
    print("All Count: ",cnt)
    print("Clusters Rem: ", num_of_clusters)
    print("NMI: ",NMI)
    print("Time Elapsed: ", (time.time()-startTime)/60, " minutes")

print("Completion Time: ", (time.time()-startTime)/60, " minutes")

All Count:  76
Clusters Rem:  850
NMI:  0.9760661359244392
Time Elapsed:  2.810843535264333  minutes
All Count:  76
Clusters Rem:  800
NMI:  0.9438554604666591
Time Elapsed:  7.338755683104197  minutes
All Count:  76
Clusters Rem:  750
NMI:  0.9162858771072044
Time Elapsed:  11.577003220717112  minutes
All Count:  76
Clusters Rem:  700
NMI:  0.8935194559072733
Time Elapsed:  15.579637630780537  minutes
All Count:  76
Clusters Rem:  650
NMI:  0.8594555734149203
Time Elapsed:  19.271832331021628  minutes
All Count:  76
Clusters Rem:  600
NMI:  0.8136385116507342
Time Elapsed:  22.708418889840445  minutes
All Count:  76
Clusters Rem:  550
NMI:  0.7561084391384516
Time Elapsed:  25.899140242735545  minutes
All Count:  76
Clusters Rem:  500
NMI:  0.7066188999334495
Time Elapsed:  28.849774038791658  minutes
All Count:  76
Clusters Rem:  450
NMI:  0.6330973827302447
Time Elapsed:  31.52183584769567  minutes
All Count:  76
Clusters Rem:  400
NMI:  0.589260819130265
Time Elapsed:  33.957224869

In [56]:
ClusterMapping = copy.deepcopy(bestClusterMapping)

In [58]:
#Save the results of IB Diarization
file_name = join(drivePath,fileName+'done.sav')
doneFile = [N, probX, probC, probYgivenC, probCgivenX, probXgivenC, ClusterMapping, GaussianMeans, GaussianCovMatrices]
pickle.dump(doneFile, open(file_name, 'wb'))

In [55]:
#Load the results of IB Diarization
file_name = join(drivePath,fileName+'done.sav')
doneFile = pickle.load(open(file_name, 'rb'))
N = doneFile[0]
probX= doneFile[1]
probC= doneFile[2]
probYgivenC= doneFile[3]
probCgivenX= doneFile[4]
probXgivenC= doneFile[5]
ClusterMapping = doneFile[6]
GaussianMeans = doneFile[7]
GaussianCovMatrices = doneFile[8]

In [59]:
###################################
#### HMM Alignment Begins Here ####
###################################

In [80]:
#Segregate MFCCs for each cluster for creating GMMs for HMM
segCount = 0
mfcc_segregated = []
for key in ClusterMapping:
  if(len(ClusterMapping[key])>0):
    cur_mfcc = np.empty((0,19), dtype = float)
    for curSeg in sorted(ClusterMapping[key]):
      sIdx = max(0, (curSeg)*init_cluster_len)
      eIdx = min((curSeg+1)*init_cluster_len, n)
      cur_mfcc = np.concatenate((cur_mfcc, np.array(mfcc_feat[sIdx:eIdx])))
    mfcc_segregated.append(cur_mfcc)

In [90]:
#Calculate parameters for initializing HMM
gmmMeans = np.zeros((num_of_speakers, hmm_gmm_cluster_num,mfcc_feat.shape[-1]), dtype = float)
gmmCov = np.zeros((num_of_speakers, hmm_gmm_cluster_num ,mfcc_feat.shape[-1], mfcc_feat.shape[-1]), dtype = float)
gmmWeights = np.zeros((num_of_speakers, hmm_gmm_cluster_num ), dtype = float)
for i, X in enumerate(mfcc_segregated):
  gmm = mixture.GaussianMixture(n_components=hmm_gmm_cluster_num).fit(X)
  gmmMeans[i] = gmm.means_
  gmmCov[i] = gmm.covariances_
  gmmWeights[i] = gmm.weights_

In [91]:
#Save GMM results
file_name = join(drivePath, fileName+'_gmmTrained.sav')
gmmTrained = [gmmMeans, gmmCov, gmmWeights]
pickle.dump(gmmTrained, open(file_name, 'wb'))

In [92]:
#Load GMM results
file_name = join(drivePath, fileName+'_gmmTrained.sav')
gmmTrained = pickle.load(open(file_name, 'rb'))
gmmMeans = gmmTrained[0]
gmmCov = gmmTrained[1]
gmmWeights = gmmTrained[2]

In [93]:
#Separate the final clusters into a list
cluster_list = []
for i in range(N):
  if len(ClusterMapping[i]) !=0:
    cluster_list.append(ClusterMapping[i]);

#Map Which cluster belongs to which speaker
cluster_to_speaker = np.ones((N,))
for sp_id, clstr in enumerate(cluster_list):
  for c in clstr:
    cluster_to_speaker[c] = int(sp_id)

#Start probability for HMM
start_prob=np.zeros((num_of_speakers,))
start_prob[int(cluster_to_speaker[0])] = 1.0

In [94]:
#Calculate Transmission Probability for HMM
transmission_prob = np.zeros((num_of_speakers,num_of_speakers), dtype = float)
for i in range(1,N):
  fromSpeaker = int(cluster_to_speaker[i-1])
  toSpeaker = int(cluster_to_speaker[i])
  transmission_prob[fromSpeaker][toSpeaker] += 1
for i in range(num_of_speakers):
  total = np.sum(cluster_to_speaker[:-1] == i)
  transmission_prob[i,:] /= total

In [95]:
# Initializie HMM
U_GMM_HMM =  hmm.GMMHMM(n_components = num_of_speakers,
                        n_mix = hmm_gmm_cluster_num,
                        covariance_type = "full",
                        init_params = "",
                        n_iter = 50
                         )
U_GMM_HMM.covars_ = gmmCov
U_GMM_HMM.means_ = gmmMeans
U_GMM_HMM.weights_ = gmmWeights
U_GMM_HMM.transmat_ = transmission_prob
U_GMM_HMM.startprob_ = start_prob

In [96]:
viterbiAligned = U_GMM_HMM.predict(mfcc_feat_original)

In [98]:
#Mask each silence frame to value num_of_speakers+1
for i in range(len(viterbiAligned)):
  if vuv_frames[i//3] == 0:
    viterbiAligned[i] = num_of_speakers+1

In [42]:
def most_common(lst):
    return max(set(lst), key=lst.count)

In [116]:
# finalSegments = []
# for i in range(N):
#   temp = viterbiAlignedSpeakers[i*init_cluster_len:min((i+1)*init_cluster_len, len(viterbiAlignedSpeakers))]
#   x = np.bincount(temp).argmax()
#   finalSegments.append(x)

In [99]:
#Generate final segments after viterbi alignment
finalSegments = []
init_cluster_timee = 2500 #2.5sec, is the min speaker segment size
init_cluster_lenn = math.ceil(init_cluster_timee/(overlap*1000))
NN = math.ceil(n/init_cluster_lenn)
for i in range(NN):
  temp = viterbiAligned[i*init_cluster_lenn:min((i+1)*init_cluster_lenn, len(viterbiAligned))]
  x = np.bincount(temp).argmax()
  finalSegments.append(x)

In [100]:
#Generate RTTM file for predicted data
def segmentToRTTM(finalSegments, init_cluster_time):
  fileObj = open(fileName+'_predicted_2500.rttm', 'a')
  idx = 0
  startTime = -1
  duration = 0
  # speakerId
  recName = fileName
  channelId = '1'
  onset = 0.000
  duration = 0.0
  while idx<=NN:
    if idx == NN or finalSegments[idx] != finalSegments[idx-1]:
      lineStr = 'SPEAKER ' + 'meeting' + ' ' + channelId + ' ' + str(onset) + ' ' + str(duration) + ' <NA> <NA> ' \
                    + 'speaker_'+str(int(finalSegments[idx-1])) + ' <NA> <NA>\n'
      if finalSegments[idx-1] != num_of_speakers+1 and duration>0.0:
        fileObj.writelines(lineStr)
      onset += duration
      duration = init_cluster_time/1000
    else:
      duration += init_cluster_time/1000
    idx += 1
  fileObj.close()
try:
    os.remove(fileName+'_predicted_2500.rttm')
except OSError:
    pass
segmentToRTTM(finalSegments, init_cluster_timee)

In [None]:
# f = open('actual.rttm', 'r')
# fr = open('new_actual.rttm','a')
# lastEnd = 0.0
# for l in f.readlines():
#   lst = l.split()
#   lineStr = 'SPEAKER ' + 'meeting' + ' ' + '1' + ' ' + str(lastEnd) + ' ' + lst[4] + ' <NA> <NA> ' \
#                     + lst[7] + ' <NA> <NA>\n'
#   # lst[3] = str(lastEnd)
#   lastEnd = lastEnd + float(lst[4])
#   fr.writelines(lineStr)
#   # break
# f.close()
# fr.close()

In [50]:
# f = open(fileName+'.scp', 'a')
# l = len(vuv_frames)
# i = 0
# while i<l:
#   if vuv_frames[i] == 1:
#     j = i
#     cnt = 1
#     while j+1<l and vuv_frames[j+1] == 1:
#       cnt += 1
#       j += 1
#     curLine = fileName+'_'+str(i*3)+'_'+str(j*3)+'='+fileName+'.fea['+str(i*3)+','+str(j*3)+']\n'
#     f.writelines(curLine)
#     i = j
#   i += 1
# f.close()

In [5]:
# f = open('IS1000a_Mix-Headset_25', 'r')
# fw = open('IS1000a.scp', 'a')
# lst = f.readlines()
# # scp_vec = np.zeros((227976), dtype=int)
# for i in range(13, len(lst)-2):
#   if lst[i+2] == '"sounding"\n':
#     l = int(float(lst[i])*100)
#     r = int(float(lst[i+1])*100)
#     curLine = 'IS1000a_'+str(l)+'_'+str(r)+'=IS1000a.fea['+str(l)+','+str(r)+']\n'
#     fw.writelines(curLine)     
# f.close()
# fw.close()