In [1]:
import keras
import librosa
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from tensorflow.keras.optimizers import SGD
import numpy as np
import glob as glob
from numpy import asarray
from numpy import save

In [3]:
def readAudio(filename):
    x, sr = librosa.load(filename, sr=16000)
    return x, sr

#calculate spectrogram
def calc_spec(x):
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann', dtype = np.clongdouble))
    X = librosa.power_to_db(X**2,ref=np.max)
    return X

def saveSpectrogram(X, outfilename):
    assert outfilename[-4:]=='.npy'  #'outfilename extension should be .npy'
    np.save(outfilename, X)
    return

def readSpectrogram(infilename):
    X = np.load(infilename)
    return X

nmfcc = 13

In [5]:
def normalize(arr):          # used to scale all values between 0 and 1
    arr_max = np.ndarray.max(arr, axis = 1)    # obtain max and min values for each feature   
    arr_min = np.ndarray.min(arr, axis = 1)
    arr_max = arr_max.reshape((nmfcc, 1))
    arr_min = arr_min.reshape((nmfcc, 1))
    diff = arr_max - arr_min         # obtain range of each feature
    arr = arr-arr_min
    arr = arr/diff
    return arr

In [6]:
def listofsamples(music_mfcc):         # create a list of all the samples
    samples = []
    cols = music_mfcc.shape[1]        # total no of samples
    for i in range(cols):
        samples.append(music_mfcc[:, i])

    arr = np.array(samples)     # convert list to 2d array
    arr = arr.T
    arr = normalize(arr)         # spread values between 0 and 1
    cols = arr.shape[1]
    n_samples = []
    for i in range(cols):
        n_samples.append(arr[:, i])    # list of normalized samples
    return n_samples

In [9]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/SpeechData/Speech2.wav'
speech, sr = readAudio(dirname)
Speech = librosa.feature.mfcc(y=speech, sr= 16000, n_mfcc= nmfcc)
speech = listofsamples(Speech)       #Speech Dataset
print(len(speech))
print(speech[0].shape)
print(speech[0])

54392
(13,)
[0.67890245 0.5774692  0.40413493 0.45244664 0.47015077 0.4663162
 0.7861795  0.49941975 0.7575369  0.3286484  0.22860876 0.46127218
 0.609776  ]


In [8]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/MusicData/music2.wav'
music, sr = readAudio(dirname)
Music = librosa.feature.mfcc(y=music, sr= 16000, n_mfcc= nmfcc)
music = listofsamples(Music)    #Music Dataset
print(len(music))
print(music[0].shape)

56211
(13,)


In [10]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/NoiseData/noise2.wav'
noise, sr = readAudio(dirname)
Noise = librosa.feature.mfcc(y=noise, sr= 16000, n_mfcc= nmfcc)
silence = listofsamples(Noise)      #Silence Dataset
print(len(silence))
print(silence[0].shape)

62501
(13,)


In [11]:
# print(len(music))      just functions to confirm shapes
# print(len(speech))
# print(len(silence))
# print(silence[56])

56211
54392
62501
[0.01581264 0.4047906  0.34330437 0.40189147 0.35940826 0.43214425
 0.41150963 0.4052104  0.4465698  0.51538366 0.4778128  0.37968874
 0.4904235 ]


Gaussian Mixture Models implementation:-

In [12]:
class GMM:
    def __init__(self, max_iter = 50):
        self.max_iter = max_iter
        
        # pi list contains the fraction of the dataset for every cluster
        self.pi = [1/3 for comp in range(3)]
        
    def multivariate_normal(self, X, mean_vector, covariance_matrix):     # implements pdf of multivariate normal distribution
        (sign, logdet) = np.linalg.slogdet(covariance_matrix)      # used to find det of matrix without overflowing
        if logdet < -100:
            det = np.exp(-5)
        else:    
            det = np.exp(logdet)
        
        a = covariance_matrix
        m = 10^-6
        inv_prod = np.linalg.inv(a+ np.eye(a.shape[1])*m)     #small noise added in diagonals to prevent singularity
        
        return (2*np.pi)**(-len(X)/2)*det**(-1/2)*np.exp(-np.dot(np.dot((X-mean_vector).T, inv_prod), (X-mean_vector))/2)
        
    def fit(self, X):
        # Spliting the data in 3 sub-sets
        temp_X = np.array_split(X, 3)
        new_X = []
        for x in temp_X:
            new_X.append(x.T)
            
        # Initial computation of the mean-vector and covarience matrix
        self.mean_vector = [np.mean(x, axis=1) for x in new_X]
        self.covariance_matrixes = [np.cov(x) for x in new_X]
        
        # Deleting the new_X matrix because we will not need it anymore
        del new_X
        count = 1
        for iteration in range(self.max_iter):
            ''' ----------------   E - STEP   ------------------ '''
            # Initiating the r matrix, every row contains the probabilities
            # for every cluster for this row
            self.r = np.ones((len(X), 3))*(1/3)
            # Calculating the r matrix
            for n in range(len(X)):
                for k in range(3):
                    self.r[n][k] = self.pi[k] * self.multivariate_normal(X[n], self.mean_vector[k], self.covariance_matrixes[k])
                    temp_sum = sum([self.pi[j]*self.multivariate_normal(X[n], self.mean_vector[j], self.covariance_matrixes[j]) for j in range(3)])
                    self.r[n][k] /= temp_sum
                    
            # Calculating the N
            N = np.sum(self.r, axis=0)
            
            
            ''' ---------------   M - STEP   --------------- '''
            # Initializing the mean vector as a zero vector
            self.mean_vector = [np.zeros((len(X[0]),)) for k in range(3)]
            # Updating the mean vector
            for k in range(3):
                for n in range(len(X)):
                    if self.r[n][k] < np.exp(-5):
                        self.mean_vector[k] = np.exp(-5)
                    else:    
                        self.mean_vector[k] += self.r[n][k] * X[n]
                   
                    temp = 1/N[k]
                    self.mean_vector[k] = self.mean_vector[k]*temp
                    
            # Initiating the list of the covariance matrixes
            self.covariance_matrixes = [np.zeros((len(X[0]), len(X[0]))) for k in range(3)]
            # Updating the covariance matrices
            for k in range(3):
                for n in range(len(X)):
                    diff_vec = X[n]-self.mean_vector[k]
                    self.covariance_matrixes[k] += np.dot(diff_vec, diff_vec.T)*self.r[n][k]
                    temp = 1/N[k]
                    self.covariance_matrixes[k]* temp
                
            # Updating the pi list
            self.pi = [N[k]/len(X) for k in range(3)]
            print(count)         #helpful in finding time of training
            count = count+1
                          
    
                
    

In [13]:
music_model = GMM(50)         #making a GMM model for music
music_model.fit(music)
print(music_model.pi)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
[0.15395677931312723, 0.07940195395538824, 0.7666412667314995]


In [15]:
speech_model = GMM(50)          #making a GMM model for speech
speech_model.fit(speech)
print(speech_model.pi)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
[0.05212334950320024, 0.3449651118149789, 0.6029115386817304]


In [16]:
silence_model = GMM(50)         #making a GMM moel for silence
silence_model.fit(silence)
print(silence_model.pi)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
[0.26877920834068697, 0.5456196358301244, 0.185601155829187]


In [31]:
def Predict(test):      #method used to predict the class to which our test sample belongs
    cluster = []
    for i in range(len(test)):
        music_score = 0        #initializing scores of all classes
        speech_score = 0
        silence_score = 0
        
        #Using bayes theorem, p(c/x) = (p(x/c)p(c))/p(x)  where c is class and x is given sample
        #We assume p(x) to be constant for all classes and
        #p(c) to be equal for all (equally likely events)
        #Thus claculating p(x/c) for a class which is summation over pi[k]*normal(x/mu_k, sigma_k)
        for k in range(3):
            music_score += music_model.pi[k] * music_model.multivariate_normal(test[i], music_model.mean_vector[k], music_model.covariance_matrixes[k])
            speech_score += speech_model.pi[k] * speech_model.multivariate_normal(test[i], speech_model.mean_vector[k], speech_model.covariance_matrixes[k])
            silence_score += silence_model.pi[k] * silence_model.multivariate_normal(test[i], silence_model.mean_vector[k], silence_model.covariance_matrixes[k])
        if music_score > speech_score:
            if music_score > silence_score:
                cluster.append('Music')      # if music is max score
            else:
                cluster.append('Silence')    # if silence is max score ... and so on
        else:
            if speech_score > silence_score:
                cluster.append('Speech')
            else:
                cluster.append('Silence')
        
    return cluster

In [32]:
def audioEventDetect(cluster, window_size = 0.064):     #initial event detection method
    super_list = []      #stores all events
    cur_list = []        #stores start time, end time and event class
    cur_list.append(0)
    cur_list.append(window_size)     # end time of first event
    cur_class = cluster[0]
    cur_list.append(cur_class)
    super_list.append(cur_list)
    prev_class = cur_class      
    cluster = cluster[1:]        #start iterating from 2nd element of cluster
    
    for cur_class in cluster:
        if cur_class == prev_class:     #eg. if music is followed by music 
            super_list[-1][1] += window_size - 0.032    #end time of current event is increased by window_size - 0.032s
        else:
            cur_list = super_list[-1]    
            new_start = cur_list[1]      #start time of new event is end time of previous event
            cur_list = [new_start]
            cur_list.append((new_start + window_size - 0.032))     # end time of new event is new start event + remaining part of overlapping window
            cur_list.append(cur_class)
            super_list.append(cur_list)
        prev_class = cur_class
                            
    return super_list

In [35]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/NoiseData/noise2.wav'    # test_sample
x_test, sr = readAudio(dirname)
log_mels = calc_spec(x_test)    # in test, these spectrograms would be given
mfcc = librosa.feature.mfcc(S = log_mels, sr=16000, n_mfcc=13)   # extract mfcc features from spectrogram
test = listofsamples(mfcc)
cluster = Predict(test)       # obtain list of framewise predicted classes
print(cluster)

['Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Speech', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speec

"new_events = []\nfor event in events:\n    if (event[1] - event[0]) >= 0.1:\n        new_events.append(event)\n#print(new_events)\nprint('Now')\nevents = smoothEventDetect(new_events)\n#print(events)"

In [None]:
events = audioEventDetect(cluster)
print(events)                #initial audio event detection 

In [36]:
noOfFrames = 29        # this functions does a majority voting on 28 (sub)frames and allots corresponding label to this new frame (0.96 s in length) of 28 (sub)frames 
new_cluster = []
count = 0 
music_score = 0
speech_score = 0
silence_score = 0
for title in cluster:     # count number of music, speech, or silence classes in 28 frames  
    if title == 'Music':
        music_score+= 1    
    elif title == 'Speech':
        speech_score+= 1 
    else:
        silence_score += 1
    if count == noOfFrames-1:     # iterate till 28 frames have been counted 
        count = -1
        if music_score > speech_score:
            if music_score > silence_score:
                new_cluster.append('Music')    # max class is Music
            else:
                new_cluster.append('Silence')   #max class is Silence.. and so on
        else:
            if speech_score > silence_score:
                new_cluster.append('Speech')
            else:
                new_cluster.append('Silence') 
        music_score = 0
        speech_score = 0
        silence_score = 0
    count+= 1
    
events = audioEventDetect(new_cluster, 0.96)       #pass this new cluster of 0.96s frames through audio event detection method
print(events)

[[0, 9.312000000000001, 'Speech']]


In [37]:
# Saving model weights in the form of .npy files
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/music_pi.npy'
print(music_model.pi)
saveSpectrogram(music_model.pi, dirname)
saveSpectrogram(music_model.mean_vector, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/music_mean.npy')
saveSpectrogram(music_model.covariance_matrixes, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/music_covariance.npy')
saveSpectrogram(speech_model.pi, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/speech_pi.npy')
saveSpectrogram(speech_model.mean_vector, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/speech_mean.npy')
saveSpectrogram(speech_model.covariance_matrixes, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/speech_cov.npy')
saveSpectrogram(silence_model.pi, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/silence_pi.npy')
saveSpectrogram(silence_model.mean_vector, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/silence_mean.npy')
saveSpectrogram(silence_model.covariance_matrixes, 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/silence_cov.npy')

[0.15395677931312723, 0.07940195395538824, 0.7666412667314995]


In [46]:
pi = np.load('D:/Videh_Acads/IITK/5th Sem/EE603/Project/GMM_Weights/music_pi.npy')   # load weights
#print(pi)

[0.14926945 0.08193013 0.76880042]
