In [1]:
import numpy as np
import librosa

In [3]:
def readAudio(filename):
    x, sr = librosa.load(filename, sr=16000)
    return x, sr

#calculate spectrogram
def calc_spec(x):
    n_fft = 1024
    hop_length = 512
    win_length = 1024
    X = np.abs(librosa.stft(x, n_fft = n_fft, hop_length = hop_length, win_length = win_length, window='hann', dtype = np.clongdouble))
    X = librosa.power_to_db(X**2,ref=np.max)
    return X

def saveSpectrogram(X, outfilename):
    assert outfilename[-4:]=='.npy'  #'outfilename extension should be .npy'
    np.save(outfilename, X)
    return

def readSpectrogram(infilename):
    X = np.load(infilename)
    return X

nmfcc = 13

In [16]:
def normalize(arr):          # used to scale all values between 0 and 1
    arr_max = np.ndarray.max(arr, axis = 1)    # obtain max and min values for each feature   
    arr_min = np.ndarray.min(arr, axis = 1)
    arr_max = arr_max.reshape((nmfcc, 1))
    arr_min = arr_min.reshape((nmfcc, 1))
    diff = arr_max - arr_min         # obtain range of each feature
    arr = arr-arr_min
    arr = arr/diff
    return arr

In [19]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/MusicData/music2.wav' #Music file
audio, sr = readAudio(dirname)
log_mels = calc_spec(audio)          #calculate spectrograms
music_mfcc = librosa.feature.mfcc(S = log_mels, sr=16000, n_mfcc=13)    #extract mfcc features
music_mfcc = normalize(music_mfcc).T
imageNo = music_mfcc.shape[0]
t1 =  np.array([[1,0,0]]*imageNo)    #Corresponding ground truths

In [20]:
#Repeating above process for speech data
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/SpeechData/Speech2.wav'
audio, sr = readAudio(dirname)
log_mels = calc_spec(audio)
speech_mfcc = librosa.feature.mfcc(S = log_mels, sr=16000, n_mfcc=13)
speech_mfcc = normalize(speech_mfcc).T
imageNo = speech_mfcc.shape[0]
t2 =  np.array([[0,1,0]]*imageNo)    #Corresponding ground truths

In [21]:
#Repeating above process for silence data
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/NoiseData/noise2.wav'
audio, sr = readAudio(dirname)
log_mels = calc_spec(audio)
silence_mfcc = librosa.feature.mfcc(S = log_mels, sr=16000, n_mfcc=13)
silence_mfcc = normalize(silence_mfcc).T
imageNo = silence_mfcc.shape[0]
t3 =  np.array([[0,0,1]]*imageNo)    #Corresponding ground truths

In [22]:
X_train_temp = np.concatenate((music_mfcc, speech_mfcc, silence_mfcc))  #creating X transpose matrix
total_samples = music_mfcc.shape[0] + speech_mfcc.shape[0] + silence_mfcc.shape[0]
ones_arr = np.ones((total_samples, 1))
phi = np.concatenate((ones_arr, X_train_temp), axis = 1)    #attaching bias terms to X transpose to make final phi matrix
Y = np.concatenate((t1, t2, t3))     #concatenated ground truths
W = np.random.rand(phi.shape[1], 3)  #initializing random weights

In [41]:
# Training step
# We know that W = (inv(phi.T*phi + lamda*I))* phi.T * Y

temp_phi = np.dot(phi.T, phi)
lamda = 0.02        #regularization parameter
temp_phi += lamda*np.eye(phi.shape[1])

#inverting above matrix
a = temp_phi
m = 10^-6
inv_prod = np.linalg.inv(a+ np.eye(a.shape[1])*m)     #small noise added in diagonals to prevent singularity

temp = np.dot(inv_prod, phi.T)
W = np.dot(temp, Y)      #final weights matrix
print(W)

[[ 1.54335848  0.10009325 -1.81148428]
 [ 1.9359647  -1.18391543 -1.26776529]
 [ 2.51941079 -1.85292736 -1.26013106]
 [-1.45123275  0.78631498  0.96253142]
 [-1.57111171  0.7767184   1.24294347]
 [-0.12731203 -0.24028291  0.53429718]
 [-0.77238113  0.55910096  0.29485656]
 [ 0.09843508 -0.08736832  0.17950085]
 [-1.76490217  1.11618789  0.99406246]
 [ 2.40973086 -1.3416438  -1.18201635]
 [-0.45810146  0.72721771  0.0903639 ]
 [-0.67896434 -0.93150197  2.11603657]
 [-0.748221    0.58593803  0.65772563]
 [-2.71194598  1.88570312  1.62514838]]


In [37]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/val_set/val_set/wav/music_noisy1.wav'
x_test, sr = readAudio(dirname)
log_mels = calc_spec(x_test)    # in test, these spectrograms would be given
mfcc = librosa.feature.mfcc(S = log_mels, sr=16000, n_mfcc=13)   # extract mfcc features from spectrogram
X_test = normalize(mfcc).T
ones_arr = np.ones((X_test.shape[0], 1))
phi = np.concatenate((ones_arr, X_test), axis = 1)     #attaching bias terms to X transpose to make final phi matrix
Y = np.dot(phi, W)   # predictions of model 

classes = ['Music', 'Speech', 'Silence']
cluster = []
for i in range(Y.shape[0]):
    pred = Y[i]         #individual prediction array
    cluster.append(classes[np.argmax(pred)])     # append the class with max probability
        
print(cluster)    

['Speech', 'Speech', 'Silence', 'Speech', 'Speech', 'Silence', 'Speech', 'Silence', 'Speech', 'Silence', 'Speech', 'Silence', 'Silence', 'Speech', 'Speech', 'Speech', 'Silence', 'Silence', 'Silence', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Silence', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Speech', 'Music', 'Silence', 'Speech', 'Speech', 'Speech', 'Silence', 'Speech', 'Speech', 'Silence', 'Silence', 'Music', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Music', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Silence', 'Speech', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Speech', 'Silence', 'Silence', 'Silence', 'Speech', 'Speech', 'Speech', 'Silence', 'Speech', 'Silence'

In [38]:
def audioEventDetect(cluster, window_size = 0.064):     #initial event detection method
    super_list = []      #stores all events
    cur_list = []        #stores start time, end time and event class
    cur_list.append(0)
    cur_list.append(window_size)     # end time of first event
    cur_class = cluster[0]
    cur_list.append(cur_class)
    super_list.append(cur_list)
    prev_class = cur_class      
    cluster = cluster[1:]        #start iterating from 2nd element of cluster
    
    for cur_class in cluster:
        if cur_class == prev_class:     #eg. if music is followed by music 
            super_list[-1][1] += window_size - 0.032    #end time of current event is increased by window_size - 0.032s
        else:
            cur_list = super_list[-1]    
            new_start = cur_list[1]      #start time of new event is end time of previous event
            cur_list = [new_start]
            cur_list.append((new_start + window_size - 0.032))     # end time of new event is new start event + remaining part of overlapping window
            cur_list.append(cur_class)
            super_list.append(cur_list)
        prev_class = cur_class
                            
    return super_list

In [39]:
events = audioEventDetect(cluster)
print(events)                #initial audio event detection 

[[0, 0.096, 'Speech'], [0.096, 0.128, 'Silence'], [0.128, 0.192, 'Speech'], [0.192, 0.224, 'Silence'], [0.224, 0.256, 'Speech'], [0.256, 0.28800000000000003, 'Silence'], [0.28800000000000003, 0.32000000000000006, 'Speech'], [0.32000000000000006, 0.3520000000000001, 'Silence'], [0.3520000000000001, 0.3840000000000001, 'Speech'], [0.3840000000000001, 0.4480000000000002, 'Silence'], [0.4480000000000002, 0.5440000000000003, 'Speech'], [0.5440000000000003, 0.6400000000000003, 'Silence'], [0.6400000000000003, 0.8000000000000005, 'Speech'], [0.8000000000000005, 0.8960000000000006, 'Silence'], [0.8960000000000006, 0.9280000000000006, 'Speech'], [0.9280000000000006, 0.9920000000000007, 'Silence'], [0.9920000000000007, 1.0240000000000007, 'Speech'], [1.0240000000000007, 1.0560000000000007, 'Music'], [1.0560000000000007, 1.0880000000000007, 'Silence'], [1.0880000000000007, 1.1840000000000008, 'Speech'], [1.1840000000000008, 1.2160000000000009, 'Silence'], [1.2160000000000009, 1.280000000000001, '

In [40]:
noOfFrames = 29        # this functions does a majority voting on 28 (sub)frames and allots corresponding label to this new frame (0.96 s in length) of 28 (sub)frames 
new_cluster = []
count = 0 
music_score = 0
speech_score = 0
silence_score = 0
for title in cluster:     # count number of music, speech, or silence classes in 28 frames  
    if title == 'Music':
        music_score+= 1    
    elif title == 'Speech':
        speech_score+= 1 
    else:
        silence_score += 1
    if count == noOfFrames-1:     # iterate till 28 frames have been counted 
        count = -1
        if music_score > speech_score:
            if music_score > silence_score:
                new_cluster.append('Music')    # max class is Music
            else:
                new_cluster.append('Silence')   #max class is Silence.. and so on
        else:
            if speech_score > silence_score:
                new_cluster.append('Speech')
            else:
                new_cluster.append('Silence')
        music_score = 0
        speech_score = 0
        silence_score = 0
    count+= 1
    
events = audioEventDetect(new_cluster, 0.96)       #pass this new cluster of 0.96s frames through audio event detection method
print(events)

[[0, 0.96, 'Speech'], [0.96, 9.312000000000001, 'Silence']]


In [32]:
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Model_Weights/linear_weights.npy'
saveSpectrogram(W, dirname)        # saving model weights