In [1]:
import keras
import librosa
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM
from tensorflow.keras.optimizers import SGD
import numpy as np

In [2]:
def readAudio(filename, sr=16000):
    """Load an audio file with librosa at the requested sampling rate.

    Parameters
    ----------
    filename : path of the audio file to read.
    sr : sampling rate handed to ``librosa.load``. Defaults to 16000, the
        value that used to be hard-coded here, so existing calls are unchanged.

    Returns
    -------
    (x, sr) : the sample array and the sampling rate returned by ``librosa.load``.
    """
    x, sr = librosa.load(filename, sr=sr)
    return x, sr

# Compute a dB-scaled STFT magnitude spectrogram
def calc_spec(x):
    """Return the dB-scaled magnitude of the STFT of signal ``x``.

    The STFT uses a 1024-point FFT, a 1024-sample Hann window and a hop of
    512 samples; the result is scaled relative to its own maximum.
    """
    stft_options = dict(
        n_fft=1024,
        hop_length=512,
        win_length=1024,
        window='hann',
        dtype=np.clongdouble,
    )
    magnitude = np.abs(librosa.stft(x, **stft_options))
    # NOTE(review): power_to_db is fed a magnitude (not squared) spectrogram here;
    # confirm this matches how any externally supplied spectrograms were produced.
    return librosa.power_to_db(magnitude, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Store the array ``X`` at ``outfilename`` using ``np.save``.

    The last four characters of ``outfilename`` must be '.npy'; otherwise the
    assertion fails and nothing is written.
    """
    extension = outfilename[-4:]
    assert extension == '.npy'  # outfilename extension should be .npy
    np.save(outfilename, X)

def readSpectrogram(infilename):
    """Load and return the array stored at ``infilename`` via ``np.load``."""
    return np.load(infilename)

# Number of MFCC coefficients extracted per frame (the n_mfcc argument of librosa.feature.mfcc)
nmfcc = 13

In [3]:
def normalize(arr):          # used to scale all values between 0 and 1
    """Min-max scale every row of a 2-D array into the range [0, 1].

    Each row is treated as one feature: its minimum maps to 0 and its maximum
    to 1. The input array is not modified.

    Parameters
    ----------
    arr : 2-D array-like of shape (n_features, n_samples).

    Returns
    -------
    numpy.ndarray of the same shape as ``arr``. A row whose values are all
    equal (zero range) is returned as all zeros instead of NaN.
    """
    arr = np.asarray(arr)
    # keepdims keeps the (n_features, 1) shape needed for broadcasting, so the
    # function works for any number of rows rather than only `nmfcc` of them
    row_min = arr.min(axis=1, keepdims=True)
    row_max = arr.max(axis=1, keepdims=True)
    value_range = row_max - row_min         # range of each feature
    # A constant row has zero range; dividing by 1 maps it to 0 instead of 0/0 = NaN
    value_range[value_range == 0] = 1
    return (arr - row_min) / value_range

In [4]:
# Music class: load the recording, extract MFCCs and min-max scale each coefficient.
# NOTE(review): absolute local path; consider a configurable data directory.
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/MusicData/music2.wav'
music, sr = readAudio(dirname)
# Use the rate returned by readAudio instead of repeating the literal, so
# feature extraction always matches the rate the audio was loaded at.
Music = librosa.feature.mfcc(y=music, sr=sr, n_mfcc=nmfcc)     # extract MFCC features
Music = normalize(Music)         # Music dataset, one row per MFCC coefficient
print(Music.shape)
print(Music)

(13, 56211)
[[0.         0.00084225 0.00351698 ... 0.         0.         0.        ]
 [0.1771701  0.1795556  0.18709294 ... 0.1771701  0.1771701  0.1771701 ]
 [0.5548715  0.55810696 0.5681736  ... 0.5548715  0.5548715  0.5548715 ]
 ...
 [0.6058027  0.6127098  0.62480617 ... 0.6058027  0.6058027  0.6058027 ]
 [0.5778084  0.58467644 0.5949725  ... 0.5778084  0.5778084  0.5778084 ]
 [0.5478337  0.55462134 0.5630414  ... 0.5478337  0.5478337  0.5478337 ]]


In [5]:
# Speech class: load the recording, extract MFCCs and min-max scale each coefficient.
# NOTE(review): absolute local path; consider a configurable data directory.
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/SpeechData/Speech2.wav'
speech, sr = readAudio(dirname)
# Use the rate returned by readAudio instead of repeating the literal, so
# feature extraction always matches the rate the audio was loaded at.
Speech = librosa.feature.mfcc(y=speech, sr=sr, n_mfcc=nmfcc)     # extract MFCC features
Speech = normalize(Speech)         # Speech dataset, one row per MFCC coefficient
print(Speech.shape)

(13, 54392)


In [6]:
# Noise class: load the recording, extract MFCCs and min-max scale each coefficient.
# NOTE(review): absolute local path; consider a configurable data directory.
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Datasets/NoiseData/noise2.wav'
noise, sr = readAudio(dirname)
# Use the rate returned by readAudio instead of repeating the literal, so
# feature extraction always matches the rate the audio was loaded at.
Noise = librosa.feature.mfcc(y=noise, sr=sr, n_mfcc=nmfcc)     # extract MFCC features
Noise = normalize(Noise)         # Noise dataset, one row per MFCC coefficient
print(Noise.shape)

(13, 62501)


In [7]:
# Assemble the training matrix (one row per frame) and its one-hot targets
X1, X2, X3 = Music, Speech, Noise
X_train = np.concatenate([features.T for features in (X1, X2, X3)], axis = 0)     # stack music, speech and noise frames
# One label row per frame of each class
y_music = np.array([[1, 0, 0] for _ in range(X1.shape[1])])     # music
y_speech = np.array([[0, 1, 0] for _ in range(X2.shape[1])])    # speech
y_noise = np.array([[0, 0, 1] for _ in range(X3.shape[1])])     # noise (third class)
t_train = np.concatenate([y_music, y_speech, y_noise])
print(X_train.shape)
print(t_train.shape)

(173104, 13)
(173104, 3)


In [8]:
# Fully connected classifier: three hidden ReLU layers of 64 units each,
# followed by a 3-way softmax output for multiclass classification
model = Sequential([
    Dense(64, activation = 'relu', input_dim = X_train.shape[1]),
    Dense(64, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(3, activation = 'softmax'),
])

# Stochastic gradient descent with Nesterov momentum.
# NOTE(review): the layers come from `keras` while SGD comes from `tensorflow.keras`,
# and `decay` is a legacy argument; confirm both work with the installed versions.
sgd = SGD(learning_rate = 0.01, decay = 1e-6, momentum = 0.9, nesterov = True)

In [9]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax output layer
model.compile(
    optimizer = sgd,
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)

In [10]:
# Full-batch training: the batch size is taken from the data itself rather than a
# hard-coded row count, so it stays correct if the training set changes.
model.fit(X_train, t_train, epochs = 1500, batch_size = X_train.shape[0])

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

<keras.callbacks.History at 0x211881a1df0>

In [11]:
# Classify every frame of a validation clip.
# NOTE(review): absolute local path; consider a configurable data directory.
dirname = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/val_set/val_set/wav/music+speech_noisy2.wav'
x_test, sr = readAudio(dirname)
log_mels = calc_spec(x_test)    # in test, these spectrograms would be given
# NOTE(review): calc_spec returns a dB-scaled STFT with no mel filtering, while the
# training MFCCs were computed directly from waveforms; confirm both feature paths agree.
mfcc = librosa.feature.mfcc(S = log_mels, sr = sr, n_mfcc = nmfcc)   # extract mfcc features from spectrogram
# NOTE(review): scaling uses this clip's own per-coefficient min/max, not the training statistics.
X_test = normalize(mfcc).T      # one row per frame
classes = ['Music', 'Speech', 'Silence']
# One batched prediction for all frames instead of one model.predict call per frame
probabilities = model.predict(X_test, verbose = 0)
cluster = [classes[k] for k in np.argmax(probabilities, axis = 1)]   # class with max probability per frame

print(cluster)

['Music', 'Silence', 'Silence', 'Music', 'Silence', 'Music', 'Silence', 'Silence', 'Speech', 'Speech', 'Speech', 'Speech', 'Silence', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Music', 'Music', 'Silence', 'Music', 'Music', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Speech', 'Music', 'Music', 'Music', 'Silence', 'Silence', 'Silence', 'Silence', 'Music', 'Music', 'Silence', 'Music', 'Music', 'Silence', 'Silence', 'Music', 'Silence', 'Music', 'Music', 'Music', 'Silence', 'Music', 'Silence', 'Silence', 'Music', 'Music', 'Silence', 'Silence', 'Music', 'Music', 'Music', 'Silence', 'Music', 'Silence', 'Silence', 'Silence', 'Silence', 'Music', 'Silence', 'Music', 'Silence', 'Silence', 'Silence', 'Music', 'Speech', 'Speech', 'Music', 'Speech', 'Silence', 'Silence', 'Music', 'Music', 'M

In [12]:
def audioEventDetect(cluster, window_size = 0.064, overlap = 0.032):     #initial event detection method
    """Merge runs of identical frame labels into [start, end, label] events.

    The first frame spans ``[0, window_size]``. Every following frame advances
    time by ``window_size - overlap``: it either extends the current event
    (same label) or opens a new event that starts where the previous one ended.

    Parameters
    ----------
    cluster : sequence of per-frame class labels, in time order.
    window_size : duration of one frame in seconds.
    overlap : seconds shared by consecutive frames. Defaults to 0.032, the
        value that used to be hard-coded.

    Returns
    -------
    list of ``[start_time, end_time, label]`` lists; empty when ``cluster`` is empty.
    """
    if len(cluster) == 0:
        # Nothing to merge; previously this raised IndexError on cluster[0]
        return []

    hop = window_size - overlap      # time added by each additional frame
    events = [[0, window_size, cluster[0]]]

    for label in cluster[1:]:
        current = events[-1]
        if label == current[2]:
            # same class as the running event: push its end time forward
            current[1] += hop
        else:
            # class changed: the new event starts at the end of the previous one
            start = current[1]
            events.append([start, start + window_size - overlap, label])

    return events

In [13]:
# Merge the per-frame labels into [start, end, label] events using the default frame length
events = audioEventDetect(cluster)
print(events)                #initial audio event detection 

[[0, 0.064, 'Music'], [0.064, 0.128, 'Silence'], [0.128, 0.16, 'Music'], [0.16, 0.192, 'Silence'], [0.192, 0.224, 'Music'], [0.224, 0.28800000000000003, 'Silence'], [0.28800000000000003, 0.41600000000000015, 'Speech'], [0.41600000000000015, 0.4480000000000002, 'Silence'], [0.4480000000000002, 0.8960000000000006, 'Speech'], [0.8960000000000006, 1.0560000000000007, 'Music'], [1.0560000000000007, 1.0880000000000007, 'Silence'], [1.0880000000000007, 1.1520000000000008, 'Music'], [1.1520000000000008, 1.6000000000000012, 'Speech'], [1.6000000000000012, 1.6960000000000013, 'Music'], [1.6960000000000013, 1.8240000000000014, 'Silence'], [1.8240000000000014, 1.8880000000000015, 'Music'], [1.8880000000000015, 1.9200000000000015, 'Silence'], [1.9200000000000015, 1.9840000000000015, 'Music'], [1.9840000000000015, 2.0480000000000014, 'Silence'], [2.0480000000000014, 2.0800000000000014, 'Music'], [2.0800000000000014, 2.1120000000000014, 'Silence'], [2.1120000000000014, 2.2080000000000015, 'Music'], [

In [14]:
noOfFrames = 29        # number of consecutive frame labels merged into one 0.96 s majority-vote frame


def majorityVote(labels):
    """Return the winning class among 'Music', 'Speech' and 'Silence' for one window.

    Any label other than 'Music' or 'Speech' is counted as silence. Ties are
    resolved as before: Speech wins a Music/Speech tie, and Silence wins any
    tie it is part of.
    """
    music_score = sum(1 for title in labels if title == 'Music')
    speech_score = sum(1 for title in labels if title == 'Speech')
    silence_score = len(labels) - music_score - speech_score
    if music_score > speech_score:
        return 'Music' if music_score > silence_score else 'Silence'
    return 'Speech' if speech_score > silence_score else 'Silence'


# Vote over consecutive, non-overlapping windows of noOfFrames labels.
# A trailing partial window (fewer than noOfFrames labels) is dropped, as before.
new_cluster = [
    majorityVote(cluster[start:start + noOfFrames])
    for start in range(0, len(cluster) - noOfFrames + 1, noOfFrames)
]

# Pass the 0.96 s frames through event detection; with fewer than noOfFrames
# labels there is nothing to merge, so skip the call instead of failing on an empty list.
events = audioEventDetect(new_cluster, 0.96) if new_cluster else []
print(events)

[[0, 3.7439999999999998, 'Speech'], [3.7439999999999998, 9.312000000000001, 'Music']]


In [15]:
# Persist the trained model to disk.
# NOTE(review): this import sits in the last cell and both paths are absolute,
# machine-specific locations; consider moving the import to the first cell and
# using a configurable output directory.
from keras.models import load_model
path = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/Model_Weights/my_model_NN.h5'
path1 = 'C:/Users/Videh Aggarwal/Downloads/my_model_RNN.h5'     # not used in this cell
model.save(path)                 # save the model to `path`
#model = load_model(path)        # uncomment to reload the saved model instead of retraining