In [29]:
import numpy as np
import keras
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, LSTM, Reshape
from tensorflow.keras.optimizers import SGD
from sklearn import preprocessing
from keras.layers.core import Reshape
np.seterr(divide='ignore', invalid='ignore')


{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [30]:
def readAudio(filename):
    """Load an audio file through librosa at a 16 kHz target sample rate.

    Parameters
    ----------
    filename : path of the audio file handed to ``librosa.load``.

    Returns
    -------
    (samples, sample_rate) exactly as returned by ``librosa.load``.
    """
    samples, sample_rate = librosa.load(filename, sr=16000)
    return samples, sample_rate

#calculate spectrogram
def calc_spec(x):
    """Return the dB-scaled power spectrogram of the signal ``x``.

    The STFT uses a 1024-point FFT, a 1024-sample Hann window and a hop of
    512 samples. The squared magnitude is converted to decibels relative to
    its own maximum (``ref=np.max``).

    Parameters
    ----------
    x : 1-D audio signal accepted by ``librosa.stft``.

    Returns
    -------
    2-D array as returned by ``librosa.power_to_db``.
    """
    stft_options = dict(
        n_fft=1024,
        hop_length=512,
        win_length=1024,
        window='hann',
        dtype=np.clongdouble,
    )
    magnitude = np.abs(librosa.stft(x, **stft_options))
    return librosa.power_to_db(magnitude**2, ref=np.max)

def saveSpectrogram(X, outfilename):
    """Write the array ``X`` to ``outfilename`` with ``np.save``.

    The file name must end in ``.npy``; otherwise an ``AssertionError`` is
    raised before anything is written. Returns ``None``.
    """
    extension = outfilename[-4:]
    assert extension == '.npy'  # output file name must carry the .npy extension
    np.save(outfilename, X)

def readSpectrogram(infilename):
    """Load and return the array stored in the ``.npy`` file ``infilename``."""
    return np.load(infilename)

# NOTE(review): presumably the number of MFCC coefficients; nothing in the
# visible notebook reads this value — confirm it is still needed.
nmfcc = 13

In [31]:
def normalize(arr, features = 513):
    """Min-max scale every row of a 2-D array into the range [0, 1].

    Each row (one feature across all frames) is shifted by its own minimum
    and divided by its own range.

    Parameters
    ----------
    arr : 2-D array-like of shape (features, frames).
    features : kept only for backward compatibility. The row count is now
        taken from ``arr`` itself, so any number of rows is accepted.

    Returns
    -------
    Array of the same shape as ``arr`` with each row scaled to [0, 1].
    A row whose values are all equal (zero range) is returned as all zeros
    rather than NaN.
    """
    arr = np.asarray(arr)
    # keepdims keeps the per-row statistics as column vectors so they
    # broadcast against arr without a hard-coded reshape.
    row_min = arr.min(axis=1, keepdims=True)
    row_max = arr.max(axis=1, keepdims=True)
    diff = row_max - row_min
    # A constant row has diff == 0; dividing by it would give NaN (silently,
    # because the notebook disables numpy's divide/invalid warnings).
    # Substituting 1 leaves the already-zero numerator as 0.
    safe_diff = np.where(diff == 0, 1, diff)
    return (arr - row_min) / safe_diff

In [32]:
def createDataset(dirname):
    """Turn one audio file into an array of per-frame 27x19 inputs.

    The file is loaded with ``readAudio``, converted to a spectrogram with
    ``calc_spec`` and scaled with ``normalize``. Each spectrogram column
    (one frame, 513 = 27 * 19 values) is then reshaped to a 27x19 matrix.

    Parameters
    ----------
    dirname : path passed to ``readAudio`` (despite the name, it is used as
        a file path).

    Returns
    -------
    Array of shape (frames, 27, 19).
    """
    audio, _ = readAudio(dirname)
    spectrogram = normalize(calc_spec(audio))
    n_frames = spectrogram.shape[1]          # one sample per spectrogram column
    X = np.zeros((n_frames, 27, 19))
    # Transposing puts frames first; the row-major reshape then lays each
    # frame's 513 values out as 27 rows of 19, same as reshaping column by column.
    X[...] = spectrogram.T.reshape((n_frames, 27, 19))
    return X

In [33]:
# Music class: build the (frames, 27, 19) inputs from the music recording and
# give every frame the one-hot label [1, 0, 0].
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists.
dirname = 'C:/Users/Dhruv Goyal/Desktop/Ye Khol/music2.wav'
X_music = createDataset(dirname)     #Music Dataset
imageNo = X_music.shape[0]           # number of frames; reused (overwritten) by the following cells
t1 =  np.array([[1,0,0]]*imageNo)    #Corresponding ground truths

In [34]:
# Speech class: same pipeline as the music cell, with one-hot label [0, 1, 0].
# NOTE(review): absolute, machine-specific path.
dirname = 'C:/Users/Dhruv Goyal/Desktop/Ye Khol/Speech2.wav'
X_speech = createDataset(dirname)      #Speech Dataset
imageNo = X_speech.shape[0]            # overwrites the frame count from the previous cell
t2 =  np.array([[0,1,0]]*imageNo)  #Corresponding Ground Truths
# Sanity check: label rows and sample count should match.
print(t2.shape)
print(X_speech.shape[0])

(54392, 3)
54392


In [35]:
# Third class: built from noise2.wav and labelled [0, 0, 1]. The prediction
# code later names this class 'Silence'.
# NOTE(review): absolute, machine-specific path.
dirname = 'C:/Users/Dhruv Goyal/Desktop/Ye Khol/noise2.wav'
X_noise = createDataset(dirname)       #Silence Dataset
imageNo = X_noise.shape[0]             # overwrites the frame count from the previous cell
t3 =  np.array([[0,0,1]]*imageNo)      #Corresponding labels
# Sanity check: label rows and sample count should match.
print(t3.shape)
print(X_noise.shape[0])

(62501, 3)
62501


In [36]:
# Stack the three classes along the sample axis, in the order music, speech,
# noise, so inputs and labels stay aligned row by row.
# NOTE(review): the result is ordered by class and no held-out split is made here.
X_train = np.concatenate((X_music, X_speech, X_noise), axis = 0)       #concatenating music, speech and silence samples
t_train = np.concatenate((t1, t2, t3), axis = 0)
print(X_train.shape)
print(t_train.shape)

(173104, 27, 19)
(173104, 3)


In [37]:
# Stacked-LSTM classifier over three classes. The training samples are
# (27, 19) matrices, which the LSTM layers presumably consume as 27 time
# steps of 19 features each. Every Dropout(0.2) regularises the layer
# before it; the final softmax yields the per-class probabilities.
model = Sequential([
    LSTM(128, activation = 'relu', return_sequences = True),   # emits the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(128, activation = 'relu'),                            # emits only the last state
    Dropout(0.2),
    Dense(64, activation = 'relu'),
    Dropout(0.2),
    Dense(3, activation = 'softmax'),                          # one probability per class
])

In [38]:
# Adam optimizer with learning rate 1e-3 and a decay of 1e-5.
# NOTE(review): `decay` is a legacy Keras optimizer argument; newer
# TensorFlow releases may reject it — confirm against the installed version.
opt = tf.keras.optimizers.Adam(learning_rate = 1e-3, decay = 1e-5)   #Adam is faster than stochastic gradient descent

In [39]:
# Categorical cross-entropy matches the one-hot labels in t_train and the
# softmax output layer; accuracy is reported while fitting.
model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])     #loss used for multi-class classification

In [40]:
# Train for a single epoch with batches of 128 samples.
# NOTE(review): no validation data is passed, so this notebook gives no
# held-out measure of the model's quality.
model.fit(X_train, t_train, epochs = 1, batch_size = 128)      # training model 



<tensorflow.python.keras.callbacks.History at 0x25229badec8>

In [41]:
# pathName = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/test_samples/test_sample-0.npy'
def print_cluster(pathName):
    """Classify every frame of a saved spectrogram.

    The spectrogram is loaded with ``readSpectrogram``, scaled with
    ``normalize`` and cut into one 27x19 matrix per column, the same layout
    ``createDataset`` produces for training. All frames are then classified
    by the global ``model`` in a single batched ``predict`` call.

    Parameters
    ----------
    pathName : path of a ``.npy`` spectrogram whose columns are frames of
        513 (= 27 * 19) values.

    Returns
    -------
    list of str, one of 'Music', 'Speech' or 'Silence' per frame, in frame
    order. An empty spectrogram gives an empty list.
    """
    spectrogram = normalize(readSpectrogram(pathName))
    n_frames = spectrogram.shape[1]

    # Frames first, then each frame's 513 values laid out row-major as 27x19.
    X_test = np.zeros((n_frames, 27, 19))
    X_test[...] = spectrogram.T.reshape((n_frames, 27, 19))

    # Nothing to classify; also avoids handing an empty batch to Keras.
    if n_frames == 0:
        return []

    classes = ['Music', 'Speech', 'Silence']

    # The model takes a 3-D batch (samples, 27, 19). One predict() call over
    # all frames replaces one call per frame and yields the same per-frame
    # probability rows.
    probabilities = model.predict(X_test)

    # For each frame pick the class with the highest probability.
    return [classes[index] for index in np.argmax(probabilities, axis=1)]

In [42]:
def audioEventDetect(cluster, window_size = 0.064, overlap = 0.032):
    """Merge a sequence of per-frame labels into timed events.

    Consecutive frames with the same label are merged into one event. The
    first frame covers ``window_size`` seconds; every later frame adds
    ``window_size - overlap`` seconds, because consecutive frames overlap.

    Parameters
    ----------
    cluster : sequence of class labels, one per frame, in time order.
    window_size : duration in seconds of a single frame.
    overlap : seconds shared by two consecutive frames (default 0.032, the
        value that was previously hard-coded).

    Returns
    -------
    list of ``[start_time, end_time, label]`` lists in time order. An empty
    ``cluster`` gives an empty list.
    """
    # Guard: the first-frame lookup below would raise IndexError otherwise.
    if len(cluster) == 0:
        return []

    step = window_size - overlap          # time added by each further frame
    events = [[0, window_size, cluster[0]]]

    for label in cluster[1:]:
        current = events[-1]
        if label == current[2]:
            # Same class as the running event: just push its end time out.
            current[1] += step
        else:
            # Class changed: the new event starts where the previous one ended.
            onset = current[1]
            events.append([onset, onset + window_size - overlap, label])

    return events

In [47]:
import csv     # was used below without ever being imported
import glob
import os

# Column layouts of the two result files (kept for reference; no header row
# is written because both files are opened in append mode).
header1 = ['filename','event','onset','offset']
header2 = ['filename','Music','Speech']
countt = 0     # number of spectrogram files processed so far

file_paths = glob.glob('C:/Users/Dhruv Goyal/Desktop/mocktest_set/spectrogram' + '/**/*.npy',recursive = True)
for pathname in file_paths:
    # Name each result row after the file actually processed. glob does not
    # return paths in numeric order, so a running counter would attach
    # results to the wrong sample name.
    filename = os.path.splitext(os.path.basename(pathname))[0]
    cluster = print_cluster(pathname)      # one label per spectrogram frame

    # --- Event detection --------------------------------------------------
    # Majority vote over consecutive, non-overlapping groups of 29 frames.
    # 29 frames with a 0.032 s hop and a 0.064 s window span 0.96 s.
    # A trailing group with fewer than 29 frames is ignored.
    noOfFrames = 29
    new_cluster = []
    for start in range(0, len(cluster) - noOfFrames + 1, noOfFrames):
        group = cluster[start:start + noOfFrames]
        music_score = group.count('Music')
        speech_score = group.count('Speech')
        silence_score = len(group) - music_score - speech_score   # everything else counts as silence
        if music_score > speech_score and music_score > silence_score:
            new_cluster.append('Music')
        elif music_score <= speech_score and speech_score > silence_score:
            new_cluster.append('Speech')
        else:
            new_cluster.append('Silence')

    # Merge the 0.96 s labels into timed events. Files shorter than one
    # group produce no events at all.
    events = audioEventDetect(new_cluster, 0.96) if new_cluster else []
    # Drop silence by building a new list; removing items from a list while
    # iterating over it skips elements.
    events = [event for event in events if event[-1] != 'Silence']

    if events:
        # Open the results file once per input file, not once per row.
        with open('C:/Users/Dhruv Goyal/Desktop/event identification.csv', 'a',newline='') as f:
            writer = csv.writer(f)
            for event in events:
                # Row layout: filename, event class, onset, offset.
                writer.writerow([filename, event[2], event[0], event[1]])

    # --- Event tagging ----------------------------------------------------
    # A class is tagged present when more than 50 raw frames carry its label.
    music_frames = cluster.count('Music')
    speech_frames = cluster.count('Speech')
    print(speech_frames)
    m = 1 if music_frames > 50 else 0
    s = 1 if speech_frames > 50 else 0
    if m == 0:
        # Without music the file is always tagged as speech.
        s = 1

    with open('C:/Users/Dhruv Goyal/Desktop/event tagging.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([filename, m, s])
    countt = countt + 1

C:/Users/Dhruv Goyal/Desktop/mocktest_set/spectrogram\test_sample-0.npy
['Music', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Silence', 'Sile

In [16]:
# Persist the trained model to disk in HDF5 format.
# NOTE(review): `path` points at a CNN-named file although the model built
# in this notebook is a stacked LSTM; `path1` (the RNN-named file) is
# defined but never used — confirm which target is intended.
# NOTE(review): both paths are absolute and machine-specific, and this
# import sits outside the notebook's import cell.
from keras.models import load_model
path = 'D:/Videh_Acads/IITK/5th Sem/EE603/Project/CNN_Model/my_model_CNN.h5'
path1 = 'C:/Users/Videh Aggarwal/Downloads/my_model_RNN.h5'
model.save(path)                 # save model
#model = load_model(path)        # load model