In [1]:
# importing all the dependencies
import pandas as pd # data frame
import numpy as np # matrix math
from glob import glob # file handling
import librosa # audio manipulation
from sklearn.utils import shuffle # shuffling of data
import os # interation with the OS
from random import sample # random selection
from tqdm import tqdm
from scipy import signal # audio processing
from scipy.io import wavfile # reading the wavfile

In [2]:
# fixed param
PATH = './data/train/audio/'

In [28]:
def load_files(path):
    # write the complete file loading function here, this will return
    # a dataframe having files and labels
    # loading the files
    train_labels = os.listdir(PATH)
    train_labels.remove('_background_noise_')
    print(train_labels)
    
    labels_to_keep = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', '_background_noise_']

    train_file_labels = dict()
    for label in train_labels:
        files = os.listdir(PATH + '/' + label)
        print(files)
        for f in files:
            train_file_labels[label + '/' + f] = label

    train = pd.DataFrame.from_dict(train_file_labels, orient='index')
    print(train)
    train = train.reset_index(drop=False)
    train = train.rename(columns={'index': 'file', 0: 'folder'})
    train = train[['folder', 'file']]
    train = train.sort_values('file')
    train = train.reset_index(drop=True)

    def remove_label_from_file(label, fname):
        return  fname[len(label)+1:] #path + label + '/' + fname[len(label)+1:]

    train['file'] = train.apply(lambda x: remove_label_from_file(*x), axis=1)
    train['label'] = train['folder'].apply(lambda x: x if x in labels_to_keep else 'unknown')

    labels_to_keep.append('unknown')

    return train, labels_to_keep

In [29]:
train, labels_to_keep = load_files(PATH)

# making word2id dict
word2id = dict((c,i) for i,c in enumerate(sorted(labels_to_keep)))

# get some files which will be labeled as unknown
unk_files = train.loc[train['label'] == 'unknown']['file'].values
unk_files = sample(list(unk_files), 1000)

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero']
['00176480_nohash_0.wav', '004ae714_nohash_0.wav', '004ae714_nohash_1.wav', '00f0204f_nohash_0.wav', '00f0204f_nohash_1.wav', '012c8314_nohash_0.wav', '012c8314_nohash_1.wav', '0132a06d_nohash_0.wav', '0135f3f2_nohash_0.wav', '0137b3f4_nohash_0.wav', '014f9f65_nohash_0.wav', '01648c51_nohash_0.wav', '01648c51_nohash_1.wav', '016e2c6d_nohash_0.wav', '01b4757a_nohash_0.wav', '01b4757a_nohash_1.wav', '01bcfc0c_nohash_0.wav', '0227998e_nohash_0.wav', '026290a7_nohash_0.wav', '02746d24_nohash_0.wav', '035de8fe_nohash_0.wav', '0362539c_nohash_0.wav', '0362539c_nohash_1.wav', '037c445a_nohash_0.wav', '0397ecda_nohash_0.wav', '03c96658_nohash_0.wav', '042186b8_nohash_0.wav', '042ea76c_nohash_0.wav', '0447d7c1_nohash_0.wav', '0474c92a_nohash_0.wav', '0474c92a_nohas

['004ae714_nohash_0.wav', '012c8314_nohash_0.wav', '0132a06d_nohash_0.wav', '0132a06d_nohash_1.wav', '0132a06d_nohash_2.wav', '0132a06d_nohash_3.wav', '0132a06d_nohash_4.wav', '0135f3f2_nohash_0.wav', '0135f3f2_nohash_1.wav', '0137b3f4_nohash_0.wav', '0137b3f4_nohash_1.wav', '0137b3f4_nohash_2.wav', '0137b3f4_nohash_3.wav', '0137b3f4_nohash_4.wav', '014f9f65_nohash_0.wav', '014f9f65_nohash_1.wav', '01648c51_nohash_0.wav', '019fa366_nohash_0.wav', '019fa366_nohash_1.wav', '01b4757a_nohash_0.wav', '01b4757a_nohash_1.wav', '01b4757a_nohash_2.wav', '01bb6a2a_nohash_0.wav', '01bb6a2a_nohash_1.wav', '01bb6a2a_nohash_2.wav', '01bcfc0c_nohash_0.wav', '01bcfc0c_nohash_1.wav', '01d22d03_nohash_0.wav', '0227998e_nohash_0.wav', '0227998e_nohash_1.wav', '0227998e_nohash_2.wav', '0227998e_nohash_3.wav', '022cd682_nohash_0.wav', '023a61ad_nohash_0.wav', '023a61ad_nohash_1.wav', '026290a7_nohash_0.wav', '02e85b60_nohash_0.wav', '030ec18b_nohash_0.wav', '030ec18b_nohash_1.wav', '035de8fe_nohash_0.wav',

In [5]:
word2id

{'_background_noise_': 0,
 'down': 1,
 'go': 2,
 'left': 3,
 'no': 4,
 'off': 5,
 'on': 6,
 'right': 7,
 'stop': 8,
 'unknown': 9,
 'up': 10,
 'yes': 11}

In [30]:
unk_files[:10]

['da76aa58_nohash_0.wav',
 '026290a7_nohash_0.wav',
 '161fcca8_nohash_1.wav',
 '0132a06d_nohash_1.wav',
 '6205088b_nohash_0.wav',
 '6cb6eee7_nohash_0.wav',
 '834f03fe_nohash_4.wav',
 '70a00e98_nohash_2.wav',
 '9e2ce5e3_nohash_0.wav',
 '840c366d_nohash_0.wav']

In [31]:
train.sample(12)

Unnamed: 0,folder,file,label
25703,marvin,ab5ae445_nohash_0.wav,unknown
59983,yes,0137b3f4_nohash_2.wav,yes
10600,eight,92a9c5e6_nohash_4.wav,unknown
36534,one,56eab10e_nohash_0.wav,unknown
56919,up,75915c90_nohash_0.wav,up
48254,stop,8bf6acb9_nohash_0.wav,stop
10650,eight,97f493b9_nohash_1.wav,unknown
34520,on,7910d292_nohash_0.wav,on
7645,down,538e1856_nohash_0.wav,down
46969,six,fcb25a78_nohash_0.wav,unknown


In [32]:
from glob import glob
files = glob(PATH + '_bac*/*.wav')
print(files)

['./data/train/audio\\_background_noise_\\doing_the_dishes.wav', './data/train/audio\\_background_noise_\\dude_miaowing.wav', './data/train/audio\\_background_noise_\\exercise_bike.wav', './data/train/audio\\_background_noise_\\pink_noise.wav', './data/train/audio\\_background_noise_\\running_tap.wav', './data/train/audio\\_background_noise_\\white_noise.wav']


In [33]:
# silence background samples
all_sil = []
for s in files:
    sr, audio = wavfile.read(s)
    # converting the file into samples of 1 sec each
    len_ = int(len(audio)/sr)
    print(len_)
    for i in range(len_-1):
        sample_ = audio[i*sr:(i+1)*sr]
        all_sil.append(sample_)
print(len(all_sil))
print(all_sil[0].shape)
sil_data =  np.zeros((392, 16000, ))
for i,d in enumerate(all_sil):
    sil_data[i] = d
print(sil_data.shape)

95
61
61
60
61
60
392
(16000,)
(392, 16000)


  sr, audio = wavfile.read(s)


In [34]:
# Writing functions to extract the data, script from kdnuggets: 
# www.kdnuggets.com/2016/09/urban-sound-classification-neural-networks-tensorflow.html
def extract_feature(path):
	X, sample_rate = librosa.load(path)
	stft = np.abs(librosa.stft(X))
	mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
	chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
	mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
	contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
	tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
	return mfccs,chroma,mel,contrast,tonnetz

def parse_audio_files(files, word2id, unk = False):
    # n: number of classes
    features = np.empty((0,193))
    one_hot = np.zeros(shape = (len(files), word2id[max(word2id)]))
    print(one_hot.shape)
    for i in tqdm(range(len(files))):
        f = files[i]
        mfccs, chroma, mel, contrast,tonnetz = extract_feature(f)
        ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
        features = np.vstack([features,ext_features])
        if unk == True:
            l = word2id['unknown']
            one_hot[i][l] = 1.
        else:
            l = word2id[f.split('/')[-2]]
            one_hot[i][l] = 1.
    return np.array(features), one_hot

In [35]:
files = train.loc[train['label'] != 'unknown']['file'].values
print(len(files))
print(files[:10])

23682
['00176480_nohash_0.wav' '004ae714_nohash_0.wav' '00b01445_nohash_0.wav'
 '00b01445_nohash_1.wav' '00f0204f_nohash_0.wav' '0132a06d_nohash_0.wav'
 '0132a06d_nohash_1.wav' '0132a06d_nohash_2.wav' '0132a06d_nohash_3.wav'
 '0132a06d_nohash_4.wav']


## Playing around with the single audio clip
We now look at a single audio clip and see how it goes.

In [12]:
# playing around with the data for now
train_audio_path = '../input/train/audio/'
filename = '/tree/24ed94ab_nohash_0.wav' # --> 'Yes'
sample_rate, audio = wavfile.read(str(train_audio_path) + filename)

In [13]:
plt.figure(figsize = (15, 4))
plt.plot(audio)
ipd.Audio(audio, rate=sample_rate)

NameError: name 'plt' is not defined

In [14]:
# goto: https://medium.com/@ageitgey/machine-learning-is-fun-part-6-how-to-do-speech-recognition-with-deep-learning-28293c162f7a
# We convert it into chunks of 20ms each i.e. units of 320 
audio_chunks = []
n_chunks = int(audio.shape[0]/320)
for i in range(n_chunks):
    chunk = audio[i*320: (i+1)*320]
    audio_chunks.append(chunk)
audio_chunk = np.array(audio_chunks)

In [43]:
# we now convert it to spertogram
# goto: https://www.kaggle.com/davids1992/data-visualization-and-investigation
def log_specgram(audio, sample_rate, window_size=10,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    _, _, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return np.log(spec.T.astype(np.float32) + eps)

In [16]:
spectrogram = log_specgram(audio, sample_rate, 10, 0)
spec = spectrogram.T
print(spec.shape)
plt.figure(figsize = (15,4))
plt.imshow(spec, aspect='auto', origin='lower')

(81, 100)


NameError: name 'plt' is not defined

## Making the data
Now that we know about the shape of the data, we will finally make the total processed data.

In [36]:
# make labels and convert them into one hot encodings
labels = sorted(labels_to_keep)
word2id = dict((c,i) for i,c in enumerate(labels))
label = train['label'].values
label = [word2id[l] for l in label]
print(labels)
def make_one_hot(seq, n):
    # n --> vocab size
    seq_new = np.zeros(shape = (len(seq), n))
    for i,s in enumerate(seq):
        seq_new[i][s] = 1.
    return seq_new
one_hot_l = make_one_hot(label, 12)

['_background_noise_', 'down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'unknown', 'up', 'yes']


In [37]:
print(one_hot_l[10:15])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]


In [38]:
one_hot_l[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [39]:
train

Unnamed: 0,folder,file,label
0,bed,00176480_nohash_0.wav,unknown
1,bed,004ae714_nohash_0.wav,unknown
2,bed,004ae714_nohash_1.wav,unknown
3,bed,00f0204f_nohash_0.wav,unknown
4,bed,00f0204f_nohash_1.wav,unknown
...,...,...,...
64716,zero,ffd2ba2f_nohash_1.wav,unknown
64717,zero,ffd2ba2f_nohash_2.wav,unknown
64718,zero,ffd2ba2f_nohash_3.wav,unknown
64719,zero,ffd2ba2f_nohash_4.wav,unknown


In [40]:
# getting all the paths to the files
paths = []
folders = train['folder']
files = train['file']
for i in range(len(files)):
    path = './data/train/audio/' + str(folders[i]) + '/' + str(files[i])
    paths.append(path)

In [41]:
def audio_to_data(path):
    # we take a single path and convert it into data
    sample_rate, audio = wavfile.read(path)
    spectrogram = log_specgram(audio, sample_rate, 10, 0)
    return spectrogram.T

def paths_to_data(paths,labels):
    data = np.zeros(shape = (len(paths), 81, 100))
    indexes = []
    for i in tqdm(range(len(paths))):
        audio = audio_to_data(paths[i])
        if audio.shape != (81,100):
            indexes.append(i)
        else:
            data[i] = audio
    final_labels = [l for i,l in enumerate(labels) if i not in indexes]
    print('Number of instances with inconsistent shape:', len(indexes))
    return data[:len(data)-len(indexes)], final_labels, indexes

In [44]:
d,l,indexes = paths_to_data(paths,one_hot_l)

MemoryError: Unable to allocate 3.91 GiB for an array with shape (64721, 81, 100) and data type float64

In [23]:
labels = np.zeros(shape = [d.shape[0], len(l[0])])
for i,array in enumerate(l):
    for j, element in enumerate(array):
        labels[i][j] = element
print(labels.shape)

NameError: name 'l' is not defined

In [24]:
print(d.shape)
print(labels.shape)

(16000,)


AttributeError: 'list' object has no attribute 'shape'

In [25]:
d,labels = shuffle(d,labels)

ValueError: Found input variables with inconsistent numbers of samples: [16000, 12]

In [26]:
print(d[0].shape)
print(labels[0].shape)

()


AttributeError: 'str' object has no attribute 'shape'

## Machine learning model
Using a LSTM network to determine the text

In [27]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

Using TensorFlow backend.
  return f(*args, **kwds)


In [28]:
model = Sequential()
model.add(LSTM(256, input_shape = (81, 100)))
# model.add(Dense(1028))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Dropout(0.2))
model.add(Dense(12, activation = 'softmax'))
model.compile(optimizer = 'Adam', loss = 'mean_squared_error', metrics = ['accuracy'])

In [29]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 12)                1548      
Total params: 400,012
Trainable params: 400,012
Non-trainable params: 0
_________________________________________________________________


In [30]:
model.fit(d, labels, batch_size = 1024, epochs = 10)

ValueError: Error when checking input: expected lstm_1_input to have 3 dimensions, but got array with shape (16000, 1)

## Add further for testing modules
Add modules for testing and saving the files, will keeo improving the model in the future.