In [1]:
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from tensorflow.contrib.keras import models
from tensorflow.contrib.keras import layers
from tensorflow.contrib import keras
%matplotlib inline
# Plot styling: ggplot theme, Ubuntu fonts and compact label sizes.
plt.style.use('ggplot')

plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Ubuntu',
    'font.monospace': 'Ubuntu Mono',
    'font.size': 12,
    'axes.labelsize': 11,
    'axes.labelweight': 'bold',
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 11,
    'figure.titlesize': 13,
})

In [2]:
def windows(data, window_size):
    """Yield ``(start, end)`` index pairs for half-overlapping windows over ``data``.

    Consecutive windows advance by half a window. The last windows may reach
    past ``len(data)``; callers are expected to discard slices that come back
    shorter than ``window_size``.

    Args:
        data: Any sized sequence (only ``len(data)`` is used).
        window_size: Window length in samples; must be positive.

    Yields:
        ``(start, end)`` tuples of ints, ``window_size`` apart for an integer
        ``window_size``.

    Raises:
        ValueError: If ``window_size`` is not positive. Without this guard the
            loop could never advance ``start`` and would not terminate.
    """
    if window_size <= 0:
        raise ValueError('window_size must be positive, got %r' % (window_size,))
    # Half-window hop; it may be fractional, hence the int() casts below.
    hop = window_size / 2
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += hop

def extract_features(parent_dir,sub_dirs,file_ext="*.wav",bands = 60, frames = 41):
    """Build two-channel log-mel features for every matching clip under ``sub_dirs``.

    Each clip is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples (512 being librosa's default hop length, so one window produces
    ``frames`` spectrogram columns). Windows shorter than that are dropped.

    Args:
        parent_dir: Directory containing the fold sub-directories.
        sub_dirs: Iterable of sub-directory names to scan.
        file_ext: Glob pattern selecting audio files inside each sub-directory.
        bands: Number of mel bands per spectrogram.
        frames: Number of spectrogram columns per window.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_windows, bands, frames, 2)`` (channel 0: log-mel spectrogram,
        channel 1: its delta) and ``labels`` is an int array of length
        ``n_windows`` taken from the second dash-separated field of each
        file name.
    """
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for sub_dir in sub_dirs:
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound_clip, sample_rate = librosa.load(fn)
            # basename() instead of split('/') so Windows-style paths work too.
            label = os.path.basename(fn).split('-')[1]
            for (start, end) in windows(sound_clip, window_size):
                signal = sound_clip[start:end]
                if len(signal) != window_size:
                    continue  # trailing partial window
                melspec = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=bands)
                # NOTE(review): logamplitude only exists in older librosa releases;
                # newer ones offer power_to_db instead -- confirm before upgrading.
                logspec = librosa.logamplitude(melspec)
                # Keep the (bands, frames) matrix as-is. The previous
                # `logspec.T.flatten()` produced frame-major data that was then
                # reshaped as (bands, frames), scrambling time and frequency.
                log_specgrams.append(logspec)
                labels.append(label)

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    # Builtin int replaces the removed `np.int` alias.
    return np.array(features), np.array(labels, dtype=int)

def one_hot_encode(labels, n_classes=None):
    """Convert integer class ids into a one-hot matrix.

    Args:
        labels: 1-D sequence of non-negative integer class ids.
        n_classes: Number of columns in the result. Defaults to
            ``max(labels) + 1`` so that ids which skip values (e.g. ``[1, 3]``)
            still index valid columns; sizing by the number of unique ids
            raised ``IndexError`` in that case. For ids covering ``0..k-1``
            the default width is the same ``k`` as before.

    Returns:
        Float array of shape ``(len(labels), n_classes)`` with a single 1 per row.

    Raises:
        ValueError: If a label is negative or not smaller than ``n_classes``.
    """
    labels = np.asarray(labels, dtype=int)
    n_labels = len(labels)
    if n_labels and labels.min() < 0:
        raise ValueError('labels must be non-negative class ids')
    if n_classes is None:
        n_classes = int(labels.max()) + 1 if n_labels else 0
    elif n_labels and labels.max() >= n_classes:
        raise ValueError('label %d does not fit in n_classes=%d' % (labels.max(), n_classes))
    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

In [3]:
def _log_mel_windows(path, window_size, bands):
    """Return the flattened log-mel spectrogram of every full window in one audio file."""
    sound_clip, sample_rate = librosa.load(path)
    specs = []
    count = 1
    for (start, end) in windows(sound_clip, window_size):
        signal = sound_clip[start:end]
        if len(signal) != window_size:
            continue  # trailing partial window
        print('\nwindow %d shapes:'%count)
        melspec = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=bands)
        print('melspec.shape - %s'%str(melspec.shape))
        # NOTE(review): logamplitude only exists in older librosa releases;
        # newer ones offer power_to_db instead -- confirm before upgrading.
        logspec = librosa.logamplitude(melspec)
        print('logspec.shape - %s'%str(logspec.shape))
        # Row-major flatten of the (bands, frames) matrix, so the later reshape
        # to (bands, frames) restores it exactly. Flattening the transpose
        # first (as before) scrambled time and frequency.
        logspec = logspec.reshape(1, -1)
        print('logspec.shape - %s'%str(logspec.shape))
        specs.append(logspec)
        count = count + 1
    return specs


def extract_single_file_feature(fn, fn2, bands = 25, frames = 41):
    """Extract two-channel log-mel features from two audio files, printing shapes.

    Debug-oriented variant of the batch extractor: every intermediate shape is
    printed. The label of each window is the file's base name without
    extension, which must therefore be an integer string (e.g. ``1.wav``).

    Args:
        fn: Path of the first audio file.
        fn2: Path of the second audio file.
        bands: Number of mel bands per spectrogram.
        frames: Number of spectrogram columns per window.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_windows, bands, frames, 2)`` (log-mel spectrogram and its delta)
        and ``labels`` is an int array with one entry per window.

    Raises:
        ValueError: If a file's base name cannot be converted to an integer.
    """
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    for path in (fn, fn2):
        label = os.path.basename(os.path.splitext(path)[0])
        file_specs = _log_mel_windows(path, window_size, bands)
        log_specgrams.extend(file_specs)
        labels.extend([label] * len(file_specs))

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    print('\nlog_specgrams.shape - %s'%str(log_specgrams.shape))
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    print('features.shape - %s'%str(features.shape))
    print('len of features = %s'%str(len(features)))
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    print('features.shape - %s'%str(features.shape))
    # Builtin int replaces the removed `np.int` alias.
    return np.array(features), np.array(labels, dtype=int)

In [36]:
# Two recordings from the same folder; their file stems ("1" and "3") become the labels.
first_wav = '/Users/Gundeep/Dropbox/notebook/sounds/recorded/large/ah_drakht_boht_sohna_lag_reha/1.wav'
second_wav = '/Users/Gundeep/Dropbox/notebook/sounds/recorded/large/ah_drakht_boht_sohna_lag_reha/3.wav'
ft, lb = extract_single_file_feature(first_wav, second_wav)


window 1 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 2 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 3 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 4 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 5 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 1 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 2 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 3 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 4 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

window 5 shapes:
melspec.shape - (25, 41)
logspec.shape - (25, 41)
logspec.shape - (1, 1025)

log_specgrams.shape - (10, 25, 41, 1)
features.shape - (10,

In [37]:
# Labels returned by the single-file extraction, one entry per window.
lb

array([1, 1, 1, 1, 1, 3, 3, 3, 3, 3])

In [6]:
# Extract features from the first two UrbanSound8K folds, then one-hot encode
# the integer class ids. The raw ids keep their own name so `labels` is bound
# only once in this cell.
parent_dir = '/home/paperspace/Documents/Projects/Urban Sound Classification/UrbanSound8K/audio/'
sub_dirs = ['fold1', 'fold2']
features, raw_labels = extract_features(parent_dir, sub_dirs)
labels = one_hot_encode(raw_labels)

### Dump to pickle

In [7]:
# Random ~70/30 split. A dedicated seeded generator makes the split (and the
# pickle written from it) reproducible across kernel restarts.
# NOTE(review): the split is per window, so half-overlapping windows cut from
# the same clip can land in both train and test -- consider splitting by file.
split_rng = np.random.RandomState(0)
rnd_indices = split_rng.rand(len(labels)) < 0.70

train_x = features[rnd_indices]
train_y = labels[rnd_indices]
test_x = features[~rnd_indices]
test_y = labels[~rnd_indices]

In [9]:
# dump to pickle
import pickle

# .get() avoids a KeyError when HOME is not defined in the environment.
if os.environ.get('HOME') == '/Users/Gundeep':
    pickle_filename = 'urban_pickle_cnn.pickle'
else:
    pickle_filename = '/home/paperspace/Documents/Projects/Urban Sound Classification/urban_pickle_cnn.pickle'

pickle_dump = {'train_x' : train_x,
               'train_y' : train_y,
               'test_x' : test_x,
               'test_y' : test_y}

# Best effort: a failed save is reported but does not stop the notebook.
try:
    with open(pickle_filename, 'wb') as f:
        pickle.dump(pickle_dump, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_filename, ':', e)

### Load from pickle

In [3]:
# load from pickle
import pickle

# .get() avoids a KeyError when HOME is not defined in the environment.
if os.environ.get('HOME') == '/Users/Gundeep':
    pickle_filename = 'urban_pickle_cnn.pickle'
else:
    pickle_filename = '/home/paperspace/Documents/Projects/Urban Sound Classification/urban_pickle_cnn.pickle'

# Stays an empty dict when loading fails; the error is printed, not raised.
pickle_dump = {}
try:
    # SECURITY: pickle.load can execute arbitrary code. Only load files that
    # were written by this notebook.
    with open(pickle_filename, 'rb') as f:
        pickle_dump = pickle.load(f)
except Exception as e:
    print('Unable to load pickle', pickle_filename, ':', e)

In [5]:
# Pull the four arrays back out of the loaded dictionary.
train_x, train_y, test_x, test_y = (
    pickle_dump[key] for key in ('train_x', 'train_y', 'test_x', 'test_y')
)

In [6]:
# Report the shape of each split (format strings kept exactly as before).
for template, split in (('train_x:%s', train_x),
                        ('train_y%s', train_y),
                        ('test_x%s', test_x),
                        ('test_y:%s', test_y)):
    print(template % str(split.shape))

train_x:(7594, 60, 41, 2)
train_y(7594, 10)
test_x(3240, 60, 41, 2)
test_y:(3240, 10)


### Create neural net

In [12]:
# Short aliases for the Keras classes used to build the model.
Sequential, Dense, Conv2D = models.Sequential, layers.Dense, layers.Conv2D

In [19]:
# Input geometry; matches the (60, 41, 2) per-sample shape of the loaded features.
bands = 60
frames = 41
num_channels = 2
feature_size = bands * frames  # 60 x 41 = 2460
num_classes = 10

# Network size.
kernel_size = 30
depth = 20
num_hidden = 200

# Optimisation.
learning_rate = 0.01
batch_size = 50
epochs = 2000
training_iterations = 2000

In [14]:
# Per-sample shape of the training features; the same expression is passed as the model's input_shape.
train_x.shape[1:]

(60, 41, 2)

In [15]:
# One 5x5 convolution, flattened into a sigmoid hidden layer and a softmax output.
model = Sequential([
    Conv2D(depth, (5,5), input_shape=train_x.shape[1:], activation='relu', padding='same'),
    layers.Flatten(),
    Dense(num_hidden, activation='sigmoid'),
    Dense(num_classes, activation='softmax'),
])

In [16]:
# Adam optimiser at the configured learning rate (no decay argument is passed).
opt = keras.optimizers.Adam(lr=learning_rate)

In [17]:
# Compile with categorical cross-entropy loss and the optimiser above, tracking accuracy.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

### Set up file logger

In [16]:
# Take the callback from the same Keras implementation as the model
# (`tensorflow.contrib.keras`, bound to `keras` by the imports at the top)
# rather than from the separately installed standalone `keras` package, so
# model and callback classes are not mixed across two libraries.
CSVLogger = keras.callbacks.CSVLogger
# Appends one ';'-separated row per epoch to log.csv.
csv_logger = CSVLogger('log.csv', append=True, separator=';')

Using TensorFlow backend.


In [None]:
# Train with the held-out split as validation data. csv_logger is passed as a
# callback so per-epoch metrics are actually written to log.csv; without it
# the logger object is created but never used.
model.fit(train_x, train_y,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(test_x, test_y),
          shuffle=True,
          callbacks=[csv_logger])

Train on 7594 samples, validate on 3240 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000