In [1]:
'''Trains a simple convnet on the MNIST dataset.
Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
16 seconds per epoch on a GRID K520 GPU.
'''

from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

from pathlib import Path
import librosa
import scipy
import pandas as pd
import numpy as np
import glob, os
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


## Define Functions

In [2]:
def extract_features(signal, fs):
    # Always Need to give an input for sampling rate so that other higher level features can be extracter
   
    zcr = librosa.feature.zero_crossing_rate(signal); #Zero Crossing Rate
    spec_cent = librosa.feature.spectral_centroid(signal,fs);  #Spectral Centroid
    tontz = librosa.feature.tonnetz(signal,fs) #tonal centroid features
    S = librosa.feature.melspectrogram(signal,fs) # Mel Spectorgram
    mfc = librosa.feature.mfcc(signal,fs,S) #Mel frequency Coefficents
    return zcr, spec_cent, tontz, S, mfc
  
    
## Function to load all .wav files in a given folder to an array of (data, sample rate) ##
def load_folder(data_path):
    # input string of path to specific folder
    samples = [];
    
    for file in glob.glob(os.path.join(data_path,'*.wav')):
        temp,sr = librosa.load(file);
        temp = librosa.util.fix_length(temp,2*sr);
        samples.append([temp,sr])
        
    return samples
            

## Load Audio and Extract Data

In [3]:
# CHANGE PATHS TO YOUR FOLDERS
kick_path = 'audio\kickSamples'; 
snare_path = 'audio\snareSamples';

kickSamples = load_folder(kick_path);
snareSamples = load_folder(snare_path);


In [4]:

kickFeat = [extract_features(x[0], x[1]) for x in kickSamples]
snareFeat = [extract_features(x[0], x[1]) for x in snareSamples]


class_tags = np.ones(len(kickSamples)) 
class_tags = np.append(class_tags, np.zeros(len(snareSamples)))

zero_co= []
cent_freq = []
tonal_ctroid = []
mfccs = []

i = 0
for s in kickFeat:
    zero_co.append(s[0][i])
    cent_freq.append( s[1][i])
    tonal_ctroid.append(s[2][i])
    mfccs.append(s[4][i])


for s in snareFeat:
    zero_co.append(s[0][i])
    cent_freq.append( s[1][i])
    tonal_ctroid.append(s[2][i])
    mfccs.append(s[4][i])

    
zero_co = (zero_co, class_tags)
cent_freq = (cent_freq,  class_tags)
tonal_ctroid = (tonal_ctroid, class_tags)
mfccs = (mfccs,  class_tags)

# datasets = [make_moons(noise=0.3),make_circles(noise=0.3),zero_co,cent_freq,tonal_ctroid,mfccs]
h = .02  # step size in the mesh


In [5]:
#ds is a tuple
# unpacks X as the feature training set, y as the target values
X, y = mfccs
# StandardScaler will remove the mean and scaling to unit variance
# fit_transform: fits then transforms
# fit: Compute the mean and std to be used for later scaling
# transform: Perform standardization by centering and scaling
X = StandardScaler().fit_transform(X)
# splits the features into random train and test data
x_train, x_test, y_train, y_test = \
    train_test_split(X, y, test_size=.4, random_state=42)

In [6]:
x_train.shape

(147, 87)

In [10]:
batch_size = 128
num_classes = 10
epochs = 12

# input image dimensions
# img_rows, img_cols = 28, 28

# the data, split between train and test sets
# (x_train, y_train), (x_test, y_test) = mnist.load_data()

# if K.image_data_format() == 'channels_first':
#     x_train = x_train.reshape(x_train.shape[0], 1)
#     x_test = x_test.reshape(x_test.shape[0], 1)
#     input_shape = (1, img_rows, img_cols)
# else:

x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1, 1)
x_test = x_test.reshape(x_test.shape[0],  x_test.shape[1], 1, 1)
input_shape = (x_test.shape[1], 1, 1)

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')



x_train shape: (147, 87, 1, 1)
147 train samples
98 test samples


In [11]:
# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

# This is where the model is constructed:
#This format is a convolutional NN
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))



model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])


In [12]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

ValueError: Negative dimension size caused by subtracting 3 from 1 for 'conv2d_3/convolution' (op: 'Conv2D') with input shapes: [?,87,1,1], [3,3,1,32].