In [1]:
import IPython.display
from IPython.display import Audio
import pandas
from pandas import DataFrame
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
ds = pandas.read_pickle("full_dataset_44100.pickle")


## Convert target column to 1-hot encoding
Our neural net will output a 2d vector. If the first element is 1 then its a noise, otherwise its a goal.

In [2]:
def to1hot(row):
    one_hot = np.zeros(2)
    one_hot[row]=1.0
    return one_hot

ds["one_hot_encoding"] = ds.target.apply(to1hot)


# Establishing the baseline

In [3]:
print "This is the error rate if we always guess the majority: %.2f" % \
(min(ds[ds["target"] == 0].index.size, ds[ds["target"] == 1].index.size) / (float)(ds.index.size))

This is the error rate if we always guess the majority: 0.49


# Form the training, testing, and validation data sets

In [4]:
ds["mels_flatten"] = ds.mels.apply(lambda mels: mels.flatten())
train_data = ds[0:160]
validation_data = ds[160:190]
test_data = ds[189:]

In [5]:
train_x = np.vstack(train_data.mels_flatten).reshape(train_data.shape[0],128, 87,1).astype(np.float32)
train_y = np.vstack(train_data["one_hot_encoding"])
train_size = train_y.shape[0]
validation_x = np.vstack(validation_data.mels_flatten).reshape(validation_data.shape[0],128, 87,1).astype(np.float32)
validation_y = np.vstack(validation_data["one_hot_encoding"])
test_x = np.vstack(test_data.mels_flatten).reshape(test_data.shape[0],128, 87,1).astype(np.float32)
test_y = np.vstack(test_data["one_hot_encoding"])

# Define the model

In [6]:
import tensorflow as tf
BATCH_SIZE = 160 # we have so little data, just set the batch size to the entire training set
NUM_CHANNELS = 1 
NUM_LABELS = 2
INPUT_SHAPE = (128,87)
SEED = 42

# This node is where we feed a batch of the training data and labels at each training step
train_data_node = tf.placeholder(tf.float32,shape=(BATCH_SIZE, INPUT_SHAPE[0], INPUT_SHAPE[1], 1))
train_labels_node = tf.placeholder(tf.float32, shape=(BATCH_SIZE, NUM_LABELS))

# constants for validation and tests
validation_data_node = tf.constant(validation_x)
test_data_node = tf.constant(test_x)

In [7]:
conv1_weights = tf.Variable(
    # The first 3 elements defines the shape of the filter, the last one is the number of feature maps it outputs
    # This 1d filter only looks at a small contiguous chunk of audio signal (550 samples, ~550ms) (
    # if the data was an image then one would probably use a 2d (greyscale) or even 3d (color) filter
    # The size of the filter can be anything, as long as it is smaller than the input
    tf.truncated_normal([2, 8, 1, 32], # Creating 32 feature maps.
    stddev=0.1, 
    seed=SEED))
conv1_biases = tf.Variable(tf.zeros([32])) #Each feature needs a bias for ReLU

conv2_weights = tf.Variable(
    tf.truncated_normal([30, 8, 32, 64], # Creating 64 feature maps. 
    stddev=0.1, 
    seed=SEED))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64]))

# 56 is the ceiling of 11025/50/2/2. 
# See comments below for explaination on the effect of stride size on the size of hidden layers
fc1_weights = tf.Variable(
    tf.truncated_normal([48 * 64, 512], stddev=0.1, seed=SEED))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512]))

fc2_weights = tf.Variable(
    tf.truncated_normal([512, NUM_LABELS], stddev=0.1, seed=SEED))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS]))
print 'done'

done


In [8]:
#wire the variables together

def model(data, train=False):
    """The Model definition."""
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
    # the same size as the input). Note that {strides} is a 4D array whose
    # shape matches the data layout: [image index, y, x, depth].
    
    print data.get_shape()
    conv = tf.nn.conv2d(data,
                      conv1_weights,
                      strides=[1, 2, 2, 1], # Since stride is 50, the filters moves 50 frames each time. Therefore the shape becomes ceiling(11025/50)
                      padding='SAME')
    print conv.get_shape()
    
    # Bias and rectified linear non-linearity.
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv1_biases))

    print relu.get_shape()
    # Max pooling. The kernel size spec ksize also follows the layout of
    # the data.
    pool = tf.nn.max_pool(relu,
                        ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1],
                        padding='SAME')

    print "pool_shape: %s" % pool.get_shape()
    
    conv = tf.nn.conv2d(pool,
                      conv2_weights,
                      strides=[1, 2, 2, 1],
                      padding='SAME')

    print "conv: %s" % conv.get_shape()
    
    relu = tf.nn.relu(tf.nn.bias_add(conv, conv2_biases))
    pool = tf.nn.max_pool(relu,
                        ksize=[1, 2, 2, 1],
                        strides=[1, 2, 2, 1], # max pool aggregates 2 units into 1, therefore the shape is halved again.
                        padding='SAME')

    print "pool: %s" % pool.get_shape()
    
    # Reshape the feature map cuboid into a 2D matrix to feed it to the
    # fully connected layers.

    pool_shape = pool.get_shape().as_list()
    
    reshape = tf.reshape(
      pool,
      [pool_shape[0], pool_shape[1] * pool_shape[2] * pool_shape[3]])

    # Fully connected layer. Note that the '+' operation automatically
    # broadcasts the biases.
    hidden = tf.nn.relu(tf.matmul(reshape, fc1_weights) + fc1_biases)

    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)
    return tf.matmul(hidden, fc2_weights) + fc2_biases

In [9]:
# Training computation: logits + cross-entropy loss.
logits = model(train_data_node, True)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
  logits, train_labels_node))

# L2 regularization for the fully connected parameters.
regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
# Add the regularization term to the loss.
loss += 5e-4 * regularizers

# Optimizer: set up a variable that's incremented once per batch and
# controls the learning rate decay.
batch = tf.Variable(0)
# Decay once per epoch, using an exponential schedule starting at 0.01.
learning_rate = tf.train.exponential_decay(
  0.01,                # Base learning rate.
  batch * BATCH_SIZE,  # Current index into the dataset.
  train_size,          # Decay step.
  0.99,                # Decay rate.
  staircase=True)
# Use simple momentum for the optimization.
optimizer = tf.train.MomentumOptimizer(learning_rate,
                                       0.9).minimize(loss,
                                                     global_step=batch)

# Predictions for the minibatch, validation set and test set.
train_prediction = tf.nn.softmax(logits)
# We'll compute them only once in a while by calling their {eval()} method.
validation_prediction = tf.nn.softmax(model(validation_data_node))
test_prediction = tf.nn.softmax(model(test_data_node))

print 'Done'

(160, 128, 87, 1)
(160, 64, 44, 32)
(160, 64, 44, 32)
pool_shape: (160, 32, 22, 32)
conv: (160, 16, 11, 64)
pool: (160, 8, 6, 64)
(30, 128, 87, 1)
(30, 64, 44, 32)
(30, 64, 44, 32)
pool_shape: (30, 32, 22, 32)
conv: (30, 16, 11, 64)
pool: (30, 8, 6, 64)
(20, 128, 87, 1)
(20, 64, 44, 32)
(20, 64, 44, 32)
pool_shape: (20, 32, 22, 32)
conv: (20, 16, 11, 64)
pool: (20, 8, 6, 64)
Done


# Training

In [10]:
# Create a new interactive session that we'll use in
# subsequent code cells.
s = tf.InteractiveSession()

# Use our newly created session as the default for 
# subsequent operations.
s.as_default()

# Initialize all the variables we defined above.
tf.initialize_all_variables().run()

In [None]:
def error_rate(predictions, labels):
    # We use argmax to convert prediction probabilities into 1-hot encoding and compare it against the labels
    correct = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
    total = predictions.shape[0]

    error = 100.0 - (100 * float(correct) / float(total))

    confusions = np.zeros([NUM_LABELS, NUM_LABELS], np.float32)

    bundled = zip(np.argmax(predictions, 1), np.argmax(labels, 1))
    for predicted, actual in bundled:
        confusions[predicted, actual] += 1
    return error, confusions

print 'Done'

Done


In [None]:
# I modified the original code to use the entire training set instead of mini batches
for i in range(200):
    # Train over the first 1/4th of our training set.
#     steps = int(train_size / BATCH_SIZE)
    #for step in xrange(steps):
    # Compute the offset of the current minibatch in the data.
    # Note that we could use better randomization across epochs.
    offset = 0 #(step * BATCH_SIZE) % (train_size - BATCH_SIZE)
    batch_data = train_x[offset:(offset + BATCH_SIZE), :, :, :]
    batch_labels = train_y[offset:(offset + BATCH_SIZE)]
    # This dictionary maps the batch data (as a numpy array) to the
    # node in the graph it should be fed to.
    feed_dict = {train_data_node: batch_data,
               train_labels_node: batch_labels}
    # Run the graph and fetch some of the nodes.
    _, l, lr, predictions = s.run(
    [optimizer, loss, learning_rate, train_prediction],
    feed_dict=feed_dict)

      # Print out the loss periodically.
    if i % 20 == 0:
        error, _ = error_rate(predictions, batch_labels)
        print 'Mini-batch loss: %.5f Error: %.5f Learning rate: %.5f' % (l, error, lr)
        print 'Validation error: %.5f' % error_rate(
            validation_prediction.eval(), validation_y)[0]


Mini-batch loss: 4.12113 Error: 56.87500 Learning rate: 0.01000
Validation error: 60.00000


In [None]:
test_error, confusions = error_rate(test_prediction.eval(), test_y)
print 'Test error: %.5f' % test_error

plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.grid(False)
plt.xticks(np.arange(1))
plt.yticks(np.arange(1))
plt.imshow(confusions, cmap=plt.cm.jet, interpolation='nearest');

for i, cas in enumerate(confusions):
  for j, count in enumerate(cas):
    if count > 0:
      xoff = .07 * len(str(count))
      plt.text(j-xoff, i+.2, int(count), fontsize=9, color='white')

In [None]:
def show_sample(i):
    print i, test_data.iloc[i]["target"]
    IPython.display.display(IPython.display.Audio(data=test_data.iloc[i]["data"], rate=44100))
res = np.argmax(test_prediction.eval(),1) == np.argmax(test_y, 1)
right = []
wrong = []
for i, v in enumerate(res.tolist()):
    if v:
        right.append(i)
    else:
        wrong.append(i)

#These are the samples our model got incorrect
for w in wrong:
    show_sample(w)

# Save the model

In [None]:
# saver = tf.train.Saver()
# save_path = saver.save(s, "model.ckpt")
# print("Model saved in file: %s" % save_path)