In [None]:
# default data download is to ~/.keras

In [None]:
# the big examples: (based on examples from:
#     https://github.com/fchollet/keras/tree/master/examples)
# mlp mnist, cnn mnist, cnn cifar

In [None]:
# FIXME:  add model names so we can refer back to them 
#     (i.e., they don't clobber each other)

In [None]:
import keras

from keras.datasets import mnist, cifar10

# don't usually prefer/do direct imports, but so be it ...
from keras.models import Sequential
from keras.layers import (Activation, 
                          Dense, Dropout, Flatten,
                          Conv2D, MaxPooling2D)

import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Our Friend - Sigmoid

$$
\sigma(x) = \frac{1}{1 + e^{-x}}
$$


In [None]:
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), applied elementwise.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Sigmoid of ``x``, between 0 and 1, with the same shape as ``x``.

    Notes
    -----
    Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp``.  This
    is mathematically the same as ``1 / (1 + exp(-x))`` but never forms
    ``exp`` of a large positive number, so very negative inputs no longer
    overflow (and no longer emit a RuntimeWarning).
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0.0, -x))

# Evaluate the sigmoid on an evenly spaced grid and draw the curve
xs = np.linspace(-10, 10, num=100, dtype=np.float32)
activation = sigmoid(xs)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(xs, activation)
ax.plot(0, .5, 'ro')  # red marker at the point (0, 0.5)

# grid plus yellow reference lines along both axes
ax.grid(True, which='both')
ax.axhline(y=0, color='y')
ax.axvline(x=0, color='y')
ax.set_ylim([-0.1, 1.15]);

# A Tiny Neural Network

##### OR Logic
A logic gate takes in two boolean (true/false or 1/0) inputs, and returns either a 0 or 1 depending on its rule. The truth table for a logic gate shows the outputs for each combination of inputs: (0, 0), (0, 1), (1,0), and (1, 1). For example, let's look at the truth table for an Or-gate:

<table>
<tr><th colspan="3">OR gate truth table</th></tr>
<tr><th colspan="2">Input</th><th>Output</th></tr>
<tr><td>0</td><td>0</td><td>0</td></tr>
<tr><td>0</td><td>1</td><td>1</td></tr>
<tr><td>1</td><td>0</td><td>1</td></tr>
<tr><td>1</td><td>1</td><td>1</td></tr>
</table>

##### OR as a Neuron

A neuron that uses the sigmoid activation function outputs a value between (0, 1). This naturally leads us to think about boolean values. Imagine a neuron that takes in two inputs, $x_1$ and $x_2$, and a bias term:

<img src="./logic01.png" width=50%/>

By limiting the inputs $x_1$ and $x_2$ to values in $\left\{0, 1\right\}$, we can simulate the effect of logic gates with our neuron. The goal is to find the weights (represented by ? marks above) such that the neuron returns an output close to 0 or 1 depending on the inputs.  What weights should we use to output the same results as OR? Remember: $\sigma(z)$ is close to 0 when $z$ is very negative (around -10 or less), and is close to 1 when $z$ is very positive (around +10 or greater).

$$
z = w_1 x_1 + w_2 x_2 + b
$$

Let's think this through:

* When $x_1$ and $x_2$ are both 0, the only value affecting $z$ is $b$. Because we want the result for input (0, 0) to be close to zero, $b$ should be strongly negative (around -10 or lower) so that $z$ lands on the far left-hand part of the sigmoid.
* If either $x_1$ or $x_2$ is 1, we want the output to be close to 1. That means the weights associated with $x_1$ and $x_2$ should be enough to offset $b$ to the point of causing $z$ to be at least 10 (i.e., to the far right part of the sigmoid).

Let's give $b$ a value of -10. How big do we need $w_1$ and $w_2$ to be?  Each must be at least +20, so that $z$ reaches +10 even when only one of $\{x_1, x_2\}$ is on (for example, $x_1 = 1$, $x_2 = 0$ gives $z = 20 - 10 = 10$).

So let's try out $w_1=20$, $w_2=20$, and $b=-10$:

<img src="./logic02.png" width=50%/>

If we break out pencil and paper, we'll find this works wonderfully.  That's great.  But we don't really want to have to do a thought exercise every time we encounter a new dataset.  How can we get an automated process to learn these weights for us?

In [None]:
# OR truth table, one example per row: x1, x2, target
data = np.array([[0, 0, 0],
                 [1, 0, 1],
                 [0, 1, 1],
                 [1, 1, 1]], dtype=np.float64)
x_train = data[:, :2]   # first two columns: the gate inputs
y_train = data[:, -1]   # last column: the expected gate output

# architecture: one sigmoid unit fed by the two inputs
model = Sequential([
    Dense(1, activation='sigmoid', input_shape=(2,)),
])

# the optimization problem: mean squared error, minimized with the 'sgd' optimizer
model.compile(optimizer='sgd', loss='mean_squared_error', metrics=['accuracy'])

# run the optimization without per-epoch output
model.fit(x_train, y_train, epochs=5000, verbose=False)

# report the first layer's two weight arrays and the rounded predictions
layer_params = model.layers[0].get_weights()
print("learned weights:\n",
      layer_params[0],
      layer_params[1])
print("predictions:\n", model.predict(x_train).round().astype(np.uint8))

# Keras MLP on MNIST

In [None]:
# Simple MLP with dropout on the MNIST dataset.
# Figures quoted for this example: 98.40% test accuracy after 20 epochs,
# about 2 seconds per epoch on a K520 GPU, with *a lot* of margin left for
# parameter tuning.  Note that this cell trains for only 2 epochs (see below).

# load the predefined train/test split
(x_train, y_train), (x_test, y_test) = mnist.load_data()
n_train, n_test = len(x_train), len(x_test)
num_classes = len(set(y_train)) # aka, 10

# flatten every image into a single vector and divide by 255
# (row/column information is lost - only absolute position remains)
x_train = x_train.reshape(n_train, -1).astype(np.float32) / 255.0
x_test  = x_test.reshape(n_test, -1).astype(np.float32) / 255.0

# shape of one flattened example, used as the network's input shape
flat_shape = x_train.shape[1:]

print('{} train and {} test examples'.format(n_train, n_test))

# turn integer class labels into one-hot rows
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test  = keras.utils.to_categorical(y_test,  num_classes)

# architecture: two 512-unit ReLU layers, each followed by dropout,
# then a softmax layer with one unit per class
model = Sequential([
    Dense(512, activation='relu', input_shape=flat_shape),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

# show the layer-by-layer summary
model.summary()

# learning procedure
opt = keras.optimizers.RMSprop()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train, reporting metrics on the test split after every epoch
batch_size, epochs = 128, 2
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

# final evaluation on the test split
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# Keras CNN MNIST

In [None]:
'''Trains a simple convnet on the MNIST dataset.

Gets to 99.25% test accuracy after 12 epochs
(there is still a lot of margin for parameter tuning).
16 seconds per epoch on a GRID K520 GPU.
'''
# NOTE: the accuracy quoted above is for 12 epochs; this cell trains for 2
# (see the batch_size/epochs line below).

# the data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()
n_train, n_test = x_train.shape[0], x_test.shape[0]
# per-image shape, taken before the channel axis is added.
# Slice the shape of the whole array: slicing a *single* image's shape with
# [1:] would drop the first image dimension as well.
img_shape = x_train.shape[1:]
num_classes = len(set(y_train)) # aka, 10

# we maintain grid structure of data
# (and we add a dimension to account for the single channel images)
x_train = np.expand_dims(x_train, -1).astype(np.float32) / 255.0
x_test  = np.expand_dims(x_test,  -1).astype(np.float32) / 255.0
# shape of one example including the added channel axis; this is what the
# first Conv2D layer receives as input_shape
input_shape = x_train[0].shape

print('{} train and {} test examples'.format(n_train, n_test))
print('image shape:', input_shape)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test  = keras.utils.to_categorical(y_test,  num_classes)


# define architecture: two conv layers, max pooling, dropout, then a dense
# classifier head ending in a softmax over the classes
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# define learning procedure
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# fit model (the test split doubles as validation data)
batch_size, epochs = 128, 2 # 128, 12
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_test, y_test), verbose=1)

# evaluate
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# CNN on CIFAR

In [None]:
# for lenet in keras, see here
# (and compare with the tensorflow example in week 6)
# https://www.kaggle.com/ftence/keras-cnn-inspired-by-lenet-5

In [None]:
# Simple deep CNN on the CIFAR10 small images dataset.
# Figures quoted for this example: 75% validation accuracy in 25 epochs and
# 79% after 50 epochs (still underfitting at that point).  This cell trains
# for only 2 epochs; alternative settings are noted next to the epochs line.

# load the predefined train/test split
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
n_train, n_test = len(x_train), len(x_test)
img_shape = x_train.shape[1:]
num_classes = len(set(y_train[:, 0])) # aka, 10

# cast to float32 and divide by 255
x_train = x_train.astype(np.float32) / 255.0
x_test  = x_test.astype(np.float32) / 255.0

print('{} train and {} test examples'.format(n_train, n_test))
print('image shape:', img_shape)

# turn integer class labels into one-hot rows
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test  = keras.utils.to_categorical(y_test,  num_classes)

# architecture: two conv/conv/pool/dropout stages (32 then 64 filters)
# followed by a dense classifier head
model = Sequential([
    # stage 1
    Conv2D(32, (3, 3), padding='same', input_shape=img_shape),
    Activation('relu'),
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # stage 2
    Conv2D(64, (3, 3), padding='same'),
    Activation('relu'),
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # classifier head
    Flatten(),
    Dense(512),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# learning procedure: RMSprop with an explicit learning rate and decay
opt = keras.optimizers.RMSprop(lr=0.0001, decay=1e-6)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train, reporting metrics on the test split after every epoch
batch_size, epochs = 32, 2 # 32, 50; 32,100
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          shuffle=True,
          validation_data=(x_test, y_test))

# final evaluation on the test split
scores = model.evaluate(x_test, y_test, verbose=1)
test_loss, test_accuracy = scores[0], scores[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# save the trained model under ./saved_models (created if it does not exist)
model_name = 'keras_cifar10_trained_model.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Exercises

##### Minimal Networks for AND

In [None]:
# train a (trivial) network for AND
# how do weights differ if you use ReLU versus sigmoid activations?

##### Sigmoid

In [None]:
# AND truth table, one example per row: x1, x2, target
data = np.array([[0, 0, 0],
                 [1, 0, 0],
                 [0, 1, 0],
                 [1, 1, 1]], dtype=np.float64)
x_train = data[:, :2]   # first two columns: the gate inputs
y_train = data[:, -1]   # last column: the expected gate output

# student section begins here
# build the network architecture




# student section ends here
# results
# NOTE: `model` is not defined in this cell as given - the student section
# above has to create (and train) it before the lines below can report on it.
layer_params = model.layers[0].get_weights()
print("learned weights:\n",
      layer_params[0],
      layer_params[1])
print("predictions:\n", model.predict(x_train).round().astype(np.uint8))

##### ReLU

In [None]:
# AND truth table, one example per row: x1, x2, target
data = np.array([[0, 0, 0],
                 [1, 0, 0],
                 [0, 1, 0],
                 [1, 1, 1]], dtype=np.float64)
x_train = data[:, :2]   # first two columns: the gate inputs
y_train = data[:, -1]   # last column: the expected gate output

# student section begins here
# build the network architecture




# student section ends here

# results
# NOTE: `model` is not defined in this cell as given - the student section
# above has to create (and train) it before the lines below can report on it.
layer_params = model.layers[0].get_weights()
print("learned weights:\n",
      layer_params[0],
      layer_params[1])
print("predictions:\n", model.predict(x_train).round().astype(np.uint8))

##### MLP on XOR

In [None]:
# mathematically, a single node can't learn XOR.  
# we need at least a network with a hidden layer.  try it out.
# train an MLP for XOR
# learning rate of .01 may be too small; try .1
# may need lots of epochs ... and due to random initialization, may have bad luck

In [None]:
# XOR truth table, one example per row: x1, x2, target
data = np.array([[0, 0, 0],
                 [1, 0, 1],
                 [0, 1, 1],
                 [1, 1, 0]], dtype=np.float64)
x_train = data[:, :2]   # first two columns: the gate inputs
y_train = data[:, -1]   # last column: the expected gate output

# architecture: a 2-unit sigmoid hidden layer feeding one sigmoid output unit
model = Sequential([
    Dense(2, activation='sigmoid', input_shape=(2,)),
    Dense(1, activation='sigmoid'),
])

# optimizer with a customized learning rate
# (this plain-SGD setup is somewhat brittle)
my_sgd = keras.optimizers.SGD(lr=0.5)
# a more robust alternative, left disabled because the optimization
# parameters it uses have not been covered in depth yet:
# my_sgd = keras.optimizers.SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=my_sgd, loss='mean_squared_error', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=10000, verbose=False)

# report the weight arrays of both layers, then the rounded predictions
hidden_params = model.layers[0].get_weights()
output_params = model.layers[1].get_weights()
print("learned weights:",
      hidden_params[0],
      hidden_params[1],
      output_params[0],
      output_params[1], sep="\n")
print("predictions:\n",
      model.predict(x_train).round().astype(np.uint8))

##### parameters of cnn

In [None]:
# how many parameters were in the cnn cifar model?

##### mlp cifar

In [None]:
# mlp network on cifar data

In [None]:
# load the predefined CIFAR10 train/test split
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
n_train, n_test = len(x_train), len(x_test)
img_shape = x_train.shape[1:]   # per-image shape, recorded before flattening
num_classes = len(set(y_train[:, 0])) # aka, 10

# flatten every image into a single vector and divide by 255
x_train = x_train.reshape(n_train, -1).astype(np.float32) / 255.0
x_test  = x_test.reshape(n_test, -1).astype(np.float32) / 255.0

# shape of one flattened example, used as the network's input shape
flat_shape = x_train.shape[1:]

print('{} train and {} test examples'.format(n_train, n_test))
print('image shape:', img_shape)

# turn integer class labels into one-hot rows
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test  = keras.utils.to_categorical(y_test,  num_classes)


# architecture: two 512-unit ReLU layers, each followed by dropout,
# then a softmax layer with one unit per class
model = Sequential([
    Dense(512, activation='relu', input_shape=flat_shape),
    Dropout(0.2),
    Dense(512, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax'),
])

# show the layer-by-layer summary
model.summary()

# learning procedure
opt = keras.optimizers.RMSprop()
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# train, reporting metrics on the test split after every epoch
batch_size, epochs = 128, 2
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

# final evaluation on the test split
score = model.evaluate(x_test, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

##### cnn 101categories

In [None]:
# cnn cifar on 101cat (i.e., train(cifar), test(101cat) "transfer learning")
# do this via two steps:
# before doing any network processing, use cv2 to resize all images to a "standard" size
# (annoying, but keras doesn't have a nice resize layer - tensorflow does)