In [1]:
import numpy as np
from keras.datasets import fashion_mnist as fmn
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Dense
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import tensorflow as tf
from time import time

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Problem 1
Experiment with fully connected neural nets for classification of the Fashion-MNIST data: add at least two more layers, make all hidden layers at least 20 neurons wide, and try it with both ReLU and sigmoid activations.  Train for as many epochs as you need until the loss function (categorical cross entropy) stops improving--Keras's `callbacks.EarlyStopping` may be useful.

In [2]:
def build_keras_model(optimizer,activ) :
    output_dim = 10
    model = Sequential()
    model.add(Dense(20, input_dim=input_dim, activation=activ))
    model.add(Dense(40, input_dim=input_dim, activation=activ))
    model.add(Dense(20, input_dim=input_dim, activation=activ))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])
    return model

# Generate optimizers for each run
def generate_optimizers(learning_rate=.01):
    """Generate a dictionary of optimizers for keras to use in gridsearch"""
    adam = Adam(lr=learning_rate)
    optimizer_dict = {'Adam':adam}
    return optimizer_dict

In [3]:
(x_train, y_train), (x_test, y_test) = fmn.load_data()

Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading data from http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz


In [4]:
rates = [0.0001,0.001,.01,.1,1]
meth_list = ['Adam']

# Reshape y as one-hot
y_train = to_categorical(y_train, 10).astype(float)
y_test = to_categorical(y_test, 10).astype(float)

# Flatten images
input_dim = x_train.shape[1]*x_train.shape[2]
x_train = x_train.reshape((x_train.shape[0], input_dim))
x_test = x_test.reshape((x_test.shape[0], input_dim))
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

# Permute the training data to be in random order
perm = np.random.permutation(x_train.shape[0])
x_train = x_train[perm]
y_train = y_train[perm]



In [8]:
relu_times = []
sigm_times = []

relu_acc = []
sigm_acc = []

for rate in rates :
    methods = generate_optimizers(rate)
    for meth in methods.keys() :
        method = methods[meth]
        model1 = build_keras_model(method,'relu')
        model2 = build_keras_model(method,'sigmoid')
        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        start = time()
        hist1 = model1.fit(x_train,
                y_train,
                batch_size=128,
                epochs=20,
                verbose=2,
                shuffle=False,
                validation_data=(x_test, y_test),
                callbacks=[early_stopping])
        end = time()
        relu_times.append(end-start)
        start = time()
        hist2 = model2.fit(x_train,
                y_train,
                batch_size=128,
                epochs=20,
                verbose=2,
                shuffle=False,
                validation_data=(x_test, y_test),
                callbacks=[early_stopping])
        end = time()
        sigm_times.append(end-start)
        relu_acc.append(max(hist1.history['acc']))
        sigm_acc.append(max(hist2.history['acc']))

Train on 60000 samples, validate on 10000 samples
Epoch 1/20
 - 1s - loss: 1.3460 - acc: 0.5547 - val_loss: 0.8014 - val_acc: 0.7407
Epoch 2/20
 - 1s - loss: 0.6437 - acc: 0.7818 - val_loss: 0.5873 - val_acc: 0.7951
Epoch 3/20
 - 1s - loss: 0.5206 - acc: 0.8185 - val_loss: 0.5242 - val_acc: 0.8136
Epoch 4/20
 - 1s - loss: 0.4713 - acc: 0.8333 - val_loss: 0.4916 - val_acc: 0.8251
Epoch 5/20
 - 1s - loss: 0.4425 - acc: 0.8422 - val_loss: 0.4708 - val_acc: 0.8318
Epoch 6/20
 - 1s - loss: 0.4228 - acc: 0.8488 - val_loss: 0.4561 - val_acc: 0.8357
Epoch 7/20
 - 1s - loss: 0.4080 - acc: 0.8543 - val_loss: 0.4446 - val_acc: 0.8399
Epoch 8/20
 - 1s - loss: 0.3961 - acc: 0.8582 - val_loss: 0.4353 - val_acc: 0.8438
Epoch 9/20
 - 1s - loss: 0.3862 - acc: 0.8616 - val_loss: 0.4278 - val_acc: 0.8472
Epoch 10/20
 - 1s - loss: 0.3777 - acc: 0.8641 - val_loss: 0.4213 - val_acc: 0.8495
Epoch 11/20
 - 1s - loss: 0.3704 - acc: 0.8669 - val_loss: 0.4158 - val_acc: 0.8518
Epoch 12/20
 - 1s - loss: 0.3641 - 

In [11]:
print('\tRate\tMethod\tTime\t\t\tAccuracy')
n = len(relu_acc)
for i in range(n) :
    if i == 1 :
        print('relu:\t'+str(rates[i//(n//5)])+'\t'+str(meth_list[0])+'\t'+str(relu_times[i])+'\t\t'+str(relu_acc[i]))
    else :
        print('relu:\t'+str(rates[i//(n//5)])+'\t'+str(meth_list[0])+'\t'+str(relu_times[i])+'\t'+str(relu_acc[i]))
    print('sigm:\t'+str(rates[i//(n//5)])+'\t'+str(meth_list[0])+'\t'+str(sigm_times[i])+'\t'+str(sigm_acc[i]))

	Rate	Method	Time			Accuracy
relu:	0.0001	Adam	17.273399591445923	0.8808833333333334
sigm:	0.0001	Adam	17.2337327003479	0.8450666666984558
relu:	0.001	Adam	8.9581618309021		0.90035
sigm:	0.001	Adam	16.682024717330933	0.9027999999682108
relu:	0.01	Adam	7.4131529331207275	0.8854833333015442
sigm:	0.01	Adam	15.32559609413147	0.8648499999682109
relu:	0.1	Adam	4.91444993019104	0.3445333333492279
sigm:	0.1	Adam	4.097715377807617	0.10125000000794729
relu:	1	Adam	3.298017740249634	0.06666666666269302
sigm:	1	Adam	3.419442653656006	0.10000000000794729


# Problem 2
Using the notation from class today (and from the video) calculate one iteration of backpropagation by hand (or code something to do it for you).  That is, calculate the forward pass and then the backward pass to compute both the output of the network for the current weights, and the gradient (with respect to the Ws and the bs) on a (fully connected) neural network with two hidden layers of 2 neurons each (ReLu activation), two inputs, and a single output layer having a sigmoid activation function. Use the input data $x=(1,-1)$, $y=1$, and assume the current weights are

$W^1 = \left[ \begin{matrix} 0.25 & 0.1 \\ -0.2  & 0.9 \end{matrix} \right]$

$b^1 =   \left[ \begin{matrix} 0.1 \\ -0.2 \end{matrix} \right]$

$W^2 =  \left[ \begin{matrix} 0.5 & 0.8 \\ 0.3 & 0.7 \end{matrix} \right]$

$b^2 =  \left[ \begin{matrix} -0.3 \\ 0.1 \end{matrix} \right]$

$W^3 =  \left[ \begin{matrix} 0.1 & -0.2 \end{matrix} \right]$

$b^3 = 0.3$

So the structure of the network looks something like this:

            L_1      L_2      L_3
            
    x_0 ----> O ----> O ----> O ----> 

        \  /        \  /           /
       
         / \         / \         /
        
          x_1 ----> O ----> O

where the Os here represent neurons

$L_{1}$ :
$
\begin{align*}
z^1 &= w^1a^0+b^1 \\
&= \left[ \begin{matrix} 0.25 & 0.1 \\ -0.2  & 0.9 \end{matrix} \right] \left[ \begin{matrix} 1 \\ -1 \end{matrix} \right] + \left[ \begin{matrix} 0.1 \\ -0.2 \end{matrix} \right] \\
&= \left[ \begin{matrix} 0.25 \\ -1.3 \end{matrix} \right] \\
\hat{\sigma}(z^1) &= \left[ \begin{matrix} 0.25 \\ 0 \end{matrix} \right] = a^1 \\
\hat{\sigma}'(z^1) &= \left[ \begin{matrix} 1 \\ 0 \end{matrix} \right]
\end{align*}
$

$L_{2}$ :
$
\begin{align*}
z^2 &= w^2a^1+b^2 \\
&= \left[ \begin{matrix} 0.5 & 0.8 \\ 0.3 & 0.7 \end{matrix} \right] \left[ \begin{matrix} 0.25 \\ 0 \end{matrix} \right] + \left[ \begin{matrix} -0.3 \\ 0.1 \end{matrix} \right] \\
&= \left[ \begin{matrix} -0.175 \\ 0.175 \end{matrix} \right] \\
\hat{\sigma}(z^2) &= \left[ \begin{matrix} 0 \\ 0.175 \end{matrix} \right] = a^2 \\
\hat{\sigma}'(z^2) &= \left[ \begin{matrix} 0 \\ 1 \end{matrix} \right]
\end{align*}
$

$L_{3}$ :
$
\begin{align*}
z^3 &= w^3a^2+b^3 \\
&= \left[ \begin{matrix} 0.1 & -0.2 \end{matrix} \right] \left[ \begin{matrix} 0.25 \\ 0 \end{matrix} \right] + 0.3 \\
&= 0.265 \\
\hat{\sigma}(z^3) &= \frac{1}{1+e^{-0.265}} \approx 0.565865 = \hat{y} \\
\hat{\sigma}'(z^3) &= \frac{e^{-0.265}}{\left(1+e^{-0.265}\right)^{2}} \approx 0.24566
\end{align*}
$

$C = \frac{1}{2} \left|\left| \hat{y} - y \right|\right|_{2}^{2} = \frac{1}{2} \left|\left|1-\frac{1}{1+e^{-0.265}}\right|\right|_{2}^{2} = \frac{1}{2}\left|\frac{e^{-0.265}}{1+e^{-0.265}}\right|^{2} \approx \frac{1}{2}\left(0.434135\right)^{2} \approx 0.094236597855$.

$
\begin{align*}
\frac{\partial C}{\partial W_{1}} &= (\hat{y}-y)\hat{\sigma}'(z^3)W^3\hat{\sigma}'(z^2)W^2\hat{\sigma}'(z^1)(a^0)^{T} \\
&= (0.434135)(0.24566)\left[\begin{matrix}0.1&-0.2\end{matrix}\right]\left[\begin{matrix}0\\1\end{matrix}\right]\left[\begin{matrix}0.5&0.8\\0.3&0.7\end{matrix}\right]\left[\begin{matrix}1\\0\end{matrix}\right]\left[\begin{matrix}1&-1\end{matrix}\right] \\
&\approx \left[ \begin{matrix} -0.01066504 & 0.01066504 \\ -0.006399025 & 0.006399025 \end{matrix} \right] \\
\frac{\partial C}{\partial b_{1}} &= (\hat{y}-y)\hat{\sigma}'(z^3)W^3\hat{\sigma}'(z^2)W^2\hat{\sigma}'(z^1)1 \\
&= (0.434135)(0.24566)\left[\begin{matrix}0.1&-0.2\end{matrix}\right]\left[\begin{matrix}0\\1\end{matrix}\right]\left[\begin{matrix}0.5&0.8\\0.3&0.7\end{matrix}\right]\left[\begin{matrix}1\\0\end{matrix}\right] \\
&\approx \left[ \begin{matrix} -0.01066504 \\ -0.006399025 \end{matrix} \right]
\end{align*}
$

$
\begin{align*}
\frac{\partial C}{\partial W_{2}} &= (\hat{y}-y)\hat{\sigma}'(z^3)W^3\hat{\sigma}'(z^2)(a^1)^{T} \\
&= (0.434135)(0.24566)\left[\begin{matrix}0.1&-0.2\end{matrix}\right]\left[\begin{matrix}0\\1\end{matrix}\right]\left[\begin{matrix}0.25&0\end{matrix}\right] \\
&\approx \left[ \begin{matrix} -0.00533252 & 0 \end{matrix} \right] \\
\frac{\partial C}{\partial b_{2}} &= (\hat{y}-y)\hat{\sigma}'(z^3)W^3\hat{\sigma}'(z^2)1 \\
&= (0.434135)(0.24566)\left[\begin{matrix}0.1&-0.2\end{matrix}\right]\left[\begin{matrix}0\\1\end{matrix}\right] \\
&\approx -0.0213308
\end{align*}
$  
However, these have the wrong dimension, so I don't know what's going on.

$
\begin{align*}
\frac{\partial C}{\partial W_{3}} &= (\hat{y}-y)\hat{\sigma}'(z^3)(a^2)^{T} \\
&= (0.434135)(0.24566)\left[\begin{matrix}0.1&-0.2\end{matrix}\right] \\
&\approx \left[ \begin{matrix} 0.0106650385 & 0.02133007708 \end{matrix} \right] \\
\frac{\partial C}{\partial b_{3}} &= (\hat{y}-y)\hat{\sigma}'(z^3)1 \\
&\approx 0.106650385
\end{align*}
$