# Deep Learning Discussions

## MNIST Hello World (as before)

In [None]:
import matplotlib.pyplot as plt

def plotTraining(history):
    """Plot the accuracy and loss curves of a training run side by side.

    Args:
        history: object whose ``history`` attribute is a dict mapping metric
            names to per-epoch value lists (as returned by ``model.fit`` in
            this notebook). ``'accuracy'`` and ``'loss'`` are required;
            ``'val_accuracy'`` and ``'val_loss'`` are plotted only if present,
            so runs fitted without ``validation_data`` can be plotted too.

    Raises:
        KeyError: if ``'accuracy'`` or ``'loss'`` is missing from the history.
    """
    metrics = history.history
    plt.figure(figsize=(14,6))
    # (metric name, legend position): accuracy on the left, loss on the right
    panels = (('accuracy', 'lower right'), ('loss', 'upper right'))
    for position, (metric, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(metrics[metric])
        labels = ['train']
        val_key = 'val_' + metric
        # validation values only exist when fit() was given validation data
        if val_key in metrics:
            plt.plot(metrics[val_key])
            labels.append('valid')
        plt.title('model ' + metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(labels, loc=legend_loc)
    plt.show()

In [None]:
# hello world example: a small fully connected classifier on MNIST

import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
# scale both splits by 1/255 (presumably maps 8-bit pixel values into [0, 1])
x_train = x_train / 255.0
x_test = x_test / 255.0

# flatten each 28x28 image, one hidden relu layer, 10-way softmax output
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# the test split doubles as validation data here
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=5,
)
plotTraining(history)

model.evaluate(x_test, y_test)


## Activation Functions

- Different types of activation functions. What do you observe? Training speed, final performance, etc.
- Why do we need them at all?


In [None]:
# Activation functions: compare relu, linear (no activation) and sigmoid hidden layers
# https://keras.io/activations/

import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))

# PLAY AROUND HERE: keep exactly one of the hidden layers below active
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
#model.add(tf.keras.layers.Dense(64))  # linear
#model.add(tf.keras.layers.Dense(64, activation=tf.nn.sigmoid))

model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=3,
)
plotTraining(history)

model.evaluate(x_test, y_test)

## Optimization - Weight initialization

In [None]:
# Weight initialization: compare the default initializer with constant ones
# https://keras.io/initializers/

import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))

# PLAY AROUND HERE: keep exactly one of the hidden layers below active
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
#model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu,  kernel_initializer=tf.keras.initializers.Zeros()))
#model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu,  kernel_initializer=tf.keras.initializers.Constant(value=-1)))
#model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu,  kernel_initializer=tf.keras.initializers.Constant(value=1)))

model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=3,
)
plotTraining(history)

model.evaluate(x_test, y_test)

## Optimization - Stochastic Gradient Descent

In [None]:
# Optimizer & stochastic gradient descent: compare optimizers and batch sizes
# https://keras.io/optimizers/

import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))

# PLAY AROUND HERE (1): pick the optimizer -- keep one compile call active
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# PLAY AROUND HERE (2): pick epochs / batch size -- keep one fit call active
history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10)
#history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=1) #sgd
#history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=len(x_train)) #batch gradient descent
#history = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64) #32 default
plotTraining(history)

model.evaluate(x_test, y_test)


## Data Augmentation to increase performance

In [None]:
#no data augmentation
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

# Make train data smaller -- only 100 examples!
# The fixed seed makes the drawn subset repeatable.
np.random.seed(36)
train_data_idx = np.random.choice(range(len(x_train)), 100, replace=False)
x_train, y_train = x_train[train_data_idx], y_train[train_data_idx]

# append a trailing channel axis: (n, 28, 28) -> (n, 28, 28, 1)
x_train = x_train.reshape(-1, 28, 28, 1)
x_test = x_test.reshape(-1, 28, 28, 1)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28, 1)))
model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(x_test, y_test),
)
plotTraining(history)

model.evaluate(x_test, y_test)

In [None]:
# Data augmentation preview: random rotations and random shifts in both
# directions, with the ranges configured below.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15, width_shift_range=0.15, height_shift_range=0.15
)

# Feed a single training example into the generator so that every draw from
# data_aug is an augmented version of that one image.
i = 2
single_image = x_train[i:i + 1]
single_label = y_train[i:i + 1]
data_aug = datagen.flow(x=single_image, y=single_label, batch_size=1)

# original image
plt.imshow(x_train[i, :, :, 0], cmap="gray")

In [None]:
# Draw a 5x5 grid of augmented versions of the image selected for data_aug.
n_rows, n_cols = 5, 5
plt.figure(figsize=(15, 15))
for i in range(n_rows * n_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    # each draw yields a batch of one image together with its label
    x_aug, y_aug = next(data_aug)
    plt.imshow(x_aug[0, :, :, 0], cmap="gray")


In [None]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist

(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

# Make train data smaller -- only 100 examples!
# Same seed as the run without augmentation, so the same subset is drawn.
np.random.seed(36)
train_data_idx=np.random.choice(range(0,len(x_train)),100,replace=False)
x_train=x_train[train_data_idx]
y_train=y_train[train_data_idx]

# add a trailing channel axis, which the image generator below works on
x_train=x_train.reshape((len(x_train),28,28,1))
x_test=x_test.reshape((len(x_test),28,28,1))

model = tf.keras.models.Sequential([
  tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
  tf.keras.layers.Dense(64, activation=tf.nn.relu),
  tf.keras.layers.Dense(10, activation=tf.nn.softmax)
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=0,
    width_shift_range=0.1,
    height_shift_range=0.1)

# Keras documents steps_per_epoch as an integer. np.ceil returns a float, so
# the number of batches per pass over the data is computed with integer
# ceiling division instead.
batch_size = 32
steps_per_epoch = (len(x_train) + batch_size - 1) // batch_size

# fits the model on batches with real-time data augmentation:
history = model.fit(datagen.flow(x_train, y_train, batch_size=batch_size), epochs=300, steps_per_epoch=steps_per_epoch, validation_data=(x_test, y_test))
plotTraining(history)

model.evaluate(x_test, y_test)

### Accuracy on the test set is roughly 9 percentage points better if you use data augmentation

0.7491 -> 0.8349

## Other Architecture: CNN

model from scratch 

In [None]:
from tensorflow.keras.models import Sequential, Model, clone_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.datasets import mnist

(x_digits_train, y_digits_train), (x_digits_test, y_digits_test) = mnist.load_data()

# Make train data smaller: keep 100 training examples, drawn with a fixed seed
np.random.seed(36)
train_data_idx = np.random.choice(range(len(x_digits_train)), 100, replace=False)
x_digits_train, y_digits_train = x_digits_train[train_data_idx], y_digits_train[train_data_idx]

# Preprocess data: cast to float32, scale by 1/255, add a trailing channel axis
x_digits_train = (x_digits_train.astype('float32') / 255).reshape(-1, 28, 28, 1)
x_digits_test = (x_digits_test.astype('float32') / 255).reshape(-1, 28, 28, 1)


# Define model: two conv blocks (8 and 16 filters) with one pooling step in
# between, followed by a dense classifier
model_digits = Sequential([
    Conv2D(8, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(16, (3, 3), activation='relu'),
    Conv2D(16, (3, 3), activation='relu'),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(10, activation='softmax'),
])
# Compile model
model_digits.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# train model
history = model_digits.fit(
    x_digits_train,
    y_digits_train,
    validation_data=(x_digits_test, y_digits_test),
    batch_size=128,
    epochs=80,
)
plotTraining(history)

model_digits.evaluate(x_digits_test, y_digits_test)

## CNN + Data-Augmentation

In [None]:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.15,
    height_shift_range=0.15)

# Keras documents steps_per_epoch as an integer; len(x_digits_train) / 64 is a
# fraction whenever the sample count is not a multiple of 64. Integer ceiling
# division gives the number of batches in one full pass over the data.
batch_size = 64
steps_per_epoch = (len(x_digits_train) + batch_size - 1) // batch_size

# NOTE(review): model_digits is not re-created in this cell, so when the cell
# that first fits model_digits has been run, training continues from those
# weights rather than from scratch -- confirm this warm start is intended.

# fits the model on batches with real-time data augmentation:
history=model_digits.fit(datagen.flow(x_digits_train, y_digits_train, batch_size=batch_size),validation_data=(x_digits_test, y_digits_test),
                    steps_per_epoch=steps_per_epoch, epochs=120)
plotTraining(history)

model_digits.evaluate(x_digits_test,y_digits_test)

### Test performance summary (so far)

- simple model:
 
         0.7491 -> 0.8349
    
    
- CNN (for other task, much much higher!):
    
        0.7685 -> 0.8414

## CNN + Fine-Tuning

- train a model on letters (E-MNIST)
- fix lower layers and fine-tune to digits (MNIST)

In [None]:
# Downloading the data, if it does not exist (takes some time)
# `urllib.request` must be imported explicitly: a bare `import urllib` does
# not guarantee that the `request` submodule is loaded.
import urllib.request
import os
import zipfile

emnist_url = "http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/matlab.zip"
emnist_zip = "matlab.zip"
# file that the extracted archive is expected to contain (loaded further below)
emnist_letters = os.path.join("matlab", "emnist-letters.mat")

if not os.path.exists(emnist_letters):
    # Reuse an archive that is already on disk; a truncated download has to be
    # deleted by hand, otherwise opening it fails with zipfile.BadZipFile.
    if not os.path.exists(emnist_zip):
        response = urllib.request.urlretrieve(emnist_url, emnist_zip)
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(emnist_zip) as zf:
        zf.extractall()


In [None]:
#load emnist data
from scipy import io as spio

emnist = spio.loadmat("matlab/emnist-letters.mat")

# The "dataset" entry is deeply nested: after two leading [0] steps, index 0
# leads to the training split and index 1 to the test split; inside a split,
# [0] is assigned to the x (image) variable and [1] to the y (label) variable.
dataset = emnist["dataset"]
train_split = dataset[0][0][0][0][0]
test_split = dataset[0][0][1][0][0]

# load training dataset
x_letter_train, y_letter_train = train_split[0], train_split[1]

# load test dataset
x_letter_test, y_letter_test = test_split[0], test_split[1]

# reshape to (n, 28, 28, 1), cast to float32 and scale by 1/255
x_letter_train = x_letter_train.reshape(x_letter_train.shape[0], 28, 28, 1, order="A").astype('float32') / 255
x_letter_test = x_letter_test.reshape(x_letter_test.shape[0], 28, 28, 1, order="A").astype('float32') / 255



In [None]:
# Define model: two conv blocks (8 and 16 filters), each followed by pooling,
# then a dense classifier. The flatten layer is named "Flat" so that it can
# be looked up by name later on.
model_letters = Sequential([
    Conv2D(8, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(8, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(16, (3, 3), activation='relu'),
    Conv2D(16, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(name="Flat"),
    Dense(50, activation='relu'),
    Dense(27, activation='softmax'),
])
# Compile model
model_letters.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# train model
history = model_letters.fit(
    x_letter_train,
    y_letter_train,
    validation_data=(x_letter_test, y_letter_test),
    batch_size=128,
    epochs=5,
    verbose=1,
)
plotTraining(history)

model_letters.evaluate(x_letter_test, y_letter_test)
model_letters.summary()

In [None]:
# Feature extractor: model_letters from its input up to and including the
# layer named "Flat".
flat_layer = model_letters.get_layer("Flat")
letter_model = Model(inputs=model_letters.input, outputs=flat_layer.output)
letter_model.summary()

In [None]:
# Stack a new hidden layer and a new 10-way output layer on top of the
# feature extractor's output.
x = letter_model.output
x = Dense(50, activation='relu')(x)
predictions = Dense(10, activation='softmax')(x)

# combined model: letter_model's input through to the new output layer
model = Model(
    inputs=letter_model.input,
    outputs=predictions,
)
model.summary()

In [None]:
# fix the lower layers: mark every layer of letter_model as not trainable
for frozen_layer in letter_model.layers:
    frozen_layer.trainable = False

# list each layer of the combined model together with its trainable flag
for i, layer in enumerate(model.layers):
    print(i, layer.name, layer.trainable)

# compile after the trainable flags have been set
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# train on the digit data currently held in x_digits_train / y_digits_train
history = model.fit(
    x_digits_train,
    y_digits_train,
    validation_data=(x_digits_test, y_digits_test),
    batch_size=64,
    epochs=80,
)
plotTraining(history)

model.evaluate(x_digits_test, y_digits_test)

### Test performance summary (so far)

- simple model:
 
         0.7491 -> 0.8349 (data aug.)
    
    
- CNN (for other task, much much higher!):
    
        0.7685 -> 0.8414 (data aug.)
        0.7685 -> 0.8974 (fine-tuning)

##  CNN + Data augmentation and Fine-Tuning


In [None]:
# Rebuild the fine-tuning model: model_letters up to its "Flat" layer, plus a
# fresh hidden layer and a fresh 10-way output layer.
letter_model = Model(inputs=model_letters.input, outputs=model_letters.get_layer("Flat").output)

x = letter_model.output
x = Dense(50, activation='relu')(x)
predictions = Dense(10, activation='softmax')(x)
model = Model(inputs=letter_model.input, outputs=predictions)
# freeze everything that comes from letter_model before compiling
for layer in letter_model.layers:
    layer.trainable = False
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.15,
    height_shift_range=0.15)

# Keras documents steps_per_epoch as an integer; len(x_digits_train) / 64 is a
# fraction whenever the sample count is not a multiple of 64. Integer ceiling
# division gives the number of batches in one full pass over the data.
batch_size = 64
steps_per_epoch = (len(x_digits_train) + batch_size - 1) // batch_size

# fits the model on batches with real-time data augmentation:
history=model.fit(datagen.flow(x_digits_train, y_digits_train, batch_size=batch_size),validation_data=(x_digits_test, y_digits_test),
                    steps_per_epoch=steps_per_epoch, epochs=120)
plotTraining(history)

model.evaluate(x_digits_test,y_digits_test)

### Test performance summary
**trained using 100 images only!**

- simple model:
 
         0.7491 -> 0.8349 (data aug.)
    
- CNN (for other task, much much higher!):
    
        0.7685 -> 0.8414 (data aug.)
        0.7685 -> 0.8974 (fine-tuning)
        0.7685 -> 0.9251 (data aug. & fine-tuning)

## Error analysis

In [None]:
## CNN trained from scratch using all training data (no 100-example subsampling)

(x_digits_train, y_digits_train), (x_digits_test, y_digits_test) = mnist.load_data()


# Preprocess data: cast to float32, scale by 1/255, add a trailing channel axis
x_digits_train = (x_digits_train.astype('float32') / 255).reshape(-1, 28, 28, 1)
x_digits_test = (x_digits_test.astype('float32') / 255).reshape(-1, 28, 28, 1)


# Define model: two conv blocks (16 and 32 filters) with one pooling step in
# between, followed by a dense classifier
model_digits_full = Sequential([
    Conv2D(16, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    Conv2D(16, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(32, (3, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(10, activation='softmax'),
])
# Compile model
model_digits_full.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# train model
history = model_digits_full.fit(
    x_digits_train,
    y_digits_train,
    validation_data=(x_digits_test, y_digits_test),
    batch_size=128,
    epochs=5,
    verbose=1,
)
plotTraining(history)

model_digits_full.evaluate(x_digits_test, y_digits_test)

In [None]:
# Indices of the test images whose predicted class (argmax over the model
# output) differs from the true label; the cell displays how many there are.
test_scores = model_digits_full.predict(x_digits_test)
test_pred = np.argmax(test_scores, axis=1)
is_wrong = test_pred != y_digits_test
wrong_idx = np.where(is_wrong)[0]
len(wrong_idx)

In [None]:
# Show up to 50 misclassified test digits in a 10x5 grid, each titled with
# the predicted and the true label.
# Capping at len(wrong_idx) avoids an IndexError when the model makes fewer
# than 50 mistakes.
n_show = min(50, len(wrong_idx))
shown_idx = wrong_idx[:n_show]

plt.figure(figsize=(20,40))
if n_show > 0:
    # one batched predict call for all shown images instead of one call per image
    shown_pred = np.argmax(model_digits_full.predict(x_digits_test[shown_idx]), axis=1)
    for i in range(n_show):
        plt.subplot(10,5,i+1)
        plt.imshow(x_digits_test[shown_idx[i],:,:,0],cmap="gray")
        plt.title("pred: "+str(shown_pred[i])+ " true: "+str((y_digits_test[shown_idx[i]])))
