## AIDL Deep Learning and Neural Networks
### Columbia University


# Assignment 3: Multilayer Perceptron (MLP)
In this assignment, you will be implementing an MLP using TensorFlow.

#### import packages

In [3]:
# Import modules
from __future__ import print_function
import tensorflow as tf
import numpy as np
import time
import os
import matplotlib.pyplot as plt
import pickle
import tarfile
import glob
import urllib.request as url
# from utils.cifar_utils import load_data

# Plot configurations
%matplotlib inline

# Notebook auto reloads code. (Ref: http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython)
%load_ext autoreload
%autoreload 2

#### Functions for downloading CIFAR-100 dataset

In [4]:
def unpickle(file):
    with open(file, 'rb') as fo:
        res = pickle.load(fo, encoding='bytes')
    return res


def download_data():
    """
    Download the CIFAR-100 data from the website, which is approximately 170MB.
    The data (a .tar.gz file) will be store in the ./data/ folder.
    :return: None
    """
    if not os.path.exists('./data'):
        os.mkdir('./data')
        print('Start downloading data...')
        url.urlretrieve("https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz",
                        "./data/cifar-100-python.tar.gz")
        print('Download complete.')
    else:
        if os.path.exists('./data/cifar-100-python.tar.gz'):
            print('CIFAR-100 package already exists.')
        print('Start downloading data...')
        url.urlretrieve("https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz",
                        "./data/cifar-100-python.tar.gz")
        print('Download complete.')



def load_data():
    """
    Unpack the CIFAR-100 dataset and load the coarse datasets (20-class).
    :return: A tuple of label_map, data/labels. For both training and test sets.
    """
    # If the data hasn't been downloaded yet, download it first.
    if not os.path.exists('./data/cifar-100-python.tar.gz'):
        download_data()
    # Check if the package has been unpacked, otherwise unpack the package
    if not os.path.exists('./data/cifar-100-python/'):
        package = tarfile.open('./data/cifar-100-python.tar.gz')
        package.extractall('./data')
        package.close()

    # Go to the location where the files are unpacked
    os.chdir('./')

    # load the label_map and data
    meta = unpickle('./data/cifar-100-python/meta')

    coarse_label_names = [t.decode('utf8') for t in meta[b'coarse_label_names']]

    train = unpickle('./data/cifar-100-python//train')
    y_train = np.array(train[b'coarse_labels'])
    X_train = np.array(train[b'data'])

    test = unpickle('./data/cifar-100-python//test')
    y_test = np.array(test[b'coarse_labels'])
    X_test = np.array(test[b'data'])

    return coarse_label_names, X_train, y_train, X_test, y_test

## Load Data

In [5]:
# Load the raw CIFAR-100 data.
label_map, X_train, y_train, X_test, y_test = load_data()

# Data organizations:
# Train data: 49000 samples from original train set: 1~49,000
# Validation data: 1000 samples from original train set: 49,000~50,000
# Test data: 10000 samples from original test set: 1~10,000
# Development data (for gradient check): 100 from the train set: 1~49,000
num_training = 49000
num_validation = 1000
num_dev = 100

X_val = X_train[-num_validation:, :]
y_val = y_train[-num_validation:]

mask = np.random.choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

X_train = X_train[:num_training, :]
y_train = y_train[:num_training]

# Preprocessing: subtract the mean value across every dimension for training data
mean_image = np.mean(X_train, axis=0)

X_train = X_train.astype(np.float32) - mean_image.astype(np.float32)
X_val = X_val.astype(np.float32) - mean_image
X_test = X_test.astype(np.float32) - mean_image
X_dev = X_dev.astype(np.float32) - mean_image

print(X_train.shape, X_val.shape, X_test.shape, X_dev.shape)
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('Development data shape:', X_dev.shape)
print('Development data shape', y_dev.shape)

(49000, 3072) (1000, 3072) (10000, 3072) (100, 3072)
Train data shape:  (49000, 3072)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3072)
Validation labels shape:  (1000,)
Test data shape:  (10000, 3072)
Test labels shape:  (10000,)
Development data shape: (100, 3072)
Development data shape (100,)


## Part 1: TensorFlow MLP
In this part, you will use TensorFlow modules to implement an MLP. We provide a demo of a two-layer net; for more information, look at https://www.tensorflow.org/guide/keras and https://www.tensorflow.org/guide/eager.

You will need to implement a multi-layer network with 3 layers in a similar style.

### Demo: Two-layer MLP in Tensorflow

In [6]:
## Demo: Two-layer net in tensorflow (eager execution mode)
hidden_dim = 300
reg_tf = tf.constant(0.01)

# define a tf.keras.Model class
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()
        self.W1 = tf.Variable(1e-2*np.random.rand(3072, hidden_dim).astype('float32'))
        self.b1 = tf.Variable(np.zeros((hidden_dim,)).astype('float32'))
        self.W2 = tf.Variable(1e-2*np.random.rand(hidden_dim, 20).astype('float32'))
        self.b2 = tf.Variable(np.zeros((20,)).astype('float32'))
    def call(self, inputs):
        """Run the model."""
        h1 = tf.nn.relu(tf.matmul(inputs, self.W1) + self.b1)
        out = tf.matmul(h1, self.W2) + self.b2
        return out

# Define and calculate loss function (Note that in eager execution, loss must be in a function)
def loss(model, inputs, targets, reg = tf.constant(0.01)):
    out = model(inputs)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits= out, labels=tf.one_hot(targets,20))
    L2_loss = tf.nn.l2_loss(model.W1) + tf.nn.l2_loss(model.W2)
    return tf.reduce_mean(cross_entropy) + reg * L2_loss

# calculate gradients for all variables using tf.GradientTape
def grad(model, inputs, targets, reg = tf.constant(0.01)):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, reg=reg)
    return tape.gradient(loss_value, [model.W1, model.b1, model.W2, model.b2])

# calculate classification accuracy
def eval_acc(model, inputs, targets):
    correct_prediction = tf.equal(targets, tf.cast(tf.argmax(model(inputs),1), tf.uint8))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

num_train = 49000
batch_size = 500
num_batch = num_train//batch_size
num_epochs = 12
model = Model()
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

for e in range(num_epochs):
    for i in range(num_batch):
        batch_xs, batch_ys = X_train[i*batch_size:(i+1)*batch_size], y_train[i*batch_size:(i+1)*batch_size]
        x_tf = tf.Variable(batch_xs, dtype = tf.float32)
        y_tf = tf.Variable(batch_ys, dtype = tf.uint8)

        grads = grad(model, x_tf, y_tf, reg_tf)
        #optimization based on calculated gradients
        optimizer.apply_gradients(zip(grads, [model.W1, model.b1, model.W2, model.b2]))

    x_tf = tf.Variable(X_val, dtype = tf.float32)
    y_tf = tf.Variable(y_val, dtype = tf.uint8)
    accuracy = eval_acc(model, x_tf, y_tf)
    val_acc = accuracy.numpy()
    print('epoch {}: valid acc = {}'.format(e+1, val_acc))

x_tf = tf.Variable(X_test, dtype = tf.float32)
y_tf = tf.Variable(y_test, dtype = tf.uint8)
accuracy = eval_acc(model, x_tf, y_tf)
test_acc = accuracy.numpy()
print('test acc = {}'.format(test_acc))

epoch 1: valid acc = 0.0860000029206276
epoch 2: valid acc = 0.10400000214576721
epoch 3: valid acc = 0.0989999994635582
epoch 4: valid acc = 0.14499999582767487
epoch 5: valid acc = 0.12600000202655792
epoch 6: valid acc = 0.12600000202655792
epoch 7: valid acc = 0.15000000596046448
epoch 8: valid acc = 0.15700000524520874
epoch 9: valid acc = 0.16099999845027924
epoch 10: valid acc = 0.1770000010728836
epoch 11: valid acc = 0.20600000023841858
epoch 12: valid acc = 0.2070000022649765
test acc = 0.21969999372959137


### Create a Deeper Network

<span style="color:red"><strong>TODO</strong></span>: Create your MLP in tensorflow. Since you are going to create a deeper neural network, it is recommended to use "list" to store your network parameters (weights and bias). Consider using a loop to create your MLP network.

The model structure is described below:

**input tensor** -> **Dense layer (relu, 100 hidden dims)** -> **Dense layer (relu, 200 hidden dims)** -> **Affine layer ( 20 hidden dims)** -> **Softmax**

And add L2 regularization with 0.01 regularization weight when calculating the loss.

<span style="color:red"><strong>TODO</strong></span>: define the network structure

In [7]:
#Define network structure
input_dim = 3072
num_classes = 20
reg_tf = tf.constant(0.01) # regularization parameter
# %%%%%%%%%%%%%% implement your code below (3 lines) %%%%%%%%%%%%%%
hidden_dims = [100, 200, 20]  # list of hidden layer dimensions
num_layers = len(hidden_dims)  # total number of layers
layer_dims = [input_dim] + hidden_dims  # list of all layer dimensions including input
# %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%

<span style="color:red"><strong>TODO</strong></span>: define the network class

In [8]:
import tensorflow as tf


# using Clousre
class Model(tf.keras.Model):
    def __init__(self):
        super(Model, self).__init__()

        # %%%%%%%%%%%%%% implement your code below (2 lines) %%%%%%%%%%%%%%
        self.W = [
            tf.Variable(
                tf.random.normal([layer_dims[i], layer_dims[i+1]], stddev=0.1),
                name=f'W{i}'
            ) for i in range(num_layers)
        ]
        self.b = [
            tf.Variable(
                tf.zeros([layer_dims[i+1]]),
                name=f'b{i}'
            ) for i in range(num_layers)
        ]
        # %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%

    def call(self, inputs):
        hidden_layers = []

        # Add Affine layers with ReLU activation
        for i in range(num_layers - 1):  # iterate over each dense layer
            # %%%%%%%%%%%%%% implement your code below (2 lines) %%%%%%%%%%%%%%
            temp = tf.matmul(inputs, self.W[i]) + self.b[i]  # implement the dense layer
            hidden_layers.append(tf.nn.relu(temp))  # append with ReLU activation
            inputs = hidden_layers[-1]  # update input for next layer
            # %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%

        # Final layer
        # %%%%%%%%%%%%%% implement your code below (1 line) %%%%%%%%%%%%%%
        out = tf.matmul(inputs, self.W[-1]) + self.b[-1]  # implement final layer
        # %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%

        return tf.nn.softmax(out)  # Apply softmax to get probabilities


<span style="color:red"><strong>TODO</strong></span>: define the loss function

In [9]:
def loss(model, inputs, targets, reg = tf.constant(0.01)):
    L2_loss = 0.0 #initialize the loss
    # %%%%%%%%%%%%%% implement your code below (2 lines) %%%%%%%%%%%%%%
    out = model(inputs)  # get model output
    # Calculate cross entropy loss with softmax
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.cast(targets, tf.int32),
        logits=out
    )
    # %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%
    for i in range(num_layers):
        # %%%%%%%%%%%%%% implement your code below (1 line) %%%%%%%%%%%%%%
        L2_loss += tf.reduce_sum(tf.square(model.W[i]))    #add the L2 regularization
        # %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%
    return tf.reduce_mean(cross_entropy) + reg * L2_loss

We can re-utilize the gradient calculation(**def grad()**), accuracy evaluation (**def eval_acc()**) function and the training loop:


In [10]:
def grad(model, inputs, targets, reg = tf.constant(0.01)):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, reg=reg)
    return tape.gradient(loss_value, (model.W + model.b))

def eval_acc(model, inputs, targets):
    correct_prediction = tf.equal(targets, tf.cast(tf.argmax(model(inputs),1), tf.uint8))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

num_train = 49000
batch_size = 500
num_batch = num_train//batch_size
num_epochs = 10
model = Model()
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)

for e in range(num_epochs):
    for i in range(num_batch):
        batch_xs, batch_ys = X_train[i*batch_size:(i+1)*batch_size], y_train[i*batch_size:(i+1)*batch_size]
        x_tf = tf.Variable(batch_xs, dtype = tf.float32)
        y_tf = tf.Variable(batch_ys, dtype = tf.uint8)

        grads = grad(model, x_tf, y_tf, reg_tf)
        optimizer.apply_gradients(zip(grads, (model.W + model.b)))

    x_tf = tf.Variable(X_val, dtype = tf.float32)
    y_tf = tf.Variable(y_val, dtype = tf.uint8)
    accuracy = eval_acc(model, x_tf, y_tf)
    val_acc = accuracy.numpy()
    print('epoch {}: valid acc = {}'.format(e+1, val_acc))

x_tf = tf.Variable(X_test, dtype = tf.float32)
y_tf = tf.Variable(y_test, dtype = tf.uint8)
accuracy = eval_acc(model, x_tf, y_tf)
test_acc = accuracy.numpy()
print('test acc = {}'.format(test_acc))

epoch 1: valid acc = 0.05999999865889549
epoch 2: valid acc = 0.07800000160932541
epoch 3: valid acc = 0.09700000286102295
epoch 4: valid acc = 0.08799999952316284
epoch 5: valid acc = 0.10300000011920929
epoch 6: valid acc = 0.10899999737739563
epoch 7: valid acc = 0.10999999940395355
epoch 8: valid acc = 0.13699999451637268
epoch 9: valid acc = 0.15000000596046448
epoch 10: valid acc = 0.14100000262260437
test acc = 0.12919999659061432


## Part 2: Introduction to TensorFlow.keras

As you can see, as the network structure becomes larger it gets harder to handle variables from every layer. Here we introduce the `tf.keras` tool to build the network in a much simpler way.

<span style="color:red"><strong>TODO</strong></span>: Follow this official example: https://www.tensorflow.org/datasets/keras_example#step_2_create_and_train_the_model to build and train the MLP model shown below:

**input tensor** -> **Dense layer (relu, 100 hidden dims)** -> **Dense layer (relu, 200 hidden dims)** -> **Affine layer (20 hidden dims)** -> **Softmax**

*You should keep the same optimizer (SGD with the same learning rate and batch size) and loss function (cross entropy with L2 regularization) as the previous task.*

You need to study the usage of
[`tf.keras.layers.Dense`](https://keras.io/api/layers/core_layers/dense/) to fully equip all the functionalities that we used above.

You need to check the usage of [`model.fit()`](https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit) and feed the model with our own data.

As the result, it should return a similar accuracy as our above sample (~0.33 validation accuracy ). The reason why we want you to replicate the model and hyper-parameters is that you can learn almost all aspects of basic model implementation in Keras. In the future, you can freely use these useful building blocks to build your own models.

**Tips:**
* Softmax is also registered as a layer operation in tf.keras.
* You can use `model.summary()` to visualize the model after you build it.
* Use `verbose=2` in `model.fit()` to get similar training logs
* **Keras Important API Reference:** https://keras.io/api/

In [11]:
initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=1e-3, seed=None) # the default initialization of Dense Layer is not random normalization
#############################################################
# TODO: build the model with tf.keras.models.Sequential
# %%%%%%%%%%%%%% implement your code below (1 line) %%%%%%%%%%%%%%
keras_model = keras_model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(3072,)),
    tf.keras.layers.Dense(100, activation='relu', kernel_initializer=initializer),
    tf.keras.layers.Dense(200, activation='relu', kernel_initializer=initializer),
    tf.keras.layers.Dense(20, kernel_initializer=initializer),
    tf.keras.layers.Softmax()]) # add code into the parentheses
# %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%
#############################################################



#############################################################
# TODO: compile the model, set optimizer and loss and metrics
# %%%%%%%%%%%%%% implement your code below (1 line) %%%%%%%%%%%%%%
keras_model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
# %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%
#############################################################



#############################################################
# TODO: train the model with our own dataset
keras_model.summary()

# train the model
y_train_keras = tf.one_hot(y_train, 20)
y_val_keras = tf.one_hot(y_val, 20)
# %%%%%%%%%%%%%% implement your code below (1 line) %%%%%%%%%%%%%%
keras_model.fit(X_train,
    y_train_keras,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val_keras),
    verbose=2) # add code into the parentheses
# %%%%%%%%%%%%%% your code ends here %%%%%%%%%%%%%%
#############################################################

  super().__init__(**kwargs)


Epoch 1/10
1532/1532 - 6s - 4ms/step - accuracy: 0.0891 - loss: 2.9943 - val_accuracy: 0.0880 - val_loss: 2.9827
Epoch 2/10
1532/1532 - 4s - 3ms/step - accuracy: 0.1179 - loss: 2.7775 - val_accuracy: 0.1540 - val_loss: 2.6610
Epoch 3/10
1532/1532 - 7s - 4ms/step - accuracy: 0.2024 - loss: 2.5402 - val_accuracy: 0.1980 - val_loss: 2.5099
Epoch 4/10
1532/1532 - 4s - 3ms/step - accuracy: 0.2535 - loss: 2.3934 - val_accuracy: 0.2520 - val_loss: 2.4030
Epoch 5/10
1532/1532 - 4s - 3ms/step - accuracy: 0.2876 - loss: 2.2935 - val_accuracy: 0.2800 - val_loss: 2.3325
Epoch 6/10
1532/1532 - 6s - 4ms/step - accuracy: 0.3153 - loss: 2.2121 - val_accuracy: 0.3030 - val_loss: 2.2398
Epoch 7/10
1532/1532 - 4s - 3ms/step - accuracy: 0.3376 - loss: 2.1438 - val_accuracy: 0.3240 - val_loss: 2.2084
Epoch 8/10
1532/1532 - 6s - 4ms/step - accuracy: 0.3535 - loss: 2.0871 - val_accuracy: 0.3310 - val_loss: 2.2003
Epoch 9/10
1532/1532 - 6s - 4ms/step - accuracy: 0.3690 - loss: 2.0386 - val_accuracy: 0.3140 - 

<keras.src.callbacks.history.History at 0x7dbe2e6822f0>