# Intro to AI-driven Science on Supercomputers

## Week 2 Homework

#### Dan Horner (danhorner@berkeley.edu)
---

# Hyperparameter search

## Set up

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

import tensorflow as tf

import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
np.random.seed(1)

In [3]:
# Rebuild the MNIST arrays the same way as in the previous notebook.
(x_train_orig, y_train_orig), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Cast the images to float32 and rescale them by 1/255.
x_train_orig = x_train_orig.astype(np.float32) / 255.
x_test = x_test.astype(np.float32) / 255.

# Flatten each image into one feature row; the shape is printed before and after.
print(x_train_orig.shape)
x_train_orig = x_train_orig.reshape(x_train_orig.shape[0], np.prod(x_train_orig.shape[1:]))
x_test = x_test.reshape(x_test.shape[0], np.prod(x_test.shape[1:]))
print(x_train_orig.shape)

# Integer class labels as int32.
y_train_orig, y_test = y_train_orig.astype(np.int32), y_test.astype(np.int32)

# Hold out 20% of the original training set for validation.
# No random_state is passed, so the split depends on the state of NumPy's global RNG.
x_train_i, x_val, y_train_i, y_val = train_test_split(
    x_train_orig, y_train_orig, test_size=0.2
)
print(x_train_i.shape)
print(x_val.shape)

print()
# Note: the 'train:' count below is the size BEFORE the validation split.
print('MNIST data loaded: train:',len(x_train_orig),'test:',len(x_test))
print('X_train:', x_train_i.shape)
print('y_train:', y_train_i.shape)
print('X_val:', x_val.shape)
print('y_val:', y_val.shape)

# One-hot encode the labels of every split.
nb_classes = 10
y_train_onehot_i, y_val_onehot, y_test_onehot = (
    tf.keras.utils.to_categorical(labels, nb_classes)
    for labels in (y_train_i, y_val, y_test)
)

(60000, 28, 28)
(60000, 784)
(48000, 784)
(12000, 784)

MNIST data loaded: train: 60000 test: 10000
X_train: (48000, 784)
y_train: (48000,)
X_val: (12000, 784)
y_val: (12000,)


In [4]:
# Here we import an implementation of a two-layer neural network.
# This code is based on pieces of the first assignment from Stanford's CS231n course,
# hosted at https://github.com/cs231n/cs231n.github.io under the MIT license.
from fc_net import TwoLayerNet

In [11]:
def accuracy(model, x, true_values):
    """Return the fraction of examples in ``x`` that ``model`` classifies correctly.

    Args:
        model: object whose ``loss(x)`` call (no labels) returns per-class scores,
            one row per example.
        x: inputs forwarded unchanged to ``model.loss``.
        true_values: integer class labels, one per row of the scores.

    Returns:
        Number of matching predictions divided by the number of predictions.
    """
    # The predicted class is the column holding the highest score in each row.
    predicted_labels = np.argmax(model.loss(x), axis=1)
    num_correct = (predicted_labels == true_values).sum()
    return num_correct / predicted_labels.shape[0]

In [12]:
# a simple implementation of stochastic gradient descent
def sgd(model, gradients, learning_rate):
    """Apply one plain gradient-descent update to every entry of ``model.params``.

    Args:
        model: object with a ``params`` mapping of parameter name -> value.
        gradients: mapping with a gradient for each name in ``model.params``.
        learning_rate: step size multiplying each gradient.

    Returns:
        The same ``model`` object, with ``params`` updated.
    """
    for name in model.params:
        # Rebind to a new value (no in-place "-=") so the previous array is left untouched.
        model.params[name] = model.params[name] - learning_rate * gradients[name]
    return model

In [13]:
# one training step
def learn(model, x_train, y_train_onehot, learning_rate):
    """Run a single training step: one loss/gradient evaluation followed by one SGD update.

    Args:
        model: object whose ``loss(x, y)`` returns a ``(loss, gradients)`` pair.
        x_train: batch of inputs passed to ``model.loss``.
        y_train_onehot: matching batch of targets passed to ``model.loss``.
        learning_rate: step size handed to ``sgd``.

    Returns:
        ``(loss, model)``: the loss measured before the update, and the updated model.
    """
    loss, gradients = model.loss(x_train, y_train_onehot)
    return loss, sgd(model, gradients, learning_rate)

In [30]:

# Random hyperparameter search.
# Each trial draws one configuration uniformly from the ranges below, trains a fresh
# TwoLayerNet with mini-batch SGD, and scores it on the validation split. The best
# trial's accuracy, configuration and model are kept in best_acc / best_params / best_model.

n_search = 100   # number of random configurations to try
num_epochs = 50  # passes over the training split per configuration

# (low, high) bounds passed to np.random.uniform for each hyperparameter
hd_rng = (100, 1000)              # hidden layer width (truncated to int)
ws_rng = (0.01, 0.2)              # weight_scale given to TwoLayerNet
learning_rate_rng = (0.05, 0.50)  # SGD step size
batch_size_rng = (100, 100)#(50, 500)  -- low == high, so the batch size is pinned at 100

best_acc = 0.0
# Reset alongside best_acc so later cells see None (not a NameError or a stale result
# from a previous run) when no trial improves on best_acc.
best_params = None
best_model = None

# These sizes do not change between trials, so compute them once.
num_features = x_train_i.shape[1]
num_examples = x_train_i.shape[0]

for ival in range(n_search):
    print(ival)

    # Keep these four draws in this order: reordering them changes which
    # configurations are sampled for a given seed.
    hd = int(np.random.uniform(low = hd_rng[0], high = hd_rng[1]))
    ws = np.random.uniform(low = ws_rng[0], high = ws_rng[1])
    learning_rate = np.random.uniform(low = learning_rate_rng[0], high = learning_rate_rng[1])
    batch_size = int(np.random.uniform(low = batch_size_rng[0], high = batch_size_rng[1]))

    # Work on per-trial copies so every trial starts from the same, unshuffled data.
    x_train = x_train_i.copy()
    y_train = y_train_i.copy()
    y_train_onehot = y_train_onehot_i.copy()

    # Whole batches only; any leftover examples at the end of an epoch are skipped.
    num_batches = num_examples // batch_size

    model = TwoLayerNet(input_dim=num_features, hidden_dim=hd, num_classes=nb_classes, weight_scale=ws)

    # Per-step training loss for this trial (overwritten by the next trial).
    losses = np.zeros(num_batches*num_epochs,)
    indices = np.arange(num_examples)

    i = 0
    for epoch in range(num_epochs):
        # in each epoch, we loop over all of the training examples
        for step in range(num_batches):
            # grabbing the next batch
            offset = step * batch_size
            batch_range = range(offset, offset+batch_size)
            x_train_batch = x_train[batch_range, :]
            y_train_batch = y_train_onehot[batch_range,:]

            # feed the next batch in to do one sgd step
            loss, model = learn(model, x_train_batch, y_train_batch, learning_rate)
            losses[i] = loss
            i += 1

        # Reshuffle the data so that the next epoch sees a new set of batches.
        # The same permutation is applied to all three arrays so rows stay aligned.
        np.random.shuffle(indices)
        x_train = x_train[indices,:]
        y_train = y_train[indices] # keep this shuffled the same way for use in accuracy calculation
        y_train_onehot = y_train_onehot[indices,:]

    # Score once on the validation split and reuse the value for both the
    # best-so-far check and the printed report.
    acc = accuracy(model, x_val, y_val)
    if acc > best_acc:
        best_acc = acc
        best_params = (hd, ws, learning_rate, batch_size)
        best_model = model
        print('Best!')
    print(acc)
    print(hd, ws, learning_rate, batch_size)
    print('')

    

0
Best!
0.9765
336 0.15981224951291315 0.4472879912339568 100

1
Best!
0.98075
506 0.02492791121867254 0.39151707425964777 100

2
0.9729166666666667
197 0.19594247701670905 0.43102109171724956 100

3
0.9763333333333334
520 0.163670370040972 0.1908233578663519 100

4
0.9775
524 0.09623376293122037 0.20388447507625584 100

5
0.9719166666666667
215 0.14881931287840053 0.0880725328553 100

6
0.9764166666666667
773 0.10517155889936351 0.22418485154658807 100

7
0.9770833333333333
677 0.08363017459680365 0.15082383381806985 100

8
0.9745833333333334
822 0.16607510215879084 0.2142820315336918 100

9
0.97675
708 0.1457563567593147 0.40348901023168365 100

10
0.97925
968 0.1131071693642286 0.2823896597888211 100

11
0.9749166666666667
633 0.18515181423625376 0.35585508227573714 100

12
0.9749166666666667
308 0.07149125137338239 0.05276130328307656 100

13
Best!
0.9819166666666667
995 0.02527787077651309 0.44246482823913197 100

14
0.97775
965 0.056495551028172844 0.07582244377141942 100

15
0.9

In [31]:
# Best configuration found by the search, as (hd, ws, learning_rate, batch_size)
best_params

(995, 0.02527787077651309, 0.44246482823913197, 100)

In [32]:
# Accuracy of the selected model on the train, validation and test splits (in that order).
# The training score uses the pristine split (x_train_i / y_train_i) instead of the
# search loop's x_train / y_train scratch variables: those are whatever the last trial
# left behind, and can be misaligned if the search cell was interrupted mid-shuffle.
print(accuracy(best_model, x_train_i, y_train_i))
print(accuracy(best_model, x_val, y_val))
print(accuracy(best_model, x_test, y_test))

1.0
0.9819166666666667
0.9824


In [None]:
# TODO: other things to vary in the search: number of epochs, additional layers, layer widths, scale factors

# TODO: compare validation, test, and train results

# TODO: plot accuracy as well