In [1]:
import numpy as np 
import math
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import sklearn.datasets
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
%matplotlib inline

In [2]:
train = pd.read_csv("./digit-recognizer/train.csv")
#test = pd.read_csv("/content/drive/My Drive/20192/digit-recognizer/test.csv")

Y_train = train["label"]

# Drop 'label' column
X_train = train.drop(labels = ["label"],axis = 1)

# Free some space
del train

# Normalize the data
X_train = np.array(X_train) / 255.0

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state=2)

In [3]:
def to_categorical(y, num_classes, dtype='float32'):
    #from keras utils
    y = np.array(y, dtype='int')
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype=dtype)
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

In [4]:
# One-hot coding
Y_train = to_categorical(Y_train, num_classes = 10)
Y_val = to_categorical(Y_val, num_classes = 10)

# Transpose X, Y
X_train = np.array(X_train.T)
Y_train = np.array(Y_train.T)
X_val = np.array(X_val.T)
Y_val = np.array(Y_val.T)

In [38]:
# Check shape of X, Y
print ("X_train.shape = " + str(X_train.shape)
        + "\nY_train.shape = " + str(Y_train.shape))
print ("X_val.shape = " + str(X_val.shape)
        + "\nY_val.shape = " + str(Y_val.shape))

X_train.shape = (784, 37800)
Y_train.shape = (10, 37800)
X_val.shape = (784, 4200)
Y_val.shape = (10, 4200)


In [28]:
def initialize_parameters_deep(layers_dims):
    parameters = {}
    L = len(layer_dims)
    
    for l in range (1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters['b' + str(l)] = np.zeros(shape=(layer_dims[l], 1))
    
    return parameters

In [44]:
def linear_forward(A, W, b):
    print("---")
    print(W.shape)
    print(A.shape)
    print(b.shape)
    Z = np.dot(W, A) + b
    linear_cache = (A, W, b)
    
    return Z, linear_cache

In [52]:
def theta(Z):
    T = np.exp(Z) / np.sum(np.exp(Z), axis = 0)
    return T, Z

In [35]:
def relu(Z):
    R = np.maximum(0, Z)
    return R, Z

In [39]:
def linear_activation_forward(A_prev, W, b, activation):
    if activation == "theta":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = theta(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    cache = (linear_cache, Z)

    return A, cache

In [53]:
def L_model_forward(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A, W=parameters['W' + str(l)], b=parameters['b' + str(l)], activation="relu")
        caches.append(cache) 
    
    AL, cache = linear_activation_forward(A, W=parameters['W' + str(l+1)], b=parameters['b' + str(l+1)], activation="theta")
    caches.append(cache)
    
    return AL, caches

In [14]:
def compute_cost(AL, Y):
    m = Y.shape[1]
    
    cost = -np.sum(Y*np.log(AL) + (1-Y)*np.log(1-AL)) / m
    cost = np.squeeze(cost)
    
    return cost

In [15]:
def linear_backward(dZ, linear_cache):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    
    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    dA_prev = np.dot(W.T, dZ)
    
    return dA_prev, dW, db

In [70]:
def linear_activation_backward(dA, Y, cache, activation):
    linear_cache, Z = cache 
    A, W, b = linear_cache
    if activation == "relu":
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0 ] = 0
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    return dA_prev, dW, db

In [78]:
def L_model_backward(AL, Y, caches):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    current_cache = caches[L-1]
    linear_cache, Z = current_cache
    dZ = AL - Y
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = dA_prev, dW, db
    for l in reversed(range(L-1)):

        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(dA=grads["dA" + str(l+1)], Y=Y, cache=current_cache, activation="relu")
        grads["dA" + str(l)] = dA_prev
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

In [18]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2 

    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads['dW' + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads['db' + str(l+1)]
        
    return parameters

In [19]:
def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    m = X.shape[1]
    mini_batches = []
    np.random.seed(seed)

    # Shuffle (X, Y)
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((10,m))

    num_complete_minibatches = math.floor(m / mini_batch_size) 

    #Partition 
    for k in range(0, num_complete_minibatches):
        mini_batch_X = shuffled_X[:, k * mini_batch_size:
                                  (k+1) * mini_batch_size]
        mini_batch_Y = shuffled_Y[:, k * mini_batch_size:(k+1)
                                    * mini_batch_size]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    if m % mini_batch_size !=0:
        end = m - mini_batch_size * math.floor(m / mini_batch_size)
        mini_batch_X = shuffled_X[:, num_complete_minibatches
                                    * mini_batch_size:]
        mini_batch_Y = shuffled_Y[:, num_complete_minibatches
                                    * mini_batch_size:]
        mini_batch = (mini_batch_X, mini_batch_Y)
        mini_batches.append(mini_batch)

    return mini_batches

In [45]:
def L_layer_model(X, Y, layers_dims, num_epochs = 3000, learning_rate = 0.05, mini_batch_size = 64, print_cost = False):
  parameters = initialize_parameters_deep(layers_dims)
  seed = 10
  costs = []

  for i in range(num_epochs):
    seed = seed + 1
    minibatches = random_mini_batches(X, Y, mini_batch_size, seed)
    cost_total = 0

    for minibatch in minibatches:
      #Select a minibatch
      (minibatch_X, minibatch_Y) = minibatch

      #foward propagation
      AL, caches = L_model_forward(minibatch_X, parameters)

      cost_total += compute_cost(AL, minibatch_Y)

      grads = L_model_backward(AL, minibatch_Y, caches)
      #update parameter
      parameters = update_parameters(parameters, grads, learning_rate)

    cost_avg = cost_total / len(minibatches)
    if print_cost:
      print ("Cost after iteration %i: %f" % (i, cost_avg))
    if print_cost:
      costs.append(cost_avg)

  plt.plot(costs)
  plt.ylabel('cost')
  plt.xlabel('epochs (per 100)')
  plt.title("Learning rate = " + str(learning_rate))
  plt.show()
  return parameters

In [42]:
layers_dims = [784, 64, 30, 18, 10]

In [79]:
dnn_parameters = L_layer_model(X_train, Y_train, layers_dims, num_epochs = 1,
                               learning_rate=0.05, mini_batch_size=32, print_cost = True)

---
(64, 784)
(784, 32)
(64, 1)
---
(30, 64)
(64, 32)
(30, 1)
---
(18, 30)
(30, 32)
(18, 1)
---
(10, 18)
(18, 32)
(10, 1)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 18 but corresponding boolean dimension is 30