In [1]:
# Libraries I will use to build the model
import numpy as np

In [2]:
# Libraries to install mnist dataset from web
import gzip
import os
from urllib.request import urlretrieve

In [3]:
def download_mnist(url, filename):
    """Download ``url`` to ``filename`` unless ``filename`` already exists.

    The payload is written to ``<filename>.part`` first and only renamed to
    ``filename`` after ``urlretrieve`` has returned. An interrupted transfer
    therefore cannot leave a truncated file behind that a later call would
    report as already existing.

    Parameters
    ----------
    url : str
        Location of the file to fetch.
    filename : str, bytes or os.PathLike
        Destination path.

    Raises
    ------
    Any exception raised by ``urlretrieve`` or ``os.replace`` is propagated;
    in that case the temporary ``.part`` file is removed.
    """
    if os.path.exists(filename):
        print("File ", filename, " already exists.")
        return

    print("Downloading ", filename)
    partial = os.fspath(filename)
    partial += b".part" if isinstance(partial, bytes) else ".part"
    try:
        urlretrieve(url, partial)
        # Rename only once the full payload is on disk.
        os.replace(partial, filename)
    finally:
        # Still present only if the transfer or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)

In [4]:
def read_mnist_images(filename):
    """Read a gzip-compressed MNIST image file into an array of pixel rows.

    The first 16 bytes of the decompressed stream are skipped; every
    remaining byte is interpreted as one uint8 pixel and the result is
    reshaped to ``(n_images, 784)``.
    """
    header_bytes = 16
    pixels_per_image = 784
    with gzip.open(filename, 'rb') as archive:
        raw = archive.read()
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=header_bytes)
    return pixels.reshape(-1, pixels_per_image)

In [5]:
def read_mnist_labels(filename):
    """Read a gzip-compressed MNIST label file into a flat uint8 array.

    The first 8 bytes of the decompressed stream are skipped; every
    remaining byte becomes one label.
    """
    header_bytes = 8
    with gzip.open(filename, 'rb') as archive:
        raw = archive.read()
    return np.frombuffer(raw, dtype=np.uint8, offset=header_bytes)

In [6]:
# Fetch the four MNIST archives (train/test images and labels) into the working directory
base_url = 'http://yann.lecun.com/exdb/mnist/'
for archive_name in (
    'train-images-idx3-ubyte.gz',
    'train-labels-idx1-ubyte.gz',
    't10k-images-idx3-ubyte.gz',
    't10k-labels-idx1-ubyte.gz',
):
    download_mnist(base_url + archive_name, archive_name)

File  train-images-idx3-ubyte.gz  already exists.
File  train-labels-idx1-ubyte.gz  already exists.
File  t10k-images-idx3-ubyte.gz  already exists.
File  t10k-labels-idx1-ubyte.gz  already exists.


In [7]:
# Decode the downloaded archives into NumPy arrays, one (images, labels) pair per split
X_train, y_train = (
    read_mnist_images('train-images-idx3-ubyte.gz'),
    read_mnist_labels('train-labels-idx1-ubyte.gz'),
)
X_test, y_test = (
    read_mnist_images('t10k-images-idx3-ubyte.gz'),
    read_mnist_labels('t10k-labels-idx1-ubyte.gz'),
)

In [8]:
# Report the array shapes of both splits right after loading
for caption, values in (
    ("Train images shape:", X_train),
    ("Train labels shape:", y_train),
    ("Test images shape:", X_test),
    ("Test labels shape:", y_test),
):
    print(caption, values.shape)

Train images shape: (60000, 784)
Train labels shape: (60000,)
Test images shape: (10000, 784)
Test labels shape: (10000,)


In [9]:
# function to one hot encode labels
def one_hot_encoder(labels, num_classes=10):
    """Convert integer class labels into a one-hot matrix.

    Parameters
    ----------
    labels : array-like of int, shape (n,)
        Class index of every sample.
    num_classes : int, default 10
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray of float64, shape (n, num_classes)
        Row ``i`` is all zeros except for a 1 in column ``labels[i]``.

    Raises
    ------
    ValueError
        If ``labels`` is not one-dimensional (for instance when labels that
        were already one-hot encoded are passed in again).
    IndexError
        If a label lies outside ``[0, num_classes)``. Negative labels are
        rejected as well instead of silently wrapping around.
    """
    labels = np.asarray(labels)
    if labels.ndim != 1:
        raise ValueError(
            "labels must be a 1-D array of class indices, got shape {}".format(labels.shape)
        )

    encoded_labels = np.zeros(shape=(labels.shape[0], num_classes))
    if labels.size == 0:
        return encoded_labels

    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            "labels must lie in [0, {}), got range [{}, {}]".format(
                num_classes, labels.min(), labels.max()
            )
        )

    # One vectorized assignment: row i, column labels[i].
    encoded_labels[np.arange(labels.shape[0]), labels] = 1
    return encoded_labels

In [10]:
# Replace the integer labels of each split with their one-hot rows
y_train = one_hot_encoder(y_train)
y_test = one_hot_encoder(y_test)

In [11]:
# Report the shapes again now that the labels are one-hot encoded
for caption, values in (
    ("Train images shape:", X_train),
    ("Train labels shape:", y_train),
    ("Test images shape:", X_test),
    ("Test labels shape:", y_test),
):
    print(caption, values.shape)

Train images shape: (60000, 784)
Train labels shape: (60000, 10)
Test images shape: (10000, 784)
Test labels shape: (10000, 10)


In [12]:
# Pixel scaling helper
def normalize(array):
    """Return ``array`` divided by 255 as a new floating-point array."""
    max_pixel_value = 255.
    return array / max_pixel_value

In [13]:
# Scale the pixels of both splits
X_train = normalize(X_train)
X_test = normalize(X_test)

In [14]:
# Seed NumPy's global RNG so the shuffle inside train_valid_split is repeatable
np.random.seed(42)

# Hold out a random part of the training data for validation
def train_valid_split(valid_ratio = 0.1):
    """Shuffle the global ``X_train``/``y_train`` and split off a validation part.

    The rows are permuted with ``np.random.shuffle``; the first
    ``int(valid_ratio * n)`` shuffled rows form the validation set and the
    rest the training set.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)`` as new arrays; the globals
        themselves are not modified by this function.
    """
    order = np.arange(X_train.shape[0])
    np.random.shuffle(order)

    images = X_train[order]
    targets = y_train[order]

    n_valid = int(valid_ratio * images.shape[0])

    train_images, valid_images = images[n_valid:], images[:n_valid]
    train_targets, valid_targets = targets[n_valid:], targets[:n_valid]
    return train_images, train_targets, valid_images, valid_targets

In [15]:
# Rebind the training arrays to the shuffled training part and keep the held-out rows for validation
X_train, y_train, X_valid, y_valid = train_valid_split()
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)))

(54000, 784) (54000, 10) (6000, 784) (6000, 10)


In [16]:
# Rectified linear unit
def relu(x):
    """Return the element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

# Derivative of ReLU
def relu_prime(x):
    """Return a mask that is True where ``x`` is strictly positive and False elsewhere."""
    is_active = x > 0
    return is_active

In [17]:
# function for initializing model structure
def struct_model(n_inputs=784, n_hidden=784, n_outputs=10, scale=1.):
    """Create randomly initialised parameters for a two-layer network.

    All four arrays are drawn from ``U(-scale, scale)`` with NumPy's global
    RNG, in the order W1, b1, W2, b2. With the default arguments the draws are
    identical to the previous hard-coded 784-784-10 / U(-1, 1) version.

    Parameters
    ----------
    n_inputs, n_hidden, n_outputs : int
        Width of the input, hidden and output layer.
    scale : float, default 1.0
        Half-width of the uniform interval; must be positive.
        NOTE(review): U(-1, 1) is wide for a fan-in of 784; a smaller value
        such as ``sqrt(6 / n_inputs)`` may be worth trying if training starts
        with a very high loss.

    Returns
    -------
    tuple of numpy.ndarray
        ``W1`` (n_inputs, n_hidden), ``b1`` (1, n_hidden),
        ``W2`` (n_hidden, n_outputs), ``b2`` (1, n_outputs).

    Raises
    ------
    ValueError
        If ``scale`` is not positive.
    """
    if scale <= 0:
        raise ValueError("scale must be positive, got {}".format(scale))

    W1 = np.random.uniform(low=-scale, high=scale, size=(n_inputs, n_hidden))
    b1 = np.random.uniform(low=-scale, high=scale, size=(1, n_hidden))

    W2 = np.random.uniform(low=-scale, high=scale, size=(n_hidden, n_outputs))
    b2 = np.random.uniform(low=-scale, high=scale, size=(1, n_outputs))

    return W1, b1, W2, b2

In [18]:
# The network parameters live in module-level globals: feed_forward() and
# predict() read W1, b1, W2, b2 directly and update_weights() rebinds them.
###############################
W1, b1, W2, b2 = struct_model()
###############################

In [19]:
# feed forward function to calculate the output
def feed_forward(X=None):
    """Run the two-layer network forward using the global W1, b1, W2, b2.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_inputs), optional
        Inputs to propagate. Defaults to the global ``X_train`` so that
        existing ``feed_forward()`` calls behave exactly as before.

    Returns
    -------
    Z1 : hidden-layer pre-activations
    A1 : hidden-layer activations, ``relu(Z1)``
    Z2 : output pre-activations *after* the row-wise maximum was subtracted
    pred : row-wise softmax of ``Z2``
    """
    if X is None:
        X = X_train

    Z1 = np.dot(X, W1) + b1
    A1 = relu(Z1)

    Z2 = np.dot(A1, W2) + b2

    # Softmax normalization.
    # Subtracting the row maximum keeps exp() from overflowing (otherwise NaN/inf values appear).
    Z2 -= np.max(Z2, axis=1, keepdims=True)
    exp_scores = np.exp(Z2)  # computed once and reused for numerator and denominator
    pred = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    return Z1, A1, Z2, pred

In [20]:
# Cross-entropy is the training objective; it is also reported as a metric next to accuracy
def cross_entropy_loss(y_pred):
    """Mean cross-entropy between ``y_pred`` and the global one-hot ``y_train``.

    ``y_pred`` must have the same shape as ``y_train``; the sum over all
    entries is divided by the number of rows of ``y_train``.
    """
    epsilon = 1e-10  # keeps log() finite when a predicted probability is exactly 0
    m = y_train.shape[0]
    log_likelihood = np.sum(y_train * np.log(y_pred + epsilon))
    return -log_likelihood / m

def accuracy(pred):
    """Fraction of rows whose arg-max in ``pred`` equals the arg-max of the global ``y_train``."""
    predicted_classes = np.argmax(pred, axis=1)
    true_classes = np.argmax(y_train, axis=1)
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / y_train.shape[0]

In [21]:
# function for predicting the digit in the MNIST dataset image
def predict(img):
    """Predict the class of one image or of a batch of images.

    Uses the global W1, b1, W2, b2.

    Parameters
    ----------
    img : array-like
        Either a single flattened image of shape ``(n_inputs,)`` or a batch of
        shape ``(n_samples, n_inputs)``.

    Returns
    -------
    numpy integer or numpy.ndarray
        For a 1-D ``img`` a scalar class index (as before). For a 2-D ``img``
        an array of shape ``(n_samples,)`` with one class index per row; the
        previous version returned a single index into the flattened score
        matrix in that case.
    """
    img = np.asarray(img)
    single_image = img.ndim == 1

    Z1 = np.dot(img, W1) + b1
    A1 = relu(Z1)

    Z2 = np.atleast_2d(np.dot(A1, W2) + b2)

    # Softmax is strictly increasing within a row, so the arg-max of the raw
    # scores is the arg-max of the probabilities; normalising is not needed.
    classes = np.argmax(Z2, axis=1)

    return classes[0] if single_image else classes

In [22]:
# Accuracy on the held-out validation split
def valid_accuracy():
    """Fraction of validation samples for which ``predict`` returns the target class.

    Reads the global ``X_valid`` (one sample per row) and the one-hot
    ``y_valid``; every row is classified with a separate ``predict`` call.
    """
    n_samples = X_valid.shape[0]
    predicted = np.array([predict(X_valid[row]) for row in range(n_samples)])
    expected = np.argmax(y_valid, axis=1)

    return np.sum(expected == predicted) / y_valid.shape[0]

In [23]:
def back_propagation(Z1, A1, Z2, pred):
    """Compute the parameter gradients for one pass over the global training set.

    ``Z1``, ``A1`` and ``pred`` are the values returned by ``feed_forward``;
    ``Z2`` is accepted but not used. The globals ``X_train``, ``y_train`` and
    ``W2`` are read.

    Returns
    -------
    tuple
        ``(dW1, db1, dW2, db2)``, each divided by the number of training rows.
    """
    n_samples = X_train.shape[0]

    # Error signal at the output layer: predictions minus one-hot targets.
    delta_out = pred - y_train
    grad_W2 = A1.T.dot(delta_out) / n_samples
    grad_b2 = delta_out.sum(axis=0, keepdims=True) / n_samples

    # Push the error back through W2; relu_prime zeroes units that were inactive.
    delta_hidden = delta_out.dot(W2.T) * relu_prime(Z1)
    grad_W1 = X_train.T.dot(delta_hidden) / n_samples
    grad_b1 = delta_hidden.sum(axis=0, keepdims=True) / n_samples

    return grad_W1, grad_b1, grad_W2, grad_b2

In [24]:
def update_weights(dW1, db1, dW2, db2, learning_rate):
    """Take one gradient-descent step on the global parameters.

    Each of W1, b1, W2, b2 is rebound to a new array
    ``param - learning_rate * gradient``; the previous arrays are not
    modified in place.
    """
    global W1, b1, W2, b2

    W1, b1, W2, b2 = (
        W1 - learning_rate * dW1,
        b1 - learning_rate * db1,
        W2 - learning_rate * dW2,
        b2 - learning_rate * db2,
    )

In [25]:
# defining the model checkpoint callback -> saving the weights and biases of the model with best validation accuracy
def model_checkpoint(best_val_acc, current_val_acc):
    """Save the global W1, b1, W2, b2 when validation accuracy did not get worse.

    Parameters
    ----------
    best_val_acc : float
        Best validation accuracy seen before this call.
    current_val_acc : float
        Validation accuracy of the current weights.

    Returns
    -------
    float
        The best validation accuracy after this call: ``current_val_acc`` if a
        checkpoint was written, otherwise ``best_val_acc``. Rebinding the
        parameter inside this function is invisible to the caller, so the
        caller has to keep this value (or track the maximum itself).

    Side effects
    ------------
    Writes ``W1.npy``, ``b1.npy``, ``W2.npy`` and ``b2.npy`` into ``weights/``
    (relative to the working directory); the directory is created if missing.
    """
    path = "weights/"
    if current_val_acc >= best_val_acc:
        # np.save does not create missing directories.
        os.makedirs(path, exist_ok=True)
        np.save(path + "W1.npy", W1)
        np.save(path + "b1.npy", b1)
        np.save(path + "W2.npy", W2)
        np.save(path + "b2.npy", b2)
        return current_val_acc
    return best_val_acc

In [26]:

def start_gradient_descent(number_of_iterations, learning_rate):
    """Train the global network with full-batch gradient descent.

    Every iteration runs ``feed_forward`` and ``back_propagation`` on the
    whole training set, applies ``update_weights``, prints the metrics and
    offers the updated weights to ``model_checkpoint``.

    The best validation accuracy is tracked in this loop, so
    ``model_checkpoint`` is always called with the true running best and only
    overwrites the saved weights when the current ones are at least as good.
    Previously ``best_val_acc`` stayed 0 and every iteration overwrote the
    checkpoint.

    Parameters
    ----------
    number_of_iterations : int
        Number of gradient steps.
    learning_rate : float
        Step size passed to ``update_weights``.
    """
    # my weights and biases are global variables
    print("-"*80)

    best_val_acc = 0

    for i in range(number_of_iterations):

        # feed forward step
        Z1, A1, Z2, pred = feed_forward()

        # back propagation step
        dW1, db1, dW2, db2 = back_propagation(Z1, A1, Z2, pred)

        # applying the gradients to weights and biases
        update_weights(dW1, db1, dW2, db2, learning_rate)

        # Training metrics use `pred`, i.e. the weights *before* this step's
        # update; the validation accuracy uses the updated weights.
        train_loss = cross_entropy_loss(pred)
        train_acc = accuracy(pred)
        valid_acc = valid_accuracy()

        model_checkpoint(best_val_acc, valid_acc)
        # Keep the running best here; same ">=" rule as model_checkpoint.
        if valid_acc >= best_val_acc:
            best_val_acc = valid_acc

        print("Iteration {:<4}: Train Loss: {:.2f}, Train Accuracy: {:.2f}, Valid Accuracy: {:.2f}".format(i+1, train_loss, train_acc, valid_acc))
        print("-"*80)
        

In [27]:
# Accuracy of the current global weights on the test split
def evaluate():
    """Fraction of test samples for which ``predict`` returns the target class.

    Reads the global ``X_test`` (one sample per row) and the one-hot
    ``y_test``; every row is classified with a separate ``predict`` call.
    """
    n_samples = X_test.shape[0]
    correct = sum(
        predict(X_test[row]) == np.argmax(y_test[row])
        for row in range(n_samples)
    )
    return correct / len(X_test)

In [28]:
# Training hyper-parameters handed to start_gradient_descent().
# One "epoch" here is one full-batch gradient step: feed_forward() propagates
# all of X_train in every iteration.
NUMBER_OF_EPOCHS = 100
LEARNING_RATE = 0.1

In [29]:
# Train the network with full-batch gradient descent
start_gradient_descent(
    number_of_iterations=NUMBER_OF_EPOCHS,
    learning_rate=LEARNING_RATE,
)

--------------------------------------------------------------------------------


Iteration 1   : Train Loss: 19.82, Train Accuracy: 0.11, Valid Accuracy: 0.17
--------------------------------------------------------------------------------
Iteration 2   : Train Loss: 18.06, Train Accuracy: 0.16, Valid Accuracy: 0.22
--------------------------------------------------------------------------------
Iteration 3   : Train Loss: 16.42, Train Accuracy: 0.22, Valid Accuracy: 0.26
--------------------------------------------------------------------------------
Iteration 4   : Train Loss: 16.21, Train Accuracy: 0.26, Valid Accuracy: 0.25
--------------------------------------------------------------------------------
Iteration 5   : Train Loss: 16.13, Train Accuracy: 0.26, Valid Accuracy: 0.39
--------------------------------------------------------------------------------
Iteration 6   : Train Loss: 12.99, Train Accuracy: 0.38, Valid Accuracy: 0.49
--------------------------------------------------------------------------------
Iteration 7   : Train Loss: 10.79, Train Accur

In [30]:
# Restore the parameters last written to disk by model_checkpoint()
path = "weights/"
W1, b1, W2, b2 = (
    np.load(path + file_name)
    for file_name in ("W1.npy", "b1.npy", "W2.npy", "b2.npy")
)

In [33]:
# Evaluate the current global weights on the test set and print the accuracy
# rounded to two decimals.
test_acc = evaluate()
print('Test Accuracy: {:.2f}'.format(test_acc))

Test Accuracy: 0.87


In [41]:
# Side-by-side view of targets and predictions for the first test samples
def compare(idx):
    """Print targets, predictions and a T/F flag for the first ``idx`` test samples.

    ``idx`` is the number of samples taken from the start of the global
    ``X_test``/``y_test``; nothing is returned.
    """
    sample_range = range(idx)
    preds = [predict(X_test[i]) for i in sample_range]
    targets = [np.argmax(y_test[i]) for i in sample_range]
    results = ["T" if guess == truth else "F" for guess, truth in zip(preds, targets)]

    print("Targets: ", targets)
    print("Preds:   ", preds)
    print("\nResults: ", results)

# Show targets, predictions and T/F flags for the first 20 test samples
compare(20)

Targets:  [7, 2, 1, 0, 4, 1, 4, 9, 5, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4]
Preds:    [7, 2, 1, 0, 9, 1, 4, 8, 6, 9, 0, 6, 9, 0, 1, 5, 9, 7, 3, 4]

Results:  ['T', 'T', 'T', 'T', 'F', 'T', 'T', 'F', 'F', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T']
