# Neural Network for Handwritten Digit Recognition on the MNIST Dataset

Tutorial: https://www.geeksforgeeks.org/handwritten-digit-recognition-using-neural-network/

In [2]:
from scipy.io import loadmat
import numpy as np
from scipy.optimize import minimize
from tkinter import *
from PIL import ImageGrab

In [3]:
# Read the MNIST .mat file; loadmat returns a dict keyed by variable name
data = loadmat('../data/mnist-original.mat')
print("Keys: ", data.keys())
print("Dataset shape: ", data['data'].shape)

# Labels: flatten to a 1-D vector with one entry per sample
y = data['label'].flatten()

# Features: transpose so that each row is one sample, then divide by 255
# to normalise (assumes 8-bit pixel intensities in 0..255)
X = data['data'].T / 255

Keys:  dict_keys(['__header__', '__version__', '__globals__', 'mldata_descr_ordering', 'data', 'label'])
Dataset shape:  (784, 70000)


In [4]:
# The first 60,000 samples form the training set and the remaining 10,000 the test set
# (capital names are matrices, lower-case names are vectors)
X_train, X_test = X[:60000, :], X[60000:, :]
y_train, y_test = y[:60000], y[60000:]

print("Training size: ", X_train.shape, y_train.shape)
print("Testing size: ", X_test.shape, y_test.shape)

Training size:  (60000, 784) (60000,)
Testing size:  (10000, 784) (10000,)


In [5]:
# Network architecture: 28x28-pixel images give 784 input features
input_layer_size = 28 * 28
hidden_layer_size = 100

# One output unit per digit class (0-9)
num_labels = 10

# Number of rows (samples) in the full dataset (70,000)
m = X.shape[0]
print("Rows: ", m)

Rows:  70000


In [6]:
def initialise(a, b, epsilon):
    """Return an ``a x (b + 1)`` weight matrix drawn uniformly from [-epsilon, epsilon).

    The extra column (``b + 1``) holds the weights of the bias unit.
    Values come from NumPy's global random state via ``np.random.rand``.
    """
    width = 2 * epsilon
    unit_samples = np.random.rand(a, b + 1)    # uniform samples in [0, 1)

    # Stretch to the full width of the interval, then shift it to be centred on zero
    return (unit_samples * width) - epsilon

# Seed NumPy's global random state (used by initialise() through np.random.rand)
# so that the initial weights, and therefore the training run, are reproducible
# after a kernel restart.
np.random.seed(42)

# Half-width of the interval the initial weights are drawn from
epsilon = 0.15
initial_Theta1 = initialise(hidden_layer_size, input_layer_size, epsilon)
initial_Theta2 = initialise(num_labels, hidden_layer_size, epsilon)
print("Theta1:\n ", initial_Theta1, "\n", initial_Theta1.shape)
print("\nTheta2:\n ", initial_Theta2, "\n", initial_Theta2.shape)

Theta1:
  [[-0.13896074 -0.11880193 -0.07574241 ...  0.05831591  0.05111481
  -0.00017887]
 [ 0.14801023  0.09427621  0.00686578 ... -0.05123832  0.14674092
  -0.00602045]
 [ 0.11397638  0.12878197 -0.09460539 ... -0.08737971 -0.12949902
   0.0470588 ]
 ...
 [ 0.08524591 -0.11647716  0.04382956 ...  0.08680118 -0.07553966
  -0.13953899]
 [-0.06415511 -0.14573925 -0.00343968 ... -0.0363637  -0.10829765
  -0.04152082]
 [ 0.09602654 -0.1256353   0.11411442 ... -0.12284823  0.06632072
  -0.13456616]] 
 (100, 785)

Theta2:
  [[-0.09256467  0.09522605  0.14777629 ...  0.11935187  0.14333026
   0.04195252]
 [ 0.0033565   0.06042257 -0.04056208 ... -0.08786528  0.13790553
   0.13560795]
 [-0.01541967  0.01569668  0.08529858 ...  0.04879656  0.08209945
   0.01176578]
 ...
 [ 0.06465887 -0.11115589 -0.14515513 ...  0.11617388  0.01941106
  -0.12468471]
 [ 0.0867371   0.10526089 -0.13367968 ...  0.07611201  0.13117422
   0.08930042]
 [-0.05920509  0.07053332  0.00303719 ... -0.03696197  0.0107062

In [7]:
# Hyperparameters: iteration cap for the optimiser and regularisation strength
# (regularisation helps to prevent overfitting)
maxiter = 100
lambda_reg = 0.1

# The optimiser works on a single flat parameter vector, so unroll both weight
# matrices row by row and join them: Theta1 first, then Theta2
initial_nn_params = np.concatenate([initial_Theta1.ravel(), initial_Theta2.ravel()])

# Extra positional arguments passed to the cost function after the parameter vector
myargs = (input_layer_size, hidden_layer_size, num_labels, X_train, y_train, lambda_reg)
print(myargs)

(784, 100, 10, array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([0., 0., 0., ..., 9., 9., 9.]), 0.1)


In [8]:
# Create the neural network model
def neural_network(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lamb):
    """Regularised cost and gradient of a network with one sigmoid hidden layer.

    Parameters
    ----------
    nn_params : 1-D array
        Unrolled weights: Theta1 (hidden_layer_size x (input_layer_size + 1))
        flattened row by row, followed by Theta2
        (num_labels x (hidden_layer_size + 1)). This array is NOT modified.
    input_layer_size, hidden_layer_size, num_labels : int
        Number of units in the input, hidden and output layers (bias units excluded).
    X : 2-D array
        One sample per row, ``input_layer_size`` columns.
    y : 1-D array
        Class index of each sample (converted with ``int``), in ``[0, num_labels)``.
    lamb : float
        Regularisation strength; bias weights (column 0) are not regularised.

    Returns
    -------
    J : float
        Cross-entropy cost plus the L2 regularisation term.
    grad : 1-D array
        Gradient of ``J`` with respect to ``nn_params``, in the same unrolled layout.
    """
    # Split weights back into Theta1 and Theta2.
    # np.reshape on a slice can return a VIEW onto nn_params, so these matrices
    # must be treated as read-only; writing to them would corrupt the caller's vector.
    split = hidden_layer_size * (input_layer_size + 1)
    Theta1 = np.reshape(nn_params[:split], (hidden_layer_size, input_layer_size + 1))
    Theta2 = np.reshape(nn_params[split:], (num_labels, hidden_layer_size + 1))

    # FORWARD PROPAGATION
    m = X.shape[0]
    ones = np.ones((m, 1))              # Bias column, reused for both layers
    a1 = np.append(ones, X, axis=1)     # Input layer with bias unit
    z2 = np.dot(a1, Theta1.T)           # Hidden layer
    a2 = 1 / (1 + np.exp(-z2))          # Activation for hidden (second) layer
    a2 = np.append(ones, a2, axis=1)    # Add bias unit to hidden layer
    z3 = np.dot(a2, Theta2.T)           # Output layer
    a3 = 1 / (1 + np.exp(-z3))          # Activation for output (third) layer

    # One-hot encode the labels: row i has a single 1 in column y[i]
    y_vect = np.zeros((m, num_labels))
    y_vect[np.arange(m), np.asarray(y).astype(int)] = 1

    # Cost: mean cross-entropy plus L2 penalty on the non-bias weights
    cross_entropy = np.sum(-y_vect * np.log(a3) - (1 - y_vect) * np.log(1 - a3))
    penalty = np.sum(Theta1[:, 1:] ** 2) + np.sum(Theta2[:, 1:] ** 2)
    J = (1 / m) * cross_entropy + (lamb / (2 * m)) * penalty

    # BACK PROPAGATION
    Delta3 = a3 - y_vect
    # The bias column of a2 is all ones, so its a2 * (1 - a2) factor is zero;
    # that column is dropped because the bias unit has no incoming weights.
    Delta2 = (np.dot(Delta3, Theta2) * a2 * (1 - a2))[:, 1:]

    # Regularisation terms are built on COPIES with the bias column zeroed,
    # so that nn_params is left untouched.
    Theta1_reg = Theta1.copy()
    Theta1_reg[:, 0] = 0
    Theta2_reg = Theta2.copy()
    Theta2_reg[:, 0] = 0

    # Calculate gradients
    Theta1_grad = (1 / m) * np.dot(Delta2.T, a1) + (lamb / m) * Theta1_reg
    Theta2_grad = (1 / m) * np.dot(Delta3.T, a2) + (lamb / m) * Theta2_reg
    grad = np.concatenate((Theta1_grad.flatten(), Theta2_grad.flatten()))

    return J, grad


# Train the network by minimising the cost with L-BFGS-B.
# jac=True tells the optimiser that neural_network returns (cost, gradient) together.
optimiser_options = {'disp': True, 'maxiter': maxiter}
results = minimize(
    neural_network,
    x0=initial_nn_params,
    args=myargs,
    method='L-BFGS-B',
    jac=True,
    options=optimiser_options,
)
print(results)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =        79510     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.40760D+00    |proj g|=  6.49343D-01


 This problem is unconstrained.



At iterate    1    f=  3.48849D+00    |proj g|=  8.43750D-02

At iterate    2    f=  3.31387D+00    |proj g|=  3.72599D-02

At iterate    3    f=  3.25606D+00    |proj g|=  2.35497D-02

At iterate    4    f=  3.21630D+00    |proj g|=  2.75261D-02

At iterate    5    f=  3.09271D+00    |proj g|=  3.75353D-02

At iterate    6    f=  2.79712D+00    |proj g|=  5.92387D-02

At iterate    7    f=  2.09104D+00    |proj g|=  7.27950D-02

At iterate    8    f=  1.97742D+00    |proj g|=  1.21491D-01

At iterate    9    f=  1.39721D+00    |proj g|=  3.09159D-02

At iterate   10    f=  1.33863D+00    |proj g|=  1.89036D-02

At iterate   11    f=  1.28466D+00    |proj g|=  2.25966D-02

At iterate   12    f=  1.18983D+00    |proj g|=  3.57352D-02

At iterate   13    f=  1.04909D+00    |proj g|=  4.44491D-02

At iterate   14    f=  9.87449D-01    |proj g|=  2.78562D-02

At iterate   15    f=  9.28647D-01    |proj g|=  1.37513D-02

At iterate   16    f=  8.89384D-01    |proj g|=  1.56977D-02

At iter

In [9]:
# The optimiser's solution vector holds the trained (unrolled) weights
nn_params = results.x
print("Optimised weights: \n", nn_params, "\n\n", nn_params.shape)

Optimised weights: 
 [-0.9468377  -0.11525239 -0.0734794  ... -3.33489795 -0.28057966
  0.99891378] 

 (79510,)


#### Why 79,510?
`Theta1`
-   hidden_layer_size * (input_layer_size + 1)
-   100 * (784 + 1)
-   78,500

`Theta2`
-   num_labels * (hidden_layer_size + 1)
-   10 * (100 + 1)
-   1,010

`TOTAL`
-   79,510

In [10]:
# Theta1 occupies the first hidden_layer_size * (input_layer_size + 1) entries of the
# unrolled vector and Theta2 the rest; reshape them to (100 x 785) and (10 x 101)
_theta1_size = hidden_layer_size * (input_layer_size + 1)
Theta1 = nn_params[:_theta1_size].reshape(hidden_layer_size, input_layer_size + 1)
Theta2 = nn_params[_theta1_size:].reshape(num_labels, hidden_layer_size + 1)
print(Theta1.shape, Theta2.shape)

(100, 785) (10, 101)


In [11]:
# Performs forward propagation to predict the label (digit) of an input image
def predict(Theta1, Theta2, X):
    """Forward-propagate ``X`` through the network and return the predicted class per row.

    ``Theta1`` and ``Theta2`` are the weight matrices of the hidden and output
    layers; column 0 of each multiplies the bias unit. ``X`` holds one sample per
    row. The result is a 1-D array with the index of the largest output activation
    for each sample.
    """
    def _sigmoid(z):
        # Logistic activation
        return 1 / (1 + np.exp(-z))

    def _with_bias(layer):
        # Prepend a column of ones (the bias unit) to a layer's activations
        return np.hstack((np.ones((layer.shape[0], 1)), layer))

    hidden = _sigmoid(np.dot(_with_bias(X), Theta1.T))         # Hidden (second) layer
    output = _sigmoid(np.dot(_with_bias(hidden), Theta2.T))    # Output (third) layer

    # The class with the largest hypothesis value wins
    return np.argmax(output, axis=1)


# Accuracy (percentage of correct predictions) on the held-out test set
pred = predict(Theta1, Theta2, X_test)
test_accuracy = 100 * np.mean(pred == y_test)
print("Test set accuracy: ", test_accuracy)

# Accuracy on the training set; `pred` is left holding the training predictions
pred = predict(Theta1, Theta2, X_train)
train_accuracy = 100 * np.mean(pred == y_train)
print("Train set accuracy: ", train_accuracy)

Test set accuracy:  97.52
Train set accuracy:  99.345


In [12]:
# Evaluate micro-averaged precision of the model on the training set.
# Recompute the training predictions here instead of relying on whatever `pred`
# happened to hold after the previous cell, so this cell gives the same answer
# regardless of execution order.
train_pred = predict(Theta1, Theta2, X_train)

# A prediction that matches its label is a true positive for that class
true_positive = int(np.sum(train_pred == y_train))

# Every misclassified sample is a false positive for the class that was predicted,
# so for single-label classification micro-averaged precision equals accuracy
false_positive = len(y_train) - true_positive
print('Precision =', true_positive/(true_positive + false_positive))

Precision = 0.99345


In [13]:
# Persist the trained weights as space-delimited text: Theta1 is (100 x 785), Theta2 is (10 x 101)
for _filename, _weights in (('Theta1.txt', Theta1), ('Theta2.txt', Theta2)):
    np.savetxt(_filename, _weights, delimiter=' ')

In [15]:
# Create the main application window
window = Tk() 
window.title("Handwritten digit recognition") 
# Placeholder result label, so that l1.destroy() in clear_widget() has a widget to
# act on before the first prediction has been made
l1 = Label() 


def MyProject():
	"""Capture the drawing on the canvas, classify it and display the predicted digit.

	Takes a screenshot, crops it to the canvas area, shrinks it to 28 x 28 grayscale,
	feeds the normalised pixels to predict() with the weights stored in
	'Theta1.txt' / 'Theta2.txt', and shows the result in the global label ``l1``.
	"""
	global l1

	widget = cv
	# Screen co-ordinates of the canvas bounding box
	left = window.winfo_rootx() + widget.winfo_x()
	top = window.winfo_rooty() + widget.winfo_y()
	right = left + widget.winfo_width()
	bottom = top + widget.winfo_height()

	# Image is captured from canvas and is resized to (28 X 28) px
	img = ImageGrab.grab().crop((left, top, right, bottom)).resize((28, 28))

	# Converting rgb to grayscale image
	img = img.convert('L')

	# Flatten the 28 x 28 pixel matrix row by row into a (1, 784) float vector
	pixels = np.asarray(img, dtype=float)
	vec = pixels.reshape(1, 784)

	# Loading Thetas
	Theta1 = np.loadtxt('Theta1.txt')
	Theta2 = np.loadtxt('Theta2.txt')

	# Calling function for prediction (pixels scaled by 1/255, as the training data was)
	pred = predict(Theta1, Theta2, vec / 255)

	# Remove the previous result label first; otherwise every prediction stacks a new
	# label on top of the old ones and clear_widget() can only remove the newest one
	l1.destroy()

	# Displaying the result
	l1 = Label(window, text="Digit = " + str(pred[0]), font=('Algerian', 20))
	l1.place(x=230, y=420)


# Last pointer position seen while drawing; unset until the first click on the canvas
lastx = None
lasty = None


# Clears the canvas 
def clear_widget():
	"""Erase everything drawn on the canvas and remove the current result label."""
	# cv and l1 are only read here, never rebound, so no `global` statement is needed
	cv.delete("all")
	l1.destroy()


# Activate canvas 
def event_activation(event):
	"""Start a stroke: remember where the button went down and enable drag-drawing."""
	global lastx, lasty
	lastx = event.x
	lasty = event.y
	# From now on, dragging with button 1 held down draws line segments
	cv.bind('<B1-Motion>', draw_lines)


# To draw on canvas 
def draw_lines(event):
	"""Draw a thick white segment from the last pointer position to the current one."""
	global lastx, lasty
	start = (lastx, lasty)
	end = (event.x, event.y)
	cv.create_line(start + end, width=30, fill='white', capstyle=ROUND, smooth=TRUE, splinesteps=12)
	# The current position becomes the start of the next segment
	lastx, lasty = end


# Heading label (spelling of "Recognition" corrected in the displayed text)
L1 = Label(window, text="Handwritten Digit Recognition", font=('Algerian', 25), fg="blue")
L1.place(x=35, y=10)

# Button to clear canvas
b1 = Button(window, text="1. Clear Canvas", font=('Algerian', 15), bg="orange", fg="black", command=clear_widget)
b1.place(x=120, y=370)

# Button to predict digit drawn on canvas
b2 = Button(window, text="2. Prediction", font=('Algerian', 15), bg="white", fg="red", command=MyProject)
b2.place(x=320, y=370)

# Setting properties of canvas: black background, drawn on in white by draw_lines()
cv = Canvas(window, width=350, height=290, bg='black')
cv.place(x=120, y=70)

# A click on the canvas starts a stroke; event_activation() then binds the drag handler
cv.bind('<Button-1>', event_activation)
window.geometry("600x500")

# Blocks this cell until the window is closed
window.mainloop()


: 