In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
data = pd.read_csv('D:/datasets/train.csv')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = np.array(data)
m, n = data.shape
print(n)
np.random.shuffle(data) 

data_dev = data[0:1000].T
Y_dev = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255.

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255.
_,m_train = X_train.shape
Y_train

785


array([1, 8, 3, ..., 8, 1, 3], dtype=int64)

In [3]:
def init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

In [4]:
def ReLU(Z):
    return np.maximum(Z, 0)

In [5]:
def softmax(Z):
    Z = Z.astype(np.float32)
    A = np.exp(Z) / np.sum(np.exp(Z), axis=0)
    A = A.astype(np.float32)
    return A

In [6]:
def forward_prop(W1, b1, W2, b2, X):
    n=8
    # Quantize weights 
    W1_q = quantize(W1, n)  
    W2_q = quantize(W2, n)
    
    # Forward pass
    Z1 = W1_q.dot(X) + b1
    A1 = ReLU(Z1)
    
    # Quantize activations
    A1_q = quantize(A1, n)   
    
    Z2 = W2_q.dot(A1_q) + b2
    A2 = softmax(Z2)
    
    return Z1, A1, Z2, A2

In [7]:
def ReLU_deriv(Z):
    return Z > 0

In [8]:
def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

In [9]:
def backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    n=8
    one_hot_Y = one_hot(Y)
    
    # Calculate gradients
    dZ2 = A2 - one_hot_Y 
    dW2 = 1/m * dZ2.dot(A1.T)
    db2 = 1/m * np.sum(dZ2)
    
    dZ1 = W2.T.dot(dZ2) * ReLU_deriv(Z1) 
    dW1 = 1/m * dZ1.dot(X.T)
    db1 = 1/m * np.sum(dZ1)

    # Quantize gradients 
    dW1_q = quantize(dW1, n)
    dW2_q = quantize(dW2, n)
    db1_q = quantize(db1, n)
    db2_q = quantize(db2, n)
    
    return dW1_q, db1_q, dW2_q, db2_q

In [10]:
def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    n=8
    # Quantize gradients
    dW1_q = quantize(dW1, n)
    db1_q = quantize(db1, n)
    dW2_q = quantize(dW2, n)
    db2_q = quantize(db2, n)

    # Update weights  
    W1 = W1 - alpha * dW1_q 
    b1 = b1 - alpha * db1_q
    W2 = W2 - alpha * dW2_q
    b2 = b2 - alpha * db2_q
    
    return W1, b1, W2, b2

In [11]:
def get_predictions(A2):
    return np.argmax(A2, 0)

In [12]:
def get_accuracy(predictions, Y):
    #print(predictions, Y)
    return (np.sum(predictions == Y) / Y.size)*100

In [13]:
def gradient_descent(X, Y, alpha, iterations):
    W1, b1, W2, b2 = init_params()
    for i in range(iterations):
        with tf.device('/device:GPU:0'):
            Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
            dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
            W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
            loss = compute_loss(A2, Y)
            print("Loss: ", loss)
    return W1, b1, W2, b2

In [14]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

In [15]:
def test_prediction(index, W1, b1, W2, b2):
    current_image = X_train[:, index, None]
    prediction = make_predictions(X_train[:, index, None], W1, b1, W2, b2)
    label = Y_train[index]
    print("Prediction: ", prediction)
    print("Label: ", label)
    
    current_image = current_image.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(current_image, interpolation='nearest')
    plt.show()

In [16]:
def quantize(vals, n):
    scale = 2**n - 1
    q_vals = np.clip(vals * scale, -scale, scale)
    q_vals = np.fix(q_vals) / scale
    return q_vals

In [17]:
def compute_loss(A2, Y):
    one_hot_Y = one_hot(Y)
    p = A2
    gamma = 2  # Focal loss hyperparameter
    loss = -(1 - p) ** gamma * np.log(p)
    return np.mean(loss)

In [None]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 0.01, 1500)

Iteration:  0
8.039024390243902
Loss:  2.0579886
Iteration:  10
7.729268292682927
Loss:  2.0271077
Iteration:  20
7.785365853658536
Loss:  2.005813
Iteration:  30
7.892682926829268
Loss:  1.9908073
Iteration:  40
8.097560975609756
Loss:  1.9794334
Iteration:  50
8.624390243902438
Loss:  1.9708602
Iteration:  60
9.13170731707317
Loss:  1.9647229
Iteration:  70
9.770731707317074
Loss:  1.9595737
Iteration:  80
11.324390243902439
Loss:  1.9557121
Iteration:  90
12.317073170731707
Loss:  1.9524688
Iteration:  100
13.287804878048782
Loss:  1.9493846
Iteration:  110
14.51951219512195
Loss:  1.9478722
Iteration:  120
15.646341463414634
Loss:  1.9460487
Iteration:  130
16.67317073170732
Loss:  1.9448724
Iteration:  140
17.568292682926828
Loss:  1.9438108
Iteration:  150
18.485365853658536
Loss:  1.943516
Iteration:  160
19.385365853658538
Loss:  1.9429221
Iteration:  170
20.18536585365854
Loss:  1.9431759
Iteration:  180
20.93170731707317
Loss:  1.9434268
Iteration:  190
21.702439024390245
Los

In [None]:
test_prediction(0, W1, b1, W2, b2)
test_prediction(1, W1, b1, W2, b2)
test_prediction(2, W1, b1, W2, b2)
test_prediction(3, W1, b1, W2, b2)

In [None]:
dev_predictions = make_predictions(X_dev, W1, b1, W2, b2)

In [None]:
print("Accuracy:",get_accuracy(dev_predictions, Y_dev))

In [None]:
# Calculate the total number of parameters
total_params = (W1.size + b1.size + W2.size + b2.size)

# Print the size of the model
print(f"The size of the model (total number of trainable parameters) is: {total_params}")