In [1]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

In [2]:
# Defining our basic neural netowrk layers

# Linear Layer
class MyLinear():
    def __init__(self, in_features, out_features):
        # Initialization of the parameters (Weights and Bias)
        self.W = np.random.randn(*(in_features,out_features)) / out_features
        self.b = np.random.randn(*(out_features,)) / out_features
    def __call__(self,x):
        return self.forward(x)
    def forward(self, x):
        # Caching the input
        self.x = x
        # Calculating the output 
        # o = x W + b
        self.o = x @ self.W + self.b
        return self.o
    def backward(self, grad_L):
        # Calculating the gradient of the loss with respect to each parameter and the input
        # Using the gradient of the loss with respect to the output as input
        # grad_L_W = grad_L_O grad_O_W
        self.grad_W = self.x.T @ grad_L  
        # grad_L_b = grad_L_O grad_O_b
        self.grad_b = np.sum(grad_L,axis=0)
        # grad_L_x = grad_L_O grad_O_x
        grad_x = grad_L @ self.W.T
        return grad_x
    def step(self,optim):
        # Optimizing the parameters using the gradients
        self.W = optim.step(self.W, self.grad_W)
        self.b = optim.step(self.b, self.grad_b)

# Sigmoid Layer (activation function)
class MySigmoid():
    def __init__(self):
        pass
    def __call__(self,x):
        return self.forward(x)
    def forward(self, x):
        self.o = 1 / (1 + np.exp(-x))
        return self.o
    def backward(self, grad_L):
        return (self.o) * (1 - self.o) * grad_L
    def step(self,optim):
        pass

# Tanh Layer (activation function)
class MyTanh():
    def __init__(self):
        pass
    def __call__(self,x):
        return self.forward(x)
    def forward(self, x):
        self.o = (1 - np.exp(-x)) / (1 + np.exp(-x))
        return self.o
    def backward(self, grad_L):
        return (1 - self.o** 2) / 2  * grad_L
    def step(self,optim):
        pass

# Sequential model
class MySequential():
    def __init__(self,layers=[]):
        self.layers = layers
    def __call__(self,x):
        return self.forward(x)
    def forward(self, x):
        o = x
        for l in self.layers:
            o = l.forward(o)
        return o
    def backward(self, grad_L):
        for l in reversed(self.layers):
            grad_L = l.backward(grad_L)
    def step(self, optim):
        for l in self.layers:
            l.step(optim)

In [3]:
# Optimizer SGD (Stochastic Gradient Descent)
class MySGD():
    def __init__(self, lr=0.1):
        self.lr = lr
    def step(self, w, grad_w):
        return w - self.lr * grad_w

In [4]:
# MSE Loss function (Mean Square Erro Loss)
class MyMSELoss():
    def __init__(self):
        pass
    def __call__(self,o,t):
        N = len(o)
        loss = np.sum((o - t) ** 2 / 2) / N
        grad_loss = (o - t) / N
        return loss, grad_loss

In [5]:
class MyDataset():
    def __init__(self,x,y,batch_size=10):
        self.x = x
        self.y = y
        self.batch_size = batch_size
        N = len(self.x)
        self.my_size = int(np.ceil(N / batch_size))
    def __len__(self):
        return self.my_size
    def __getitem__(self, idx):
        return self.x[idx * self.batch_size:(idx + 1) * self.batch_size], self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

In [6]:
# Simple Sample: XOR 

# Data
train_x = np.array([[0,0],[0,1],[1,0],[1,1]])
train_y = np.array([[1,0],[0,1],[0,1],[1,0]])

train_ds = MyDataset(train_x, train_y,batch_size=4)

In [7]:
# Defining model, optimizer and loss function
model = MySequential(layers=[
    MyLinear(2,3),
    MyTanh(),
    MyLinear(3,2)
])

optim = MySGD(lr=0.5)
crit = MyMSELoss()

In [8]:
# Training for 10 epochs
for epoch in range(10):
    avg_l = 0
    for i in range(len(train_ds)):
        x,y = train_ds[i]
        o = model(x)
        l,g_l = crit(o, y)
        model.backward(g_l)
        model.step(optim)
        avg_l += l
    print(avg_l / len(train_ds))

0.4201442619764585
0.2641425523771533
0.2574638648335259
0.2565894122810018
0.25606912752716277
0.25561925631405413
0.2552118132372963
0.2548401043627225
0.25450014375255103
0.2541886578768792


In [9]:
np.argmax(model(train_x),axis=1)

array([0, 1, 0, 1])

In [10]:
model(train_x)

array([[0.5007766 , 0.38772255],
       [0.43674275, 0.52615266],
       [0.56698474, 0.46623191],
       [0.50254359, 0.60212222]])

In [11]:
# Simple Sample: digits

# Data
raw_data = load_digits()

data_images = raw_data["images"].reshape((-1,8 * 8)) / 15.0
data_target = raw_data["target"]

train_x, test_x, train_y, test_y = train_test_split(data_images, data_target,train_size=0.8)

n = len(train_x)
train_y_proc = np.zeros((n,10))
train_y_proc[np.arange(n),train_y] = 1

train_ds = MyDataset(train_x,train_y_proc,batch_size = 100)

In [12]:
# Defining model, optimizer and loss function
model = MySequential(layers=[
    MyLinear(64,32),
    MyTanh(),
    MyLinear(32,16),
    MyTanh(),
    MyLinear(16,10),
])

optim = MySGD(lr=0.5)
crit = MyMSELoss()

In [30]:
# Training for 10 epochs
for epoch in range(10):
    avg_loss = 0
    for i in range(len(train_ds)):
        x,y = train_ds[i]
        o = model(x)
        l,g_l = crit(o, y)
        model.backward(g_l)
        model.step(optim)
        avg_loss += l
    print(avg_loss / len(train_ds))

0.16193891196854265
0.16190632873003202
0.16218198148882793
0.16018446684287496
0.15437777542960485
0.15100263030624134
0.16194737561185132
0.1509100801948943
0.15453346750119673
0.15718901115207567


In [18]:
np.argmax(model(train_x[:20]),axis=1)

array([2, 5, 3, 2, 9, 0, 6, 8, 0, 3, 6, 4, 1, 6, 3, 8, 2, 7, 5, 3])

In [19]:
train_y[:20]

array([3, 5, 3, 2, 9, 0, 6, 8, 0, 3, 6, 4, 1, 6, 3, 8, 3, 7, 5, 3])

In [31]:
np.sum(np.argmax(model(train_x),axis=1) == train_y)

1328