In [1]:
import pandas as pd
import numpy as np
from scipy.special import softmax
np.random.seed(1)

In [2]:
train_data = pd.read_csv('toy_dataset/toy_dataset_train.csv',header=None)
X_train = train_data.iloc[:, 1:].values
X_train = X_train.astype(np.float)
X_train = X_train / 255
y_train = train_data.iloc[:, 0].values
y_true= np.array(pd.get_dummies(y_train))

In [3]:
class Activation_function():
    def __init__(self, activation_function):
        self.name = activation_function
        self.function = self.activation_function()
        self.derivative = self.activation_function_derivative()

    def activation_function(self):
        if self.name == 'sigmoid':
            return lambda x: 1 / (1 + np.exp(-x))
        elif self.name== 'relu':
            return lambda x: np.maximum(0, x)
        elif self.name == 'tanh':
            return lambda x: np.tanh(x)
        elif self.name == 'softmax':
            return lambda x: np.exp(x - np.max(x)) / np.sum(np.exp(x- np.max(x)),axis=0)

    def activation_function_derivative(self):
        if self.name == 'sigmoid':
            return lambda x: x * (1 - x)
            # return lambda x: self.activation_function(x) * (1 - self.activation_function(x))
        elif self.name == 'relu':
            return lambda x: 1. * (x > 0)
        elif self.name == 'tanh':
            return lambda x: 1 - np.power(x, 2)
        elif self.name == 'softmax':
            # return lambda x: softmax(x,axis = 0)*(1-softmax(x,axis = 0))
            return lambda x: x*(1-x)
        
        

def loss_function_CE(y_true, y_pred, derivative):
    if not derivative:
        return -np.sum(y_true * np.log(y_pred))/y_true.shape[1]
    else:
        return (y_pred-y_true)/y_true.shape[1]


def loss_function_MSE(y_true, y_pred, derivative=False):
    if not derivative:
        return np.sum((y_true - y_pred) ** 2) / 2
    else:
        print(y_pred.shape, y_true.shape)
        temp = np.sum(2*(y_pred - y_true)*y_pred,axis = 0)
        f = 2*(y_pred - y_true)*y_pred*(1-y_pred) - y_pred*temp
        print(temp.shape, f.shape)
        return f/y_true.shape[1]

In [4]:
a = np.arange(8).reshape(2,4)
b = np.arange(8).reshape(2,4)*0.2

In [5]:
c = np.sum(2*(b - a)*b,axis = 0)

In [6]:
b

array([[0. , 0.2, 0.4, 0.6],
       [0.8, 1. , 1.2, 1.4]])

In [7]:
c

array([ -5.12,  -8.32, -12.8 , -18.56])

In [8]:
b*c

array([[ -0.   ,  -1.664,  -5.12 , -11.136],
       [ -4.096,  -8.32 , -15.36 , -25.984]])

In [9]:
x = np.ones(y_true.T.shape)

In [10]:
loss_function_MSE(y_true.T, x,derivative=True)

(2, 3000) (2, 3000)
(3000,) (2, 3000)


array([[-0.00066667, -0.00066667, -0.00066667, ..., -0.00066667,
        -0.00066667, -0.00066667],
       [-0.00066667, -0.00066667, -0.00066667, ..., -0.00066667,
        -0.00066667, -0.00066667]])

In [11]:
class Neural_Network():

    def __init__(
        self, input_size, hidden_layer_size_array, output_size, 
        activation_function, output_activation_function, loss_function
    ):
        self.input_size = input_size
        self.number_of_hidden_layers = len(hidden_layer_size_array)
        self.hidden_layer_size_array = hidden_layer_size_array
        self.output_size = output_size
        self.activation_function = Activation_function(activation_function)
        self.output_activation_function = Activation_function(output_activation_function)
        self.weights = self.weights_initializer()
        self.loss_function = loss_function
        self.weights_history = []

    def weights_initializer(self):
        weights = []
        for i in range(self.number_of_hidden_layers+1):
            if i == 0:
                weights.append(
                    np.random.normal(size = (self.input_size + 1, self.hidden_layer_size_array[i]))
                    * np.sqrt(2 / (self.input_size + self.hidden_layer_size_array[i] + 1))
                    .astype(np.float32)
                )
            elif i == self.number_of_hidden_layers:
                weights.append(
                    np.random.normal(size = (self.hidden_layer_size_array[i - 1] + 1, self.output_size)) 
                    * np.sqrt(2 / (self.hidden_layer_size_array[i - 1] + self.output_size + 1))
                    .astype(np.float32)
                )
            else:
                weights.append(
                    np.random.normal(size = (self.hidden_layer_size_array[i - 1] + 1, self.hidden_layer_size_array[i])) 
                    * np.sqrt(2 / (self.hidden_layer_size_array[i - 1] + self.hidden_layer_size_array[i] + 1))
                    .astype(np.float32)
                )
        return weights
    
    def feed_forward(self, input_data):
        # Feed forward
        # Input layer
        input_layer = np.array(input_data, ndmin=2).T
        
        # Hidden layers
        a_s = []
        z_s = []
        for i in range(self.number_of_hidden_layers):
            input_layer = np.concatenate((np.ones((1, input_layer.shape[1])), input_layer), axis=0)
            z = np.dot(self.weights[i].T, input_layer)
            z_s.append(z)
            a_s_i = self.activation_function.function(z)
            a_s.append(a_s_i)
            input_layer = a_s[i]
            a_s[i] = np.concatenate((np.ones((1, a_s[i].shape[1])), a_s[i]), axis=0)
        
        # Output layer
        input_layer = np.concatenate((np.ones((1, input_layer.shape[1])), input_layer), axis=0)
        z = np.dot(self.weights[-1].T, input_layer)
        z_s.append(z)
        a_s.append(self.output_activation_function.function(z))
        return a_s[-1], a_s, z_s

    def back_propagation2(self, input_data, y_true):
        input_data = np.array(input_data, ndmin=2)
        y_pred, a_s, z_s = self.feed_forward(input_data)

        # Output layer
        y_true = np.array(y_true, ndmin=2).T
    
        # delL_dzs = [(y_pred-y_true)/y_true.shape[1]]
        delL_dzs = [self.loss_function(y_true, y_pred, derivative=True)]
        delL_das = [np.dot(self.weights[-1], delL_dzs[-1])]

        # Hidden layers
        for i in range(self.number_of_hidden_layers-1, -1, -1):
            delL_dz_i = delL_das[-1]*self.activation_function.derivative(a_s[i])
            delL_dzs.append(delL_dz_i[1:])
            delL_das.append(np.dot(self.weights[i], delL_dzs[-1]))
        
        delL_dzs.reverse()
        delL_dws = []
        for i in range(self.number_of_hidden_layers+1):
            if i==0:
                input_data = np.c_[np.ones(input_data.shape[0]), input_data]
                delL_dws.append(np.dot(input_data.T, delL_dzs[i].T))
            else:
                delL_dws.append(np.dot(a_s[i-1], delL_dzs[i].T))
        
        return delL_dws

    def train(self, input_data, y_true,epochs,batch_size,learning_rate,adaptive_learning_rate = False):
        # Training
        iter = 0
        for i in range(epochs):
            for j in range(0, len(input_data), batch_size):
                if j + batch_size > len(input_data):
                    batch_input_data = input_data[j:]
                    batch_y_true = y_true[j:]
                else:
                    batch_input_data = input_data[j:j+batch_size]
                    batch_y_true = y_true[j:j+batch_size]
                # del_w = self.back_propagation(batch_input_data, batch_y_true)
                del_w = self.back_propagation2(batch_input_data, batch_y_true)
                if adaptive_learning_rate == True:
                    learning_rate = learning_rate * np.sqrt(1/(iter+1))
                for k in range(len(self.weights)):
                    self.weights[k] -= learning_rate * del_w[k]
                iter += 1
                if i==0 and iter == 4:
                    self.weights_history.append(self.weights)
            if i==4:
                self.weights_history.append(self.weights)
            if (i+1)%50==0:
                print(f'Epoch:{i+1}', self.evaluate(input_data, y_true))
    def predict(self, input_data):
        input_data = np.array(input_data, ndmin=2)
        y_pred, _, _ = self.feed_forward(input_data)
        return y_pred

    def evaluate(self, input_data, y_true):
        y_pred = self.predict(input_data)
        return self.loss_function(y_true.T, y_pred)

In [21]:
nn = Neural_Network(input_size = 200,
    hidden_layer_size_array = [100, 50,20,10],
    output_size = 2,
    activation_function = 'relu',
    output_activation_function = 'softmax',
    loss_function = loss_function_CE
)

In [22]:
nn.train(
    X_train, y_true,
    epochs=5,batch_size=100,
    learning_rate=0.001,
    adaptive_learning_rate=False
)

In [23]:
from sklearn.metrics import r2_score

for i in range(5):
    k = np.load(f'checker_weights/toy_dataset/tc_3/ac_w_{i+1}.npy')
    l = nn.weights_history[1][i]
    print(r2_score(k,l))

for i in range(5):
    k = np.load(f'checker_weights/toy_dataset/tc_3/ac_w_{i+1}_iter.npy')
    l = nn.weights_history[0][i]
    print(r2_score(k,l))

-1.012659156889688
-1.0242650918810787
-1.1188947919149268
-1.0986614611714791
-4.678288841881175
-1.0126145869763976
-1.023988347577898
-1.1189594361712538
-1.0978414414647804
-4.47475092259864
