In [33]:
import numpy as np
from sklearn.datasets import fetch_california_housing , load_iris
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split

In [31]:
class _matrix_multiplication:
    def __init__(self, X, W): 
        self.X = X 
        self.W = W 

    def forward(self):
        ### N = W @ X
        self.N = np.dot(self.X, self.W)

    def backward(self):
        ### dN/dW = X.T
        self.dN_dW = (self.X).T

In [5]:
class _bias_addition :
    def __init__(self, WX, bias):
        self.B = bias
        self.WX = WX
    
    def forward(self):
        ### WXpB = N + B
        ### N = W @ X
        self.WXpB = self.WX + self.B
    
    def backward(self):
        ### dWXpB/dB = identity matrix
        self.dWXpB_dB = np.identity(self.B.shape[1])

In [6]:
class _linear:
    def __init__(self, Z):
        self.Z = Z 
    
    def forward(self, ):
        self.aZ = self.Z 
    
    def backward(self,):
        self.daZ_dZ = np.identity(self.Z.shape[1])

In [7]:
class _mean_squared_error : 
    def __init__(self ,Y ,Y_pred):
        self.Y = Y 
        self.Y_pred = Y_pred
    
    def forward(self):
        ### mean squared loss is (Y_predicted - Y)**2
        self.L = np.mean((self.Y_pred - self.Y)**2)
        
    def backward(self):
        ### dL/dY_pred = (1/M)*(2*(Y_pred - Y))
        ### M = no of training examples
        self.dL_daZ = (2*(self.Y_pred - self.Y).T)/len(self.Y)      

In [8]:
class _softmax:
    def __init__(self, Z):
        self.Z = Z
    
    ### standalone function (do not require any information from class) but semantically somehow related to the class so staticmethod is used
    ### without @staticmethod, error was coming 
    @staticmethod
    def _softmax(Z):
        max_Z = np.max(Z, axis=1, keepdims=True )
        return (np.exp(Z - max_Z))/np.sum(np.exp(Z - max_Z), axis=1, keepdims=True)
        
    def forward(self):
        self.aZ = self._softmax(self.Z)
    
    def backward(self):
        self.daZ_dZ = np.diag(self.aZ.reshape(-1))-(self.aZ.T)@((self.aZ))
    

In [9]:
class _sigmoid:
    def __init__(self, Z):
        self.Z = Z 
    
    def forward(self,):
        self.aZ = self.sigmoid(self.Z)

    def backward(self,):
        diag_entries = np.multiply(self.aZ, 1-self.aZ).reshape(-1)
        self.daZ_dZ = np.diag(diag_entries) 
    
    def sigmoid(Z):
        return  1./(1+np.exp(-Z))

In [10]:
class _cross_entropy_loss :
    def __init__(self, Y, Y_pred): 
        self.eps = 1e-40
        self.Y = Y
        self.aZ = Y_pred
    
    def forward(self):
        self.L = -np.sum(self.Y * np.log(self.aZ+self.eps))
        
    def backward(self):
        self.dL_daZ = -1*(self.Y/(self.aZ + self.eps)).T

In [11]:
def load(dataset_name='california', normalize_X=False, one_hot = False, test_size=0.3):
    
    ### loading dataset as per requirement
    if dataset_name == 'california' : 
        data = fetch_california_housing()
    elif dataset_name == 'iris' : 
        data = load_iris()

    X = data['data']
    y = data['target'].reshape(-1,1)
    
    ### Normalizer used for normalizing the dataset

    if normalize_X == True : 
        normalizer = Normalizer()
        X  = normalizer.fit_transform(X)
    
    ### Conversion of y to one hot encoding as per demand
    if one_hot == True : 
        y = np.eye(3)[y.reshape(-1)]
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    return X_train, y_train, X_test, y_test

In [12]:
def _forward(X_sample, Y_sample, W, B, act='linear', loss='mean_squared'):
    
    ### matrix multiplication
    updateLayer = _matrix_multiplication(X_sample, W)
    updateLayer.forward()

    ### adding bias
    addBias = _bias_addition(updateLayer.N, B)
    addBias.forward()

    ### activation layer as per requirement
    if act == 'softmax': 
        actLayer = _softmax(addBias.WXpB)
    elif act == 'linear' : 
        actLayer = _linear(addBias.WXpB)

    ### activation
    actLayer.forward()
    
    ### loss function as per requirement
    if loss=='cross_entropy':
        lossLayer = _cross_entropy_loss(Y_sample, actLayer.aZ )
    elif loss == 'mean_squared':
        lossLayer = _mean_squared_error(Y_sample, actLayer.aZ )
    
    ### finding loss
    lossLayer.forward()
    
    return updateLayer, addBias, actLayer, lossLayer

In [13]:
def _backward(updateLayer, addBias, actLayer, lossLayer): 

    ### returning all the losses
    ### not necessarily required
    lossLayer.backward()
    actLayer.backward()
    addBias.backward()
    updateLayer.backward()

    return lossLayer, actLayer, addBias, updateLayer 

In [14]:
train_acc_q3 = []
test_acc_q3 = []

In [15]:
def GradiantDescent(X_train, y_train, X_test, y_test, inShape = 1, seed = 42, n_iters = 100, outShape= 1, learning_rate = 0.001, activation = 'linear', loss = 'mean_squared', problem='regression'):

    ### random seed
    np.random.seed(seed)

    ### initializing weights and biases randomly
    W = np.random.random((inShape, outShape))
    B  = np.random.random((1, outShape))

    for _ in range(n_iters):
        for j in range(len(X_train)): 
            X_sample = X_train[j, :].reshape(1, inShape)
            Y_sample = y_train[j, :].reshape(1, outShape)

            ### Forward Pass
            updateLayer, addBias, actLayer, lossLayer = _forward(X_sample, Y_sample, W, B, activation,loss)

            ### Backward Pass 
            lossLayer, actLayer, addBias, updateLayer = _backward(updateLayer, addBias, actLayer, lossLayer)

            dL_daZ = lossLayer.dL_daZ 
            ### print(dL_daZ)
            dL_dZ = np.dot( actLayer.daZ_dZ, dL_daZ ) 
            ### print(dL_dZ)
            dL_dW = np.dot( updateLayer.dN_dW , dL_dZ.T)
            ### print(dL_dW)
            dL_dB = np.dot( addBias.dWXpB_dB, dL_dZ).T
            ### print(dL_dB)

            ### Update the weights and bias
            W -=  learning_rate*dL_dW 
            B -=  learning_rate*dL_dB

    if problem =='classification': 
        ### TRAINING
        ### truth value
        y_true = np.argmax(y_train, axis=1)
        _, _, _, lossLayer = _forward( X_train, y_train , W, B, activation, loss)

        ### predicted value
        y_pred = np.argmax(lossLayer.aZ, axis=1)

        ### finding accuracy
        acc = (y_pred == y_true)
        print(f"Train Accuracy: {sum(acc)*100/len(acc)} %")

        train_acc_q3.append(sum(acc)*100/len(acc))

        ### TESTING
        ### truth value
        y_true = np.argmax(y_test,axis=1)

        ### predicted value
        _, _, _, lossLayer = _forward( X_test, y_test , W, B, activation, loss)
        y_pred = np.argmax( lossLayer.aZ, axis=1)

        ### finding accuracy
        acc = (y_pred == y_true)
        print(f"Test Accuracy: {sum(acc)*100/len(acc)} %")
        test_acc_q3.append(sum(acc)*100/len(acc))

        return train_acc_q3, test_acc_q3
            
    if problem =='regression':
        ### TRAINING
        _ , _, _, lossLayer = _forward( X_train, y_train , W, B, activation, loss)
        print(f"Train Error : {lossLayer.L}")
                    
        ### TESTING
        _ , _, _, lossLayer = _forward( X_test, y_test , W, B, activation, loss)
        print(f"Test Error : {lossLayer.L}")
    



In [16]:
X_train, y_train, X_test, y_test  = load('california', normalize_X=True, test_size=0.3)

In [17]:
GradiantDescent(X_train, y_train, X_test, y_test, inShape=X_train.shape[1], outShape=y_train.shape[1], problem='regression')

Train Error : 1.2934855427739083
Test Error : 1.3186464066060344


In [18]:
X_train, y_train, X_test, y_test = load('iris', normalize_X=True, one_hot=True)

In [19]:
trq3, tesq3 = GradiantDescent(X_train, y_train, X_test, y_test, inShape=X_train.shape[1], outShape=y_train.shape[1], n_iters=50, learning_rate=0.03, activation='softmax', problem='classification', loss='cross_entropy')

Train Accuracy: 97.14285714285714 %
Test Accuracy: 95.55555555555556 %


In [20]:
ls = np.linspace(0, 1, 101)

In [21]:
ls

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [22]:
train_acc = []
test_acc = []
for i in range(len(ls)):
    print(ls[i], i)
    trainq3, testq3 = GradiantDescent(X_train, y_train, X_test, y_test, inShape=X_train.shape[1], outShape=y_train.shape[1], n_iters=50, learning_rate=ls[i], activation='softmax', problem='classification', loss='cross_entropy')
    train_acc.append(trainq3)
    test_acc.append(testq3)

0.0 0
Train Accuracy: 42.857142857142854 %
Test Accuracy: 33.333333333333336 %
0.01 1
Train Accuracy: 82.85714285714286 %
Test Accuracy: 77.77777777777777 %
0.02 2
Train Accuracy: 95.23809523809524 %
Test Accuracy: 86.66666666666667 %
0.03 3
Train Accuracy: 97.14285714285714 %
Test Accuracy: 95.55555555555556 %
0.04 4
Train Accuracy: 97.14285714285714 %
Test Accuracy: 93.33333333333333 %
0.05 5
Train Accuracy: 97.14285714285714 %
Test Accuracy: 93.33333333333333 %
0.06 6
Train Accuracy: 96.19047619047619 %
Test Accuracy: 93.33333333333333 %
0.07 7
Train Accuracy: 94.28571428571429 %
Test Accuracy: 93.33333333333333 %
0.08 8
Train Accuracy: 94.28571428571429 %
Test Accuracy: 93.33333333333333 %
0.09 9
Train Accuracy: 94.28571428571429 %
Test Accuracy: 93.33333333333333 %
0.1 10
Train Accuracy: 94.28571428571429 %
Test Accuracy: 93.33333333333333 %
0.11 11
Train Accuracy: 94.28571428571429 %
Test Accuracy: 93.33333333333333 %
0.12 12
Train Accuracy: 96.19047619047619 %
Test Accuracy: 93.

In [32]:
train_acc[0].pop()

96.19047619047619

In [26]:
plt.plot(ls, train_acc[0])

NameError: name 'plt' is not defined

In [28]:
test_acc[0].pop()

93.33333333333333

In [29]:
plt.plot(ls, test_acc[0])

NameError: name 'plt' is not defined

In [None]:
### Conclusion: Best learning rate is around 0.3