In [1]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)
def relu(x):
    return np.maximum(0, x)
def relu_derivative(x):
    return np.where(x > 0, 1, 0)
def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def mean_squared_error(predictions, targets):
    return np.mean((predictions - targets) ** 2)

def mean_squared_error_derivative(predictions, targets):
    return 2 * (predictions - targets) / targets.size
def binary_cross_entropy(output,target):
    output = np.clip(output, 1e-9, 1 - 1e-9)
    return -np.sum([y*np.log(p) + (1-y)*np.log(1-p) for y,p in zip(target,output)])
def categorical_cross_entropy(predictions, targets):
    predictions = np.clip(predictions, 1e-9, 1 - 1e-9)
    return -np.sum(targets * np.log(predictions)) / targets.shape[0]
def cross_entropy_derivative(output, target):
    return output - target  

"""
Инициализация весов с использованием He Initialization.

Аргументы:
shape -- кортеж, определяющий размерность матрицы весов (например, (n_l, n_{l-1}))

Возвращает:
weights -- инициализированная матрица весов с размером shape, где weights(a, b), a - число нейронов в слое, b - число входов с предыдущего слоя
"""
def he_initialization(shape):
    if type(shape) == int  or type(shape) == np.int64:
        stddev = np.sqrt(2.0 / shape)
        weights = np.random.randn(shape) * stddev
    else:
        stddev = np.sqrt(2.0 / shape[1])
        weights = np.random.randn(*shape) * stddev

    return weights
def xavier_initialization(shape):
    if type(shape) == int or type(shape) == np.int64:
        stddev = np.sqrt(6.0)/shape
        weights = np.random.randn(shape) * stddev
    else:
        stddev = np.sqrt(6.0) / np.sqrt(shape[0] + shape[1])
        weights = np.random.randn(*shape) * stddev
    return weights

def leCun_initialization(shape):
    if type(shape) == int or type(shape) == np.int64:
        stddev = np.sqrt(1.0 / shape)
        weights = np.random.randn(shape) * stddev
    else:
        stddev = np.sqrt(1.0 / shape[1])
        weights = np.random.randn(*shape) * stddev
    return weights

In [2]:
import numpy as np 
class Layer:
    def __init__(self, input_size, output_size,activation=None,derivative=None,initialize_function=np.random.randn,batch_norm=False, momentum=0.9, epsilon=1e-5):
        self.init = initialize_function
        self.weights = self.init(input_size,output_size)
        self.bias = self.init(1,output_size)
        self.input = None  
        self.z = None 
        self.output = None 
        self.activation = activation
        self.derivative = derivative
        self.error = None
        self.batch_norm = batch_norm
        # Adding Batch Normalization parameters
        if self.batch_norm:
            self.momentum = momentum
            self.gamma =  np.ones((1,output_size))
            self.beta = np.zeros((1,output_size))
            self.running_mean = np.zeros(output_size)
            self.running_var = np.ones(output_size)
            self.cache = None
        # Adding adam optimization parameters
        self.m_w, self.v_w = np.zeros_like(self.weights), np.zeros_like(self.weights)
        self.m_b, self.v_b = np.zeros_like(self.bias), np.zeros_like(self.bias)
        self.beta1, self.beta2 = 0.9,0.99
        self.t = 0
        self.epsilon = epsilon
    def __call__(self,X,train=True):
        self.input = X 
        self.z = np.dot(X, self.weights) + self.bias
        if self.batch_norm:
            if train:
                mean = np.mean(self.z, axis=0)
                var = np.var(self.z, axis=0)
                z_norm = (self.z - mean) / np.sqrt(var + self.epsilon)
                self.output = self.gamma * z_norm + self.beta
                self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mean
                self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
                self.cache = (self.z, z_norm, mean, var)
            else:
                self.z_norm = (self.z - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
                self.output = self.gamma * self.z_norm + self.beta  
        else:
            self.output = self.activation(self.z)
        return self.output 
    def backward(self,error, learning_rate,L2,adam_optimize,clipping_mode='norm',clip_threshold=1.0):
        self.error = error *  self.derivative(self.z)
        self.t += 1
        penalty = self.L2(L2,self.weights) if L2 is not None else 0
        self.gradient_clipping(clipping_mode,clip_threshold)
        if self.batch_norm:
            self.error, self.gamma,self.beta = self.batch_normalization(self.error,self.gamma, self.beta,learning_rate)
        if adam_optimize: 
            self.adam(learning_rate)
        else:
            self.weights -= learning_rate * (np.dot(self.input.T, self.error) + penalty)
            self.bias -= learning_rate * np.sum(self.error,axis=0,keepdims=True)

        propagated_error = np.dot(self.error, self.weights.T)
        return propagated_error

    def adam(self, learning_rate):
        self.m_w = self.beta1 * self.m_w + (1 - self.beta1) * np.dot(self.input.T, self.error)
        self.v_w = self.beta2 * self.v_w + (1 - self.beta2) * (np.dot(self.input.T, self.error) ** 2)
        m_w_hat = self.m_w / (1 - self.beta1 ** self.t)
        v_w_hat = self.v_w / (1 - self.beta2 ** self.t)
        self.weights -= learning_rate * m_w_hat / (np.sqrt(v_w_hat) + self.epsilon)

        self.m_b = self.beta1 * self.m_b + (1 - self.beta1) * np.sum(self.error, axis=0, keepdims=True)
        self.v_b = self.beta2 * self.v_b + (1 - self.beta2) * (np.sum(self.error, axis=0, keepdims=True) ** 2)
        m_b_hat = self.m_b / (1 - self.beta1 ** self.t)
        v_b_hat = self.v_b / (1 - self.beta2 ** self.t)
        self.bias -= learning_rate * m_b_hat / (np.sqrt(v_b_hat) + self.epsilon)
    def gradient_clipping(self,mode,clip_threshold):
        if mode == 'value':
            self.error = np.clip(self.error, -clip_threshold, clip_threshold)
        elif mode == 'norm':
            norm = np.linalg.norm(self.error)
            if norm > clip_threshold:
                self.error = self.error / norm * clip_threshold
        else:
            return  
    def batch_normalization(self,error,gamma,beta, learning_rate):
        z , z_norm , mean , var = self.cache
        m = z.shape[0]
        dbeta = np.sum(error,axis=0)
        dgamma = np.sum(error * z_norm,axis=0)
        dz_norm = error * self.gamma
        dvar = np.sum(dz_norm * (z - mean) * -0.5 * (var + self.epsilon)**-1.5, axis=0)
        dmean = np.sum(dz_norm * -1.0 / np.sqrt(var + self.epsilon), axis=0) + dvar * np.mean(-2.0 * (z - mean), axis=0)

        error = dz_norm * 1.0 / np.sqrt(var + self.epsilon) + dvar * 2.0 * (z - mean) / m + dmean / m
        gamma -= learning_rate * dgamma
        beta -= learning_rate * dbeta
        return error, gamma, beta 
    def L2(self, C, weights):
        return 2/C * weights
    def set_initialization(self,func):
        self.init = func
class NeuralNetwork:
    def __init__(self,sizes,activation_func=lambda x : x, derivative_func=lambda x: 1,init_func=np.random.randn,use_batch_norm=True):
        self.layers = [
            Layer(input_size=input,output_size=output,activation=activation_func,derivative=derivative_func,initialize_function=init_func,batch_norm=use_batch_norm) 
            for input,output
            in zip(sizes[:-1],sizes[1:])
        ]
        self.loss = None
        self.loss_derivative = None 
    def forward(self,X):
        for layer in self.layers:
            X = layer(X)
        return X 
    def backward(self,error,learning_rate,L2,adam_optimizer,clipping_mode,clipping_threshold):
        for layer in reversed(self.layers):
            error = layer.backward(error,learning_rate,L2,adam_optimizer,clipping_mode,clipping_threshold)
            
    def train(self,X,y,*,epochs=100, learning_rate=10e-4,L2=None,adam_optimizer=False,clipping_mode='norm',clipping_threshold=1.0,batch_size=2):
        rows = X.shape[0]
        for epoch in range(epochs):
            indices = np.random.permutation(rows)
            X_permuted = X[indices]
            y_permuted = y[indices]
            for start in range(0,rows,batch_size):
                end = min(start + batch_size, rows)
                X_batch = X_permuted[start:end]
                y_batch = y_permuted[start:end]
                output = self.forward(X_batch)
                error = self.loss_derivative(output,y_batch)
                self.backward(error,learning_rate,L2,adam_optimizer,clipping_mode,clipping_threshold)
            print(f"Iteration: {epoch}, Loss : {self.loss(self.forward(X),y)}")
    def set_output_function(self,func,derivative):
        self.layers[-1].activation =  func 
        self.layers[-1].derivative = derivative
    def set_loss_function(self,func,derivative):
        self.loss = func 
        self.loss_derivative = derivative
    def predict(self,X):
        for layer in self.layers:
            X = layer(X,False)
        return X 

In [6]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from scipy.special import softmax

digits = load_digits()
# One-hot encode the labels
encoder = OneHotEncoder(sparse_output=False)
y_onehot = encoder.fit_transform(digits.target.reshape(-1, 1))
X_train,X_test,y_train,y_test = train_test_split(digits.data,y_onehot,test_size=0.2)

In [7]:
batch_size = 80
size = [64,128,32,10]
nn = NeuralNetwork(size,relu,relu_derivative,lambda x,y: he_initialization((x,y)),use_batch_norm=True)
nn.set_output_function(lambda x: softmax(x), lambda x : softmax(x))
nn.set_loss_function(categorical_cross_entropy,lambda x,y : x - y)

In [8]:
nn.train(X_train,y_train,epochs=100,learning_rate=10e-3,L2=10,clipping_mode='norm',clipping_threshold=1.0,adam_optimizer=False,batch_size=batch_size)

Iteration: 0, Loss : 11.437662176427967
Iteration: 1, Loss : 11.326383092076966
Iteration: 2, Loss : 11.37032818915962
Iteration: 3, Loss : 11.341968381382609
Iteration: 4, Loss : 11.25681875436386
Iteration: 5, Loss : 11.085937901312835
Iteration: 6, Loss : 11.059317175752588
Iteration: 7, Loss : 11.016567629328689
Iteration: 8, Loss : 10.949255342055165
Iteration: 9, Loss : 10.847183044275962
Iteration: 10, Loss : 10.769190866215359
Iteration: 11, Loss : 10.672937229209689
Iteration: 12, Loss : 10.550011013109621
Iteration: 13, Loss : 10.499053840336162
Iteration: 14, Loss : 10.402743306124018
Iteration: 15, Loss : 10.306485521243722
Iteration: 16, Loss : 10.254527176514689
Iteration: 17, Loss : 10.210400655136658
Iteration: 18, Loss : 10.0195288760453
Iteration: 19, Loss : 9.908534949486164
Iteration: 20, Loss : 9.732933290498188
Iteration: 21, Loss : 9.600185630573346
Iteration: 22, Loss : 9.449245227798956
Iteration: 23, Loss : 9.297664845306008
Iteration: 24, Loss : 9.14787948330

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Forward method to get predictions
predictions = nn.predict(X_test)
# If predictions are in the form of probabilities, convert them to class labels
predicted_labels = np.argmax(predictions, axis=1)
true_labels = np.argmax(y_test, axis=1)
predicted_labels

array([0, 4, 0, 0, 0, 0, 8, 9, 5, 3, 3, 2, 2, 4, 0, 1, 3, 2, 8, 8, 0, 3,
       5, 3, 1, 4, 4, 9, 3, 4, 0, 9, 7, 7, 7, 4, 7, 0, 5, 4, 0, 7, 2, 3,
       6, 7, 9, 1, 4, 7, 7, 7, 4, 2, 8, 1, 1, 4, 2, 0, 7, 6, 0, 1, 2, 0,
       2, 6, 6, 3, 1, 0, 6, 4, 2, 4, 9, 6, 2, 2, 8, 6, 7, 2, 1, 0, 3, 2,
       5, 7, 1, 2, 8, 7, 9, 3, 7, 4, 6, 4, 9, 8, 0, 1, 3, 5, 0, 0, 5, 8,
       0, 0, 0, 4, 9, 0, 8, 5, 9, 4, 9, 5, 6, 1, 6, 4, 5, 2, 7, 9, 1, 2,
       9, 8, 7, 4, 2, 3, 8, 2, 9, 5, 5, 6, 2, 9, 1, 8, 6, 8, 2, 5, 4, 5,
       2, 2, 4, 5, 2, 0, 1, 5, 6, 1, 9, 1, 1, 5, 2, 9, 3, 6, 4, 1, 2, 6,
       6, 9, 8, 9, 1, 4, 2, 3, 5, 0, 4, 3, 0, 4, 8, 7, 4, 9, 0, 9, 3, 1,
       4, 8, 2, 9, 7, 1, 1, 7, 7, 1, 5, 2, 1, 0, 2, 4, 6, 3, 4, 8, 4, 7,
       1, 9, 2, 1, 4, 2, 3, 6, 8, 4, 9, 3, 4, 5, 4, 7, 6, 0, 7, 1, 6, 2,
       4, 5, 1, 0, 2, 2, 1, 4, 2, 3, 5, 1, 7, 2, 4, 9, 8, 0, 9, 8, 3, 6,
       2, 0, 7, 6, 7, 0, 2, 6, 7, 8, 5, 7, 6, 3, 3, 6, 9, 7, 9, 8, 5, 5,
       4, 3, 0, 2, 7, 4, 4, 6, 1, 8, 5, 1, 2, 1, 8,

In [12]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, confusion_matrix
print(f"Accuracy score: {accuracy_score(true_labels,predicted_labels)}")
print(precision_score(true_labels, predicted_labels, average='macro'))
print(recall_score(true_labels, predicted_labels, average='macro'))
print(confusion_matrix(true_labels, predicted_labels))
print(f1_score(true_labels, predicted_labels, average='macro'))

Accuracy score: 0.9277777777777778
0.9268386766212853
0.9223740604038158
[[43  0  0  0  0  0  0  0  0  0]
 [ 0 30  2  0  0  0  0  0  0  1]
 [ 0  0 41  0  0  0  0  0  0  0]
 [ 0  0  2 32  0  0  0  0  0  2]
 [ 1  2  0  0 45  0  0  0  0  0]
 [ 0  0  0  0  0 28  0  0  0  1]
 [ 0  0  1  0  0  0 32  0  0  0]
 [ 0  0  0  0  1  0  0 34  0  1]
 [ 0  3  0  1  0  0  0  0 23  4]
 [ 0  0  0  0  0  0  0  0  4 26]]
0.9229974009000482


In [13]:
# Define network architecture
batch_size = 2
sizes = [batch_size, 5,1]

# Initialize network
nn = NeuralNetwork(sizes, sigmoid, sigmoid_derivative,init_func=lambda x,y: xavier_initialization((x,y)),use_batch_norm=False)
nn.set_loss_function(mean_squared_error, mean_squared_error_derivative)
nn.set_output_function(sigmoid,sigmoid_derivative)

X = np.array([[0,0],[0,1],[1,0],[1,1]])
y = np.array([[0],[0],[0],[1]])
nn.train(X,y,epochs=1000,learning_rate=10e-1,batch_size=2)
nn.forward(X)

Iteration: 0, Loss : 0.23534449897541018
Iteration: 1, Loss : 0.17951921927881992
Iteration: 2, Loss : 0.1689733809766195
Iteration: 3, Loss : 0.1616249405912087
Iteration: 4, Loss : 0.15880304636731019
Iteration: 5, Loss : 0.1570854558812189
Iteration: 6, Loss : 0.15510839106795843
Iteration: 7, Loss : 0.153118303169114
Iteration: 8, Loss : 0.1511461331985599
Iteration: 9, Loss : 0.149134005950864
Iteration: 10, Loss : 0.14731058110607181
Iteration: 11, Loss : 0.14576858539122858
Iteration: 12, Loss : 0.14291104483186165
Iteration: 13, Loss : 0.14058706897508277
Iteration: 14, Loss : 0.1385003389314597
Iteration: 15, Loss : 0.13630031078604932
Iteration: 16, Loss : 0.13428549020706992
Iteration: 17, Loss : 0.13228550958273463
Iteration: 18, Loss : 0.13030561374763427
Iteration: 19, Loss : 0.127683571569716
Iteration: 20, Loss : 0.1256437685227509
Iteration: 21, Loss : 0.12383097244465181
Iteration: 22, Loss : 0.12185585133522858
Iteration: 23, Loss : 0.11920234682457181
Iteration: 24,

array([[6.84523978e-04],
       [2.58524231e-02],
       [2.48995743e-02],
       [9.66421984e-01]])

In [9]:
batch_size = 1
X = np.array([[-1],[-2],[-3],[1],[2],[3]])
y = np.array([[1],[2],[3],[-1],[-2],[-3]])
size = [1,2,1]
nn = NeuralNetwork(size,lambda x: x,lambda x: 1,init_func=lambda x,y: xavier_initialization((x,y)),use_batch_norm=False)
nn.set_loss_function(mean_squared_error, mean_squared_error_derivative)
nn.train(X,y,epochs=100,learning_rate=0.01, batch_size=3)
nn.forward([[-25],[5],[-4],[4]])

Iteration: 0, Loss : 4.6256037596674915
Iteration: 1, Loss : 3.141815460215979
Iteration: 2, Loss : 2.092505233622914
Iteration: 3, Loss : 1.3166255515480165
Iteration: 4, Loss : 0.7967779476806705
Iteration: 5, Loss : 0.4655553478348294
Iteration: 6, Loss : 0.2894731666914195
Iteration: 7, Loss : 0.191451458174326
Iteration: 8, Loss : 0.1288762906705466
Iteration: 9, Loss : 0.08452455767479518
Iteration: 10, Loss : 0.05761672905320724
Iteration: 11, Loss : 0.037987432208739944
Iteration: 12, Loss : 0.02503346082537029
Iteration: 13, Loss : 0.016873306370959542
Iteration: 14, Loss : 0.011381623919114528
Iteration: 15, Loss : 0.007664189935640949
Iteration: 16, Loss : 0.005091499444436252
Iteration: 17, Loss : 0.0033698303616038357
Iteration: 18, Loss : 0.0022501270643953356
Iteration: 19, Loss : 0.0014576617698522046
Iteration: 20, Loss : 0.0009734078843813263
Iteration: 21, Loss : 0.000650290137643332
Iteration: 22, Loss : 0.000435692508433223
Iteration: 23, Loss : 0.00028791034728484

array([[24.99999998],
       [-5.        ],
       [ 4.        ],
       [-4.        ]])

In [15]:
batch_size = 1 
size = [2,3,1]
X = np.array([[0,0],[0,1],[1,0],[1,1]])
y = np.array([[0],[1],[1],[0]])
nn = NeuralNetwork(size,sigmoid,sigmoid_derivative,use_batch_norm=False)
nn.set_loss_function(mean_squared_error,mean_squared_error_derivative)
nn.train(X,y,epochs=1000,learning_rate=10e-3,L2=None,clipping_mode='norm',adam_optimizer=True,batch_size=batch_size)
nn.forward(X)

Iteration: 0, Loss : 0.3273192813941069
Iteration: 1, Loss : 0.3231620457619328
Iteration: 2, Loss : 0.31847123627134194
Iteration: 3, Loss : 0.31358156521103636
Iteration: 4, Loss : 0.3092275355094654
Iteration: 5, Loss : 0.3036810266499065
Iteration: 6, Loss : 0.2991717185124966
Iteration: 7, Loss : 0.2942886192030614
Iteration: 8, Loss : 0.29004389345731557
Iteration: 9, Loss : 0.2875134780849379
Iteration: 10, Loss : 0.2841809256441531
Iteration: 11, Loss : 0.2805468531976735
Iteration: 12, Loss : 0.27776684040649385
Iteration: 13, Loss : 0.2759967293962527
Iteration: 14, Loss : 0.2730880384465447
Iteration: 15, Loss : 0.2716348755939501
Iteration: 16, Loss : 0.2700635307896075
Iteration: 17, Loss : 0.2679957896384892
Iteration: 18, Loss : 0.2663799644503365
Iteration: 19, Loss : 0.26428193523530746
Iteration: 20, Loss : 0.26306270837359724
Iteration: 21, Loss : 0.26170355635380943
Iteration: 22, Loss : 0.26088993749106487
Iteration: 23, Loss : 0.260213684144212
Iteration: 24, Loss

array([[0.00636636],
       [0.97875838],
       [0.97868883],
       [0.03390598]])