# NN3: Implementacja momentu i normalizacji gradientu
Adrianna Grudzień

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import random

In [2]:
class Layer_with_momentum:
    def __init__(self, shape, activation='sigmoid'):
        self.shape = shape
        self.activation = activation
        self._initialize_weights()

    def _initialize_weights(self, min_val=-0.5, max_val=0.5):
        self.weights = np.random.uniform(min_val, max_val, size=self.shape)
        self.biases = np.random.uniform(min_val, max_val, size=self.shape[1])

    def calculate(self, x):
        return np.matmul(x, self.weights) + self.biases

    def activate(self, x):
        if self.activation == 'sigmoid':
            return Layer_with_momentum.sigmoid(x)
        if self.activation == 'linear':
            return x   
        
    def update(self, momentum_weights, momentum_biases, learning_rate):            
        self.weights += np.array(np.reshape([learning_rate*i for i in momentum_weights],self.shape))
        self.biases += np.array(np.reshape([learning_rate*i for i in momentum_biases], self.shape[1]))
        
    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

class NN_with_momentum:
    def __init__(self, input_shape, output_size, layers_num=1, neurons_num=[5], activations=['linear']):
        self.input_shape = input_shape
        # self.output_size = output_size
        self.layers_num = layers_num
        self.neurons_num = neurons_num
        self.activations = activations
        self._build()

    def visualise_weights(self):
        for i in range(self.layers_num):
            plt.subplot(self.layers_num, 1, i+1)
            plt.scatter(np.arange(self.neurons_num[i]), self.layers[i].weights)
            
        plt.show()

    @staticmethod
    def sigmoid_derivative(x):
        return np.exp(-x) / np.square((1 + np.exp(-x)))

    @staticmethod
    def mse(y_true, y_pred):
         return np.mean(np.square(y_true - y_pred))

    def calculate_errors(self, y_true, y_pred):
        errors = []
        for i in range(self.layers_num - 1, -1, -1):
            layer = self.layers[i]
            if layer.activation == 'linear':
                derivative = np.ones(shape=(layer.shape[-1], 1))
            if layer.activation == 'sigmoid':
                derivative = NN_with_momentum.sigmoid_derivative(self.recent_calculations[i + 1].reshape(layer.shape[-1], 1))
            if i == self.layers_num - 1:
                errors.append(np.multiply((y_pred - y_true), derivative))
            else:
                errors.append(np.multiply(derivative, np.dot(self.layers[i + 1].weights, errors[-1])))
        errors.reverse()
        return errors
      
    def update_layers(self):
        for i in range(self.layers_num):  
            self.layers[i].update(self.momentum_weights[i], self.momentum_biases[i], self.learning_rate)
            
    def backpropagate(self, y_pred, y_true, x):
        delta_weights = []
        delta_biases = []
        
        errors = self.calculate_errors(y_true, y_pred) 
        for i in range(self.layers_num - 1, 0, -1):     
            a = self.layers[i - 1].activate(self.recent_calculations[i].reshape(self.layers[i - 1].shape[-1], 1))       
            delta_weights.insert(0, -self.learning_rate * np.outer(a, errors[i]) / self.batch_size)
            delta_biases.insert(0, -self.learning_rate * errors[i] / self.batch_size)
        delta_weights.insert(0, -self.learning_rate * np.outer(x, errors[0]) / self.batch_size)
        delta_biases.insert(0, -self.learning_rate * errors[0] / self.batch_size)
        
        return delta_weights, delta_biases
                
    def sum_deltas(self, delta_weights, delta_biases):
        for i in range(self.layers_num):
            self.delta_weights[i] += np.array(delta_weights[i])
            self.delta_biases[i] += np.array(delta_biases[i])

    def _build(self):
        self.layers = []

        layer = Layer_with_momentum(shape=(self.input_shape[1], self.neurons_num[0]), activation=self.activations[0])
        self.layers.append(layer)

        for i in range(1, self.layers_num):
            layer = Layer_with_momentum(shape=(self.layers[i - 1].shape[1], self.neurons_num[i]), activation=self.activations[i])
            self.layers.append(layer)

        # layer = Layer_with_momentum(shape=(self.layers[-1].shape[1], self.output_size), activation=self.activations[-1])
        # self.layers.append(layer)

    @staticmethod
    def convert_to_numpy_array(x_train, y_train, x_test, y_test):
        if x_test is None or y_test is None:
            return np.array(x_train), np.array(y_train), None, None
        else:
            return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

    def print_results(self, epoch):
        print(f'Epoch number {epoch}/{self.n_epochs}')
        print(f'MSE on training set: {NN_with_momentum.mse(self.y_train, self.predict(self.x_train))}', end=' ')
        if self.x_test is not None:
            print(f'     , MSE on test set: {NN_with_momentum.mse(self.y_test, self.predict(self.x_test))}')

    def predict(self, input):
        self.recent_calculations = []
        self.recent_calculations.append(input)

        x = self.layers[0].calculate(input)
        self.recent_calculations.append(x)
        x = self.layers[0].activate(x)
        

        for i in range(1, self.layers_num):
            x = self.layers[i].calculate(x)
            self.recent_calculations.append(x)
            x = self.layers[i].activate(x)
            assert ~np.isnan(x).any() 

        return x
    
    def fit(self, x_train, y_train, lambdaa, batch_size, n_epochs, learning_rate=0.003, x_test=None, y_test=None):
        """
        lambdaa - współczynnik wygaszania momentu (z przedziału (0,1))
        """
        self.x_train, self.y_train, self.x_test, self.y_test = NN_with_momentum.convert_to_numpy_array(x_train, y_train, x_test, y_test)
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        
        n = y_train.shape[0]
        self.momentum_weights = np.zeros((1,n))
        self.momentum_biases = np.zeros((1,n))

        indices = np.arange(n)
        epoch = 1
        while(epoch <= self.n_epochs):
            np.random.shuffle(indices)
            mini_batches = np.split(indices, [i * batch_size for i in range(1, n // batch_size)])
            for batch in mini_batches:      
                for j in range(batch_size):
                    y_pred = self.predict(self.x_train[batch[j]])
                    if j == 0:
                        self.delta_weights, self.delta_biases = self.backpropagate(y_pred, self.y_train[batch[j]], self.x_train[batch[j]]) # inicjalizowanie delta_weights i delta_bias
                    else:   
                        temp_delta_weights, temp_delta_biases = self.backpropagate(y_pred, self.y_train[batch[j]], self.x_train[batch[j]])
                        self.sum_deltas(temp_delta_weights, temp_delta_biases)
                  

                self.momentum_weights = self.delta_weights + [i*lambdaa for i in self.momentum_weights]
                self.momentum_biases = self.delta_biases + [i*lambdaa for i in self.momentum_biases]
        
                self.update_layers()

            if epoch % 30 == 0:
                 self.print_results(epoch)
            epoch += 1


In [3]:
sq = pd.read_csv('mio1/regression/square-simple-training.csv', index_col=0)
sq_x_train = np.reshape(np.array(sq.x), (100, 1))
sq_y_train = np.reshape(np.array(sq.y), (100, 1))


sq_test = pd.read_csv('mio1/regression/square-simple-test.csv', index_col=0)
sq_x_test = np.reshape(np.array(sq_test.x), (100, 1))
sq_y_test = np.reshape(np.array(sq_test.y), (100, 1))

In [7]:
nn_with_momentum_sq = NN_with_momentum(input_shape=[len(sq_x_train), 1], output_size=1, layers_num=2, neurons_num=[90, 1], activations=['sigmoid', 'linear'])
nn_with_momentum_sq.fit(x_train=sq_x_train, y_train=sq_y_train, lambdaa=0.8, batch_size=2, n_epochs=700, x_test=sq_x_test, y_test=sq_y_test, learning_rate=0.001)

Epoch number 30/700
MSE on training set: 10107.44357736024      , MSE on test set: 8695.858528843639
Epoch number 60/700
MSE on training set: 10041.77448139226      , MSE on test set: 8614.297748578883
Epoch number 90/700
MSE on training set: 9980.423372846317      , MSE on test set: 8537.610428564465
Epoch number 120/700
MSE on training set: 9923.032732534652      , MSE on test set: 8465.40926802242
Epoch number 150/700
MSE on training set: 9869.279260668754      , MSE on test set: 8397.344467929926
Epoch number 180/700
MSE on training set: 9818.868043914683      , MSE on test set: 8333.097040929351
Epoch number 210/700
MSE on training set: 9771.525194178746      , MSE on test set: 8272.36982046207
Epoch number 240/700
MSE on training set: 9727.008399655013      , MSE on test set: 8214.901729486266
Epoch number 270/700
MSE on training set: 9685.086618613595      , MSE on test set: 8160.441766562543
Epoch number 300/700
MSE on training set: 9645.552637920619      , MSE on test set: 810

In [None]:
plt.scatter(sq_x_test, sq_y_test, color='green')
plt.scatter(sq_x_test, nn_with_momentum_sq.predict(sq_x_test), color='red')
plt.show()

## Zbiór steps-small

In [None]:
ss_train = pd.read_csv('mio1/regression/steps-small-training.csv', index_col=0)
ss_x_train = np.reshape(np.array(ss_train.x), (len(ss_train.x), 1))
ss_y_train = np.reshape(np.array(ss_train.y), (len(ss_train.x), 1))


ss_test = pd.read_csv('mio1/regression/steps-small-test.csv', index_col=0)
ss_x_test = np.reshape(np.array(ss_test.x), (len(ss_test.x), 1))
ss_y_test = np.reshape(np.array(ss_test.y), (len(ss_test.x), 1))

In [None]:
nn_with_momentum_ss = NN_with_momentum(input_shape=[len(ss_x_train), 1], output_size=1, layers_num=2, neurons_num=[100, 1], activations=['sigmoid', 'linear'])
nn_with_momentum_ss.fit(x_train=ss_x_train, y_train=ss_y_train, lambdaa=0.8, batch_size=2, n_epochs=700, x_test=ss_x_test, y_test=ss_y_test, learning_rate=0.001)

In [None]:
plt.scatter(ss_x_test, ss_y_test, color='green')
plt.scatter(ss_x_test, nn_with_momentum_ss.predict(ss_x_test), color='red')
plt.show()

## Zbiór multimodal-large

In [None]:
ml = pd.read_csv('mio1/regression/multimodal-large-training.csv')
ml_x_train = np.reshape(np.array(ml.x), (len(ml.x), 1))
ml_y_train = np.reshape(np.array(ml.y), (len(ml.x), 1))


ml_test = pd.read_csv('mio1/regression/multimodal-large-test.csv')
ml_x_test = np.reshape(np.array(ml_test.x), (len(ml_test.x), 1))
ml_y_test = np.reshape(np.array(ml_test.y), (len(ml_test.x), 1))

In [None]:
nn_with_momentum_ml = NN_with_momentum(input_shape=[len(ml_x_train), 1], output_size=1, layers_num=2, neurons_num=[100, 1], activations=['sigmoid', 'linear'])
nn_with_momentum_ml.fit(x_train=ml_x_train, y_train=ml_y_train, lambdaa=0.8, batch_size=2, n_epochs=700, x_test=ml_x_test, y_test=ml_y_test, learning_rate=0.001)

In [None]:
plt.scatter(ml_x_test, ml_y_test, color='green')
plt.scatter(ml_x_test, nn_with_momentum_ss.predict(ml_x_test), color='red')
plt.show()

In [6]:
# nn_sq.fit(x_train=sq_x_train, y_train=sq_y_train, batch_size=2, n_epochs=2000, x_test=sq_x_test, y_test=sq_y_test, learning_rate=0.0003)

In [5]:
# nn_with_momentum_sq.fit(x_train=sq_x_train, y_train=sq_y_train, lambdaa=0.7, batch_size=2, n_epochs=700, x_test=sq_x_test, y_test=sq_y_test, learning_rate=0.001)

In [107]:
class Layer_with_RMSProp:
    def __init__(self, shape, activation='sigmoid'):
        self.shape = shape
        self.activation = activation
        self._initialize_weights()

    def _initialize_weights(self, min_val=-0.5, max_val=0.5):
        self.weights = np.random.uniform(min_val, max_val, size=self.shape)
        self.biases = np.random.uniform(min_val, max_val, size=self.shape[1])

    def calculate(self, x):
        return np.matmul(x, self.weights) + self.biases

    def activate(self, x):
        if self.activation == 'sigmoid':
            return Layer_with_RMSProp.sigmoid(x)
        if self.activation == 'linear':
            return x   
        
    def update(self, g_weights, g_biases, e_g2_weights, e_g2_biases, learning_rate):
#         self.weights -= np.array(np.reshape([learning_rate*g_weights[i]/
#                                              np.sqrt(e_g2_weights[i]) for i in len(e_g2_weights)],self.shape))
#         self.biases -= np.array(np.reshape([learning_rate*g_biases[i]/np.sqrt(e_g2_biases[i]) for i in len(e_g2_biases)],self.shape))
        for j in range(len(e_g2_weights)):
            print(g_weights[j])
            print(e_g2_weights[j])
            print(self.weights[j])
            
            self.weights[j] -= learning_rate*np.asmatrix(g_weights[j])
        
    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

class NN_with_RMSProp:
    def __init__(self, input_shape, output_size, layers_num=1, neurons_num=[5], activations=['linear']):
        self.input_shape = input_shape
        # self.output_size = output_size
        self.layers_num = layers_num
        self.neurons_num = neurons_num
        self.activations = activations
        self._build()

    def visualise_weights(self):
        for i in range(self.layers_num):
            plt.subplot(self.layers_num, 1, i+1)
            plt.scatter(np.arange(self.neurons_num[i]), self.layers[i].weights)
            
        plt.show()

    @staticmethod
    def sigmoid_derivative(x):
        return np.exp(-x) / np.square((1 + np.exp(-x)))

    @staticmethod
    def mse(y_true, y_pred):
         return np.mean(np.square(y_true - y_pred))

    def calculate_errors(self, y_true, y_pred):
        errors = []
        for i in range(self.layers_num - 1, -1, -1):
            layer = self.layers[i]
            if layer.activation == 'linear':
                derivative = np.ones(shape=(layer.shape[-1], 1))
            if layer.activation == 'sigmoid':
                derivative = NN_with_RMSProp.sigmoid_derivative(self.recent_calculations[i + 1].reshape(layer.shape[-1], 1))
            if i == self.layers_num - 1:
                errors.append(np.multiply((y_pred - y_true), derivative))
            else:
                errors.append(np.multiply(derivative, np.dot(self.layers[i + 1].weights, errors[-1])))
        errors.reverse()
        return errors
      
    def update_layers(self):
        # print(self.delta_weights)
        for i in range(self.layers_num):  
            self.layers[i].update(self.g_weights[i], self.g_biases[i], self.e_g2_weights[i], self.e_g2_biases[i], self.learning_rate)
            
    def backpropagate(self, y_pred, y_true, x):
        delta_weights = []
        delta_biases = []
        
        errors = self.calculate_errors(y_true, y_pred) 
        for i in range(self.layers_num - 1, 0, -1):     
            a = self.layers[i - 1].activate(self.recent_calculations[i].reshape(self.layers[i - 1].shape[-1], 1))       
            delta_weights.insert(0, self.learning_rate * np.outer(a, errors[i]) / self.batch_size)
            # print(f'Deltas: {delta_weights[0]}')
            delta_biases.insert(0, self.learning_rate * errors[i] / self.batch_size)
        delta_weights.insert(0, self.learning_rate * np.outer(x, errors[0]) / self.batch_size)
        # print(f'Deltas: {delta_weights[0]}')
        delta_biases.insert(0, self.learning_rate * errors[0] / self.batch_size)
        
        return delta_weights, delta_biases
    

        
        
        return delta_weights, delta_biases
        
    def sum_deltas(self, g_weights, g_biases):
        for i in range(self.layers_num):
            self.g_weights[i] += np.array(g_weights[i])
            self.g_biases[i] += np.array(g_biases[i])

    def _build(self):
        self.layers = []

        layer = Layer_with_RMSProp(shape=(self.input_shape[1], self.neurons_num[0]), activation=self.activations[0])
        self.layers.append(layer)

        for i in range(1, self.layers_num):
            layer = Layer_with_RMSProp(shape=(self.layers[i - 1].shape[1], self.neurons_num[i]), activation=self.activations[i])
            self.layers.append(layer)

        # layer = Layer_with_momentum(shape=(self.layers[-1].shape[1], self.output_size), activation=self.activations[-1])
        # self.layers.append(layer)

    @staticmethod
    def convert_to_numpy_array(x_train, y_train, x_test, y_test):
        if x_test is None or y_test is None:
            return np.array(x_train), np.array(y_train), None, None
        else:
            return np.array(x_train), np.array(y_train), np.array(x_test), np.array(y_test)

    def print_results(self, epoch):
        print(f'Epoch number {epoch}/{self.n_epochs}')
        print(f'MSE on training set: {NN_with_RMSProp.mse(self.y_train, self.predict(self.x_train))}', end=' ')
        if self.x_test is not None:
            print(f'     , MSE on test set: {NN_with_RMSProp.mse(self.y_test, self.predict(self.x_test))}')

    def predict(self, input):
        self.recent_calculations = []
        self.recent_calculations.append(input)

        x = self.layers[0].calculate(input)
        self.recent_calculations.append(x)
        x = self.layers[0].activate(x)
        

        for i in range(1, self.layers_num):
            x = self.layers[i].calculate(x)
            self.recent_calculations.append(x)
            x = self.layers[i].activate(x)
            assert ~np.isnan(x).any() 

        return x
    
    def fit(self, x_train, y_train, beta, batch_size, n_epochs, learning_rate=0.003, x_test=None, y_test=None):
        """
        beta - współczynnik wygaszania
        """
        self.x_train, self.y_train, self.x_test, self.y_test = NN_with_RMSProp.convert_to_numpy_array(x_train, y_train, x_test, y_test)
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        
        n = y_train.shape[0]
        self.e_g2_weights = np.zeros((1,n)) # E[g^2]
        self.e_g2_biases = np.zeros((1,n)) # E[g^2]

        indices = np.arange(n)
        epoch = 1
        while(epoch <= self.n_epochs):
            np.random.shuffle(indices)
            mini_batches = np.split(indices, [i * batch_size for i in range(1, n // batch_size)])
            for batch in mini_batches:      
                for j in range(batch_size):
                    y_pred = self.predict(self.x_train[batch[j]])
                    if j == 0:
                        self.g_weights, self.g_biases = self.backpropagate(y_pred, self.y_train[batch[j]], self.x_train[batch[j]]) # inicjalizowanie g_weights i g_bias
                    else:   
                        temp_g_weights, temp_g_biases = self.backpropagate(y_pred, self.y_train[batch[j]], self.x_train[batch[j]])
                        self.sum_deltas(temp_g_weights, temp_g_biases)

#                 self.momentum_weights = self.delta_weights + [i*lambdaa for i in self.momentum_weights]
#                 self.momentum_biases = self.delta_biases + [i*lambdaa for i in self.momentum_biases]
        
                print(self.e_g2_weights.shape)
                self.e_g2_weights = np.array([beta*i for i in self.e_g2_weights] + [(1-beta)*i**2 for i in self.g_weights])
                self.e_g2_biases = np.array([beta*i for i in self.e_g2_biases] + [(1-beta)*i**2 for i in self.g_biases])

                self.update_layers()

            if epoch % 10 == 0:
                 self.print_results(epoch)
            epoch += 1


In [108]:
nn_with_rmsprop_sq = NN_with_RMSProp(input_shape=[len(sq_x_train), 1], output_size=1, layers_num=2, neurons_num=[100, 1], activations=['sigmoid', 'linear'])
nn_with_rmsprop_sq.fit(x_train=sq_x_train, y_train=sq_y_train, beta=0.7, batch_size=2, n_epochs=700, x_test=sq_x_test, y_test=sq_y_test, learning_rate=0.001)

(1, 100)
[-2.79718399e-03  1.13068074e-03  9.24006534e-04  2.90639781e-03
  6.21524081e-04  2.38092833e-03 -2.64894228e-03 -7.33249527e-04
 -2.30186553e-04  5.29864763e-04  1.33507079e-03 -5.75942616e-04
 -3.13293228e-03 -1.99170208e-03 -2.60668003e-03 -2.14367639e-04
  2.37337607e-03 -3.17795500e-04 -2.37436677e-03  1.45228892e-03
  6.33825980e-04 -2.97986062e-03 -2.45974909e-03  2.73374694e-04
 -1.55961466e-03 -1.40639901e-04  1.94787061e-03  2.51543471e-03
  2.62741897e-03  1.30806812e-04  2.27806144e-03 -2.32299657e-03
 -8.16273995e-04 -1.37928033e-03 -2.19746398e-03 -8.75721661e-04
 -2.87346001e-04 -1.80544709e-03  2.35048770e-03 -1.24232241e-04
 -2.30705085e-03 -1.77479880e-03 -7.79298925e-04  1.04125468e-03
 -2.23964092e-03 -1.44271184e-03  3.03563533e-03  2.88042653e-03
 -2.78477991e-03 -2.21431402e-03 -2.17750320e-03 -1.83036635e-03
  1.89597436e-03 -2.50428645e-03  2.00459547e-03  3.03756720e-04
 -2.30776065e-03 -1.25137790e-03 -1.48633240e-04  2.05896065e-03
  1.96715453e-04

  self.e_g2_weights = np.array([beta*i for i in self.e_g2_weights] + [(1-beta)*i**2 for i in self.g_weights])
  self.e_g2_biases = np.array([beta*i for i in self.e_g2_biases] + [(1-beta)*i**2 for i in self.g_biases])


ValueError: non-broadcastable output operand with shape (100,) doesn't match the broadcast shape (1,100)

In [20]:
4*3**2

36