# Zadanie 5


Celem ćwiczenia jest implementacja perceptronu wielowarstwowego oraz wybranego algorytmu optymalizacji gradientowej z algorytmem propagacji wstecznej.

Następnie należy wytrenować perceptron wielowarstwowy do klasyfikacji zbioru danych [MNIST](http://yann.lecun.com/exdb/mnist/). Zbiór MNIST dostępny jest w pakiecie `scikit-learn`.

Punktacja:
1. Implementacja propagacji do przodu (`forward`) [1 pkt]
2. Implementacja wstecznej propagacji (zademonstrowana na bramce XOR) (`backward`) [2 pkt]
3. Przeprowadzenie eksperymentów na zbiorze MNIST, w tym:
    1. Porównanie co najmniej dwóch architektur sieci [1 pkt]
    2. Przetestowanie każdej architektury na co najmniej 3 ziarnach [1 pkt]
    3. Wnioski [1.5 pkt]
4. Jakość kodu [0.5 pkt]

Polecane źródła - teoria + intuicja:
1. [Karpathy, CS231n Winter 2016: Lecture 4: Backpropagation, Neural Networks 1](https://www.youtube.com/watch?v=i94OvYb6noo&ab_channel=AndrejKarpathy)
2. [3Blue1Brown, Backpropagation calculus | Chapter 4, Deep learning](https://www.youtube.com/watch?v=tIeHLnjs5U8&t=4s&ab_channel=3Blue1Brown)


In [1]:
from abc import abstractmethod, ABC
from typing import List
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

# Load scikit-learn's bundled digits dataset; the experiment cells below read
# its `data` (features) and `target` (labels) attributes.
x = load_digits()

[[[ 0.  0.  5. ...  1.  0.  0.]]

 [[ 0.  0. 13. ... 15.  5.  0.]]

 [[ 0.  3. 15. ... 11.  8.  0.]]

 ...

 [[ 0.  4. 16. ... 16.  6.  0.]]

 [[ 0.  8. 16. ... 16.  8.  0.]]

 [[ 0.  1.  8. ... 12.  1.  0.]]]
[[[0]]

 [[1]]

 [[2]]

 ...

 [[8]]

 [[9]]

 [[8]]]


In [45]:
class Layer(ABC):
    """Abstract base class for a single layer of the neural network.

    Every layer carries its own learning rate (0.01 until changed) and has to
    implement both a forward and a backward pass.
    """

    def __init__(self) -> None:
        self._learning_rate = 0.01

    @property
    def learning_rate(self):
        """Learning rate currently configured for this layer."""
        return self._learning_rate

    @learning_rate.setter
    def learning_rate(self, learning_rate):
        # Only values strictly between 0 and 1 are accepted.
        assert learning_rate < 1, f"Given learning_rate={learning_rate} is larger than 1"
        assert learning_rate > 0, f"Given learning_rate={learning_rate} is smaller than 0"
        self._learning_rate = learning_rate

    @abstractmethod
    def forward(self, x: np.ndarray) -> np.ndarray:
        """Propagate x forward through the layer and return the result."""

    @abstractmethod
    def backward(self, output_error_derivative) -> np.ndarray:
        """Propagate output_error_derivative backward through the layer."""

class FullyConnected(Layer):
    """Dense layer computing ``x @ weights + bias``.

    Weights have shape (input_size, output_size) and the bias has shape
    (1, output_size); both are initialised with ``np.random.rand(...) - 0.5``.
    """

    def __init__(self, input_size: int, output_size: int) -> None:
        """Create the layer and randomly initialise its parameters.

        Args:
            input_size: Number of input features.
            output_size: Number of output features.
        """
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.weights = np.random.rand(self.input_size, self.output_size) - 0.5
        self.bias = np.random.rand(1, self.output_size) - 0.5

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return ``x @ weights + bias`` and remember x for the backward pass."""
        self.x = x
        return np.dot(x, self.weights) + self.bias

    def backward(self, output_error_derivative) -> np.ndarray:
        """Apply one gradient-descent step and return the input gradient.

        Must be called after `forward`, because it uses the input stored there.

        Args:
            output_error_derivative: Gradient of the loss with respect to this
                layer's output, one row per sample.

        Returns:
            Gradient of the loss with respect to this layer's input.
        """
        # Computed before the update so it is based on the weights that
        # actually produced the forward output.
        input_error = np.dot(output_error_derivative, self.weights.T)
        weight_error = np.dot(self.x.T, output_error_derivative)
        # The bias is shared by all rows, so its gradient is the sum over the
        # sample axis. For a single-row gradient this is the gradient itself;
        # for several rows it keeps the (1, output_size) bias shape instead of
        # failing on the in-place broadcast.
        bias_error = np.atleast_2d(output_error_derivative).sum(axis=0, keepdims=True)
        self.weights -= self.learning_rate * weight_error
        self.bias -= self.learning_rate * bias_error
        return input_error


class Tanh(Layer):
    """Activation layer applying the hyperbolic tangent element-wise."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: np.ndarray) -> np.ndarray:
        """Return tanh(x) and remember x for the backward pass."""
        self.x = x
        activated = np.tanh(x)
        return activated

    def backward(self, output_error) -> np.ndarray:
        """Scale output_error by the tanh derivative at the stored input."""
        # d/dx tanh(x) = 1 - tanh(x)^2
        tanh_of_input = np.tanh(self.x)
        local_gradient = 1 - tanh_of_input ** 2
        return local_gradient * output_error

class Loss:
    """Pair a loss function with its derivative.

    Both callables take two positional arguments; `loss` and
    `loss_derivative` forward their arguments to them in the same order.
    """

    def __init__(self, loss_function: callable, loss_function_derivative: callable) -> None:
        self.loss_function, self.loss_function_derivative = (
            loss_function,
            loss_function_derivative,
        )

    def loss(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Evaluate the wrapped loss function on (x, y)."""
        value = self.loss_function(x, y)
        return value

    def loss_derivative(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Evaluate the wrapped loss derivative on (x, y)."""
        gradient = self.loss_function_derivative(x, y)
        return gradient

class Network:
    """Sequential stack of layers trained sample by sample with backpropagation.

    Call `compile` before `fit`: `fit` reads `self.loss`, which only
    `compile` sets.
    """

    def __init__(self, layers: List[Layer], learning_rate: float) -> None:
        """Store the layers and push the learning rate to each of them.

        Args:
            layers: Layers applied in order during the forward pass.
            learning_rate: Learning rate assigned to every layer.
        """
        self.layers = layers
        self.learning_rate = learning_rate
        self.set_learning_rate(learning_rate)

    def compile(self) -> None:
        """Set `self.loss` to mean squared error together with its derivative."""
        self.loss = Loss(
            lambda y, y_pred: np.mean(np.power(y - y_pred, 2)),
            lambda y, y_pred: 2 * (y_pred - y) / y.size,
        )

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward propagation of x through all layers; returns the last layer's output."""
        output = x
        for layer in self.layers:
            output = layer.forward(output)
        return output

    def predict(self, input_data):
        """Return a list with the network output for every sample in input_data."""
        return [self(sample) for sample in input_data]

    def set_learning_rate(self, learning_rate):
        """Set the learning rate on the network and on every layer.

        The value is written straight to each layer's private attribute, so
        the range check in the layer's `learning_rate` setter is not applied.
        """
        self.learning_rate = learning_rate
        for layer in self.layers:
            layer._learning_rate = learning_rate

    def _train_on_sample(self, x, y):
        """Run one forward/backward pass on a single sample and return its loss."""
        output = self(x)
        sample_loss = self.loss.loss(y, output)
        error = self.loss.loss_derivative(y, output)
        for layer in reversed(self.layers):
            error = layer.backward(error)
        return sample_loss

    def fit(self,
            x_train: np.ndarray,
            y_train: np.ndarray,
            epochs: int,
            verbose: int = 0) -> None:
        """Fit the network to the training data, one sample at a time.

        Args:
            x_train: Training inputs, indexed by sample.
            y_train: Training targets, indexed like x_train.
            epochs: Number of passes over the training data.
            verbose: When truthy, print the mean loss after every epoch.
        """
        samples = len(x_train)
        if samples == 0:
            # Nothing to train on; also avoids dividing the epoch loss by zero.
            return

        for epoch in range(epochs):
            total_loss = 0
            # Index both arrays by position: a y_train shorter than x_train
            # then fails loudly instead of being silently truncated by zip().
            for j in range(samples):
                total_loss += self._train_on_sample(x_train[j], y_train[j])
            mean_loss = total_loss / samples
            if verbose:
                print(f'Epoch {epoch+1}/{epochs} error={mean_loss}')

In [35]:
# XOR demo: a 2-3-1 network with tanh activations, trained on the four
# XOR input/target pairs.
layers = [FullyConnected(2, 3), Tanh(), FullyConnected(3, 1), Tanh()]
net = Network(layers, 0.1)
net.compile()

# Each sample is a 1x2 row vector and each target a 1x1 array.
x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).reshape(4, 1, 2)
y_train = np.array([0, 1, 1, 0]).reshape(4, 1, 1)

net.fit(x_train, y_train, epochs=1000)
print(net.predict(x_train))
print(y_train)

[array([[0.00109358]]), array([[0.97602066]]), array([[0.97576804]]), array([[-0.00188589]])]
[[[0]]

 [[1]]

 [[1]]

 [[0]]]


# Eksperymenty

In [50]:
# Architecture 64-150-10 with tanh activations, evaluated on three seeds.
for seed in (123, 1, 20):
    # Seed NumPy's global RNG as well: FullyConnected draws its initial
    # weights from np.random, so seeding only the split would leave the run
    # non-reproducible.
    np.random.seed(seed)

    layers = [FullyConnected(64, 150), Tanh(), FullyConnected(150, 10), Tanh()]
    net = Network(layers, 0.01)
    net.compile()

    x_train, x_test, y_train, y_test = train_test_split(
        x.data, x.target, test_size=0.1, random_state=seed
    )
    # One 1x64 row vector per sample; -1 lets NumPy infer the sample count
    # instead of hard-coding it.
    x_train = np.reshape(x_train, (-1, 1, 64))
    # One-hot encode the labels: row k of the 10x10 identity matrix is the
    # target vector for digit k.
    y_train = np.eye(10)[y_train].reshape(-1, 1, 10)
    net.fit(x_train, y_train, epochs=100)

    out = net.predict(x_test)
    # A prediction counts as correct when the true class holds the largest output.
    wyn = sum(p[0][q] == p[0].max() for p, q in zip(out, y_test))
    print('Skuteczność: ', wyn/len(out) * 100, '%', sep='')



Skuteczność: 92.22222222222223%
Skuteczność: 90.0%
Skuteczność: 94.44444444444444%


In [24]:
# Architecture 64-100-50-10 with tanh activations, learning rate 0.01,
# trained on the first 1000 training samples.
layers = [FullyConnected(64, 100), Tanh(), FullyConnected(100, 50), Tanh(), FullyConnected(50, 10), Tanh()]
net = Network(layers, 0.01)
net.compile()
# NOTE: no random_state is passed, so the split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x.data, x.target, test_size=0.1)
# One 1x64 row vector per sample; -1 lets NumPy infer the sample count
# instead of hard-coding it.
x_train = np.reshape(x_train, (-1, 1, 64))
# One-hot encode the labels: row k of the 10x10 identity matrix is the
# target vector for digit k.
y_train = np.eye(10)[y_train].reshape(-1, 1, 10)
net.fit(x_train[:1000], y_train[:1000], epochs=1000)

out = net.predict(x_test)
# A prediction counts as correct when the true class holds the largest output.
wyn = sum(p[0][q] == p[0].max() for p, q in zip(out, y_test))
print('Skuteczność: ', wyn/len(out) * 100, '%', sep='')

Skuteczność: 88.88888888888889%


In [11]:
# Architecture 64-100-50-10 with tanh activations, learning rate 0.1,
# trained on the first 1000 training samples of the seed-123 split.
layers = [FullyConnected(64, 100), Tanh(), FullyConnected(100, 50), Tanh(), FullyConnected(50, 10), Tanh()]
net = Network(layers, 0.1)
net.compile()
x_train, x_test, y_train, y_test = train_test_split(x.data, x.target, test_size=0.1, random_state=123)
# One 1x64 row vector per sample; -1 lets NumPy infer the sample count
# instead of hard-coding it.
x_train = np.reshape(x_train, (-1, 1, 64))
# One-hot encode the labels: row k of the 10x10 identity matrix is the
# target vector for digit k.
y_train = np.eye(10)[y_train].reshape(-1, 1, 10)
net.fit(x_train[:1000], y_train[:1000], epochs=1000)

out = net.predict(x_test)
# A prediction counts as correct when the true class holds the largest output.
wyn = sum(p[0][q] == p[0].max() for p, q in zip(out, y_test))
print('Skuteczność: ', wyn/len(out) * 100, '%', sep='')

[array([[ 4.32639328e-04, -3.04128090e-03,  4.63954720e-04,
         9.92853246e-01, -1.51637254e-03, -5.54908817e-04,
        -2.34574016e-03, -4.93397222e-04, -4.41819309e-03,
         3.29037665e-03]]), array([[ 3.38696202e-04, -2.92779687e-03, -2.91461444e-03,
         9.92908310e-01, -1.65544773e-03, -4.82154823e-04,
        -2.49482019e-03, -6.24302269e-04, -4.24136988e-03,
         4.59882381e-03]]), array([[-1.07016128e-03, -2.71192853e-03,  1.14992834e-03,
        -1.68043644e-03,  9.93623857e-01, -5.64121933e-04,
        -1.53988023e-03, -7.78445833e-04, -8.15485088e-03,
        -4.05517856e-03]]), array([[ 7.39514810e-04,  5.43775597e-02,  9.61181835e-04,
        -1.87244688e-03,  9.93595596e-01, -6.67656243e-04,
        -1.82258821e-03, -1.61019212e-03, -5.11186206e-02,
        -3.77840903e-03]]), array([[ 4.98460489e-04,  9.91479050e-01, -1.85022142e-04,
        -1.55760804e-03,  6.56182558e-03,  1.61597512e-03,
         3.29143792e-02, -4.78460335e-03, -4.78594252e-02,
  

# Wnioski

Skuteczność uczenia zależy od wielu czynników, m.in. od współczynnika uczenia (learning_rate), liczby epok, rozmiaru sieci oraz rozmiaru zbioru danych, na którym sieć się uczy.