Imports

In [65]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

Import data

In [66]:
data = pd.read_csv('iris.csv')

features = data.drop(columns=['class'])
labels = data['class']

X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.30, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

X_train = X_train.to_numpy()
X_val = X_val.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()
y_test = y_test.to_numpy()

Codificação do y

In [129]:
encoder = OneHotEncoder(sparse=False)
y_train_encoded = encoder.fit_transform(y_train.reshape(-1, 1))
y_val_encoded = encoder.transform(y_val.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.reshape(-1, 1))

"Currently, we do not usually use the sigmoid function for the hidden layers in MLPs and CNNs. Instead, we use ReLU or Leaky ReLU there."  
"We do not usually use the sigmoid function in the hidden layers because of the following drawbacks.
The sigmoid function has the vanishing gradient problem. This is also known as saturation of the gradients.
The sigmoid function has slow convergence.
Its outputs are not zero-centered. Therefore, it makes the optimization process harder.
This function is computationally expensive as an e^z term is included."

ReLU activation function

In [68]:
def ReLU_act(x, der=False):
    if der:
        return np.where(x > 0, 1, 0)
    else:
        return np.maximum(0, x)

"Inventing ReLU is one of the most important breakthroughs made in deep learning.
This function does not have the vanishing gradient problem.  
This function is computationally inexpensive. It is considered that the convergence of ReLU is 6 times faster than sigmoid and tanh functions."

Derivada da função ReLU

In [69]:
def relu_derivative(x):
    return np.where(x > 0, 1, 0)

SoftMax activtion function

In [70]:
def softmax_act(x):
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps, axis=0)

Duas camadas escondidas já são capazes de representar qualquer relação entre os dados, mesmo aquelas que não podem ser representadas por equações. Mais do que duas camadas escondidas só são necessárias em problemas ainda mais complexos como séries temporais e visão computacional, onde há uma certa inter-relação entre as dimensões que os dados contêm

Leaky ReLU activation function

In [71]:
def leaky_ReLU_act(x, alpha=0.01):
    return np.where(x > 0, x, alpha * x)

"If the input value is 0 greater than 0, the leaky ReLU function outputs the input as it is like the default ReLU function does. However, if the input is less than 0, the leaky ReLU function outputs a small negative value defined by αz (where α is a small constant value, usually 0.01 and z is the input value)."

In [72]:
n_camadas = [2, 3]
print ("Range do nº de camadas: ", n_camadas)

Range do nº de camadas:  [2, 3]


Abordagens:  
O número de neurônios escondidos deve estar entre o tamanho da camada de entrada e o da camada de saída.  
O número de neurônios escondidos deve ser 2/3 do tamanho da camada de entrada, mais o tamanho da camada de saída

In [73]:
n_neur = [0] * 2
n_neur [0] = int ((X_train.shape[1] + y_train_encoded.shape[1])/2)
n_neur [1] = int(((X_train.shape[1]*2)/3) + y_train_encoded.shape[1])
print ("Número de neurónios escondidos: ", n_neur)

Número de neurónios escondidos:  [4, 6]


"We should use a non-linear activation function in hidden layers. The choice is made by considering the performance of the model or convergence of the loss function. Start with the ReLU activation function and if you have a dying ReLU problem, try leaky ReLU."

ADAM Optimizer

In [332]:
class AdamOptim():
    def __init__(self, eta=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.eta = eta

    def update(self, t, w, dw):
        m_dw = np.zeros_like(w)  # Inicializa com zeros com a mesma forma de w
        v_dw = np.zeros_like(w)  # Inicializa com zeros com a mesma forma de w
        
        if dw.shape != w.shape:
            raise ValueError(f"Forma de gradiente {dw.shape} não corresponde à forma de pesos {w.shape}")

        # Cópia dos pesos originais antes da atualização
        updated_w = np.copy(w)

        # Momentum beta 1
        m_dw = self.beta1 * m_dw + (1 - self.beta1) * dw

        # RMS beta 2
        v_dw = self.beta2 * v_dw + (1 - self.beta2) * (dw ** 2)

        # Bias correction
        m_dw_corr = m_dw / (1 - self.beta1 ** t)
        v_dw_corr = v_dw / (1 - self.beta2 ** t)

        # Atualizar pesos
        updated_w -= self.eta * (m_dw_corr / (np.sqrt(v_dw_corr) + self.epsilon))
        return updated_w



Cálculo dos gradientes

In [333]:
def hypothesis(X, theta):
    return np.dot(X, theta)

In [334]:
def gradient(X, y, theta):
    h = hypothesis(X, theta)
    grad = np.dot(X.T, (h - y))
    return grad

In [335]:
def cost(X, y, theta):
    h = hypothesis(X, theta)
    J = np.dot((h - y).T, (h - y)) / 2
    return J[0]

In [336]:
def create_mini_batches(X, y, batch_size):
    mini_batches = []
    data = np.hstack((X, y))
    np.random.shuffle(data)
    n_minibatches = data.shape[0] // batch_size
    i = 0
    for i in range(n_minibatches + 1):
        mini_batch = data[i * batch_size:(i + 1) * batch_size, :]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
    if data.shape[0] % batch_size != 0:
        mini_batch = data[i * batch_size:data.shape[0]]
        X_mini = mini_batch[:, :-1]
        Y_mini = mini_batch[:, -1].reshape((-1, 1))
        mini_batches.append((X_mini, Y_mini))
    return mini_batches

In [337]:
def gradientDescent(X, y, learning_rate=0.001, batch_size=32):
    theta = np.zeros((X.shape[1], 1))
    error_list = []
    max_iters = 3
    adam = AdamOptim(learning_rate)
    t = 0
    for itr in range(max_iters):
        mini_batches = create_mini_batches(X, y, batch_size)
        for mini_batch in mini_batches:
            X_mini, y_mini = mini_batch
            grad = gradient(X_mini, y_mini, theta)
            t += 1
            theta = adam.update(t, theta, grad)
            error_list.append(cost(X_mini, y_mini, theta))
    return theta, error_list

In [342]:
def compute_gradients(x, y_true, parameters):
    # Obter pesos e biases
    W1, b1, W2, b2, W3, b3 = parameters['W1'], parameters['b1'], parameters['W2'], parameters['b2'], parameters['W3'], parameters['b3']
    
    # Forward pass
    z1 = np.dot(W1, x) + b1
    a1 = ReLU_act(z1)
    z2 = np.dot(W2, a1) + b2
    a2 = ReLU_act(z2)
    z3 = np.dot(W3, a2) + b3
    y_pred = softmax_act(z3)
    
    # Backward pass
    dz3 = y_pred - y_true
    dW3 = np.dot(dz3, a2.T)
    db3 = np.sum(dz3, axis=1, keepdims=True)
    da2 = np.dot(W3.T, dz3)
    dz2 = da2 * ReLU_act(z2, der=True)
    dW2 = np.dot(dz2, a1.T)
    db2 = np.sum(dz2, axis=1, keepdims=True)
    da1 = np.dot(W2.T, dz2)
    dz1 = da1 * ReLU_act(z1, der=True)
    dW1 = np.dot(dz1, x.T)
    db1 = np.sum(dz1, axis=1, keepdims=True)

    grads = {
        'dW1': dW1,
        'db1': db1,
        'dW2': dW2,
        'db2': db2,
        'dW3': dW3,
        'db3': db3
    }
    
    return grads

In [343]:
def check_convergence(w0, w1, threshold=1e-4):
    print(np.linalg.norm(w0 - w1))
    return np.linalg.norm(w0 - w1) < threshold

Inicialização dos pesos usando uma distribuição gaussiana

In [344]:
def initialize_weights(shape, mean=0.0, stddev=0.01):
    return np.random.normal(loc=mean, scale=stddev, size=shape)

MLP com 2 camadas

In [345]:
p = n_neur[0]
q = n_neur[1]
num_classes = 3

w1 = initialize_weights((p, X_train.shape[1]))
b1 = initialize_weights((p, 1)) 
w2 = initialize_weights((q, p))
b2 = initialize_weights((q, 1))  
wOut = initialize_weights((num_classes, q))
bOut = initialize_weights((num_classes, 1))

# Inicializar parâmetros do ADAM
adam1 = AdamOptim()
adam2 = AdamOptim()
adamOut = AdamOptim()
t = 1

# Inicializar parâmetros
parameters = {
    'W1': w1,
    'b1': b1,
    'W2': w2,
    'b2': b2,
    'W3': wOut,
    'b3': bOut
}

# Treinar o modelo
converged = False
while not converged:
    for i in range(0, X_train.shape[0]):
        x = X_train[i].reshape(-1, 1)
        y_true = y_train_encoded[i].reshape(-1, 1)

        grads = compute_gradients(x, y_true, parameters)

        w1_old = parameters['W1'].copy()
        parameters['W1'] = adam1.update(t, parameters['W1'], grads['dW1'])
        parameters['b1'] = adam1.update(t, parameters['b1'], grads['db1'])
        parameters['W2'] = adam2.update(t, parameters['W2'], grads['dW2'])
        parameters['b2'] = adam2.update(t, parameters['b2'], grads['db2'])
        parameters['W3'] = adamOut.update(t, parameters['W3'], grads['dW3'])
        parameters['b3'] = adamOut.update(t, parameters['b3'], grads['db3'])

        if check_convergence(parameters['W1'], w1_old):
            converged = True
            break
        else:
            t += 1

# Testar o modelo em X_test
class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

y_pred = np.array([np.argmax(softmax_act(np.dot(parameters['W3'], ReLU_act(np.dot(parameters['W2'], ReLU_act(np.dot(parameters['W1'], x.reshape(-1, 1)) + parameters['b1'])) + parameters['b2'])) + parameters['b3'])) for x in X_test])

y_pred_labels = [class_names[pred] for pred in y_pred]

balanced_acc = balanced_accuracy_score(y_test, y_pred_labels)
conf_matrix = confusion_matrix(y_test, y_pred_labels)

print(f"Balanced Accuracy: {balanced_acc:.2f}")
print("Matriz de Confusão:")
print(conf_matrix)

0.022360166577705315
0.016639329408191388
0.014282870422924728
0.0
Balanced Accuracy: 0.33
Matriz de Confusão:
[[ 0  6  0]
 [ 0 10  0]
 [ 0  7  0]]


MLP com 3 camadas

In [244]:
for j in range (2):
    # Número de perceptrons por camada
    p = n_neur[j]  # Layer 1
    q = n_neur[j]
    r = n_neur[j] # Layer 2
    num_classes = 3  # Número de classes na previsão multiclasse

    # Taxa de aprendizagem
    eta = 1/623

    # Inicializar pesos e biases
    w1 = 2 * np.random.rand(p, X_train.shape[1]) - 0.5  # Layer 1
    b1 = np.random.rand(p)
    w2 = 2 * np.random.rand(q, p) - 0.5  # Layer 2
    b2 = np.random.rand(q)
    w3 = 2 * np.random.rand(q, p) - 0.5  # Layer 2
    b3 = np.random.rand(q)
    wOut = 3 * np.random.rand(num_classes, r) - 0.5  # Output Layer
    bOut = np.random.rand(num_classes)

    mu = []
    vec_y = []
    class_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

    # Loop sobre os dados de treino
    for i in range(0, X_train.shape[0]):
        # Input data
        x = X_train[i]
        y_true = y_train_encoded[i]

        # Feedforward
        z1 = ReLU_act(np.dot(w1, x) + b1)  # output layer 1
        z2 = ReLU_act(np.dot(w2, z1) + b2)  # output layer 2
        z3 = ReLU_act(np.dot(w3, z2) + b3)
        y = softmax_act(np.dot(wOut, z3) + bOut)  # Output of the Output layer

        # Compute the output layer's error (cross-entropy)
        delta_Out = y - y_true

        # Backpropagate
        delta_3 = np.dot(delta_Out, wOut) * ReLU_act(z3, der=True)
        delta_2 = np.dot(delta_3, w3) * ReLU_act(z2, der=True) # Second Layer Error
        delta_1 = np.dot(delta_2, w2) * ReLU_act(z1, der=True)  # First Layer Error

        # Gradient descent
        wOut -= eta * np.outer(delta_Out, z3)  # Outer Layer
        bOut -= eta * delta_Out
        
        w3 -= eta * np.outer(delta_3, z2)  # Hidden Layer 2
        b3 -= eta * delta_3

        w2 -= eta * np.outer(delta_2, z1)  # Hidden Layer 2
        b2 -= eta * delta_2

        w1 -= eta * np.outer(delta_1, x)  # Hidden Layer 1
        b1 -= eta * delta_1

        # Computação da função de perda (cross-entropy)
        loss = -np.sum(y_true * np.log(y + 1e-9))
        mu.append(loss)
        vec_y.append(np.argmax(y))

    # Previsão final em X_test
    y_pred = np.array([np.argmax(softmax_act(np.dot(wOut, ReLU_act(np.dot(w2, ReLU_act(np.dot(w1, x) + b1)) + b2)) + bOut)) for x in X_test])

    # Converte previsões para strings
    y_pred_labels = [class_names[pred] for pred in y_pred]

    # Cálculo da balanced accuracy
    balanced_acc = balanced_accuracy_score(y_test, y_pred_labels)

    # Matriz de confusão
    conf_matrix = confusion_matrix(y_test, y_pred_labels)

    print(f"Balanced Accuracy: {balanced_acc:.2f}")
    print("Matriz de Confusão:")
    print(conf_matrix)

Balanced Accuracy: 0.33
Matriz de Confusão:
[[ 6  0  0]
 [10  0  0]
 [ 7  0  0]]
Balanced Accuracy: 0.33
Matriz de Confusão:
[[ 0  0  6]
 [ 0  0 10]
 [ 0  0  7]]


Referências:  
https://www.kaggle.com/code/androbomb/simple-nn-with-python-multi-layer-perceptron  
https://medium.com/ensina-ai/rede-neural-perceptron-multicamadas-f9de8471f1a9  
https://iaexpert.academy/2020/05/04/quantas-camadas-escondidas-e-quantos-neuronios-incluir-numa-rede-neural-artificial/
https://towardsdatascience.com/how-to-choose-the-right-activation-function-for-neural-networks-3941ff0e6f9c  
https://github.com/enochkan/building-from-scratch/blob/main/optimizers/adam-optimizer-from-scratch.ipynb  
https://www.geeksforgeeks.org/ml-mini-batch-gradient-descent-with-python/