In [1]:
import numpy as np

## Funções de Ativação e suas respectivas derivadas

In [2]:
def activation_function(X, func='sigmoid', derivative=False):
    
    if func == 'relu':
        
        f_x = np.where(f_x <= 0, 0, 1) if derivative else np.maximum(0, X)
        
    elif func == 'tanh':
        
        f_x = (1 - (X ** 2)) if derivative else np.tanh(X)
        
    elif func == 'sigmoid':
        
        f_x = X * (1 - X) if derivative else 1/(1 + np.exp(-X))
    
    return f_x

## Inicialização dos pesos e bias da rede

In [3]:
# Gera os pesos e bias para a inicializacao da rede
def init_layers(nn_architecture, seed = 1, debug = False):
    
    np.random.seed(seed)
    number_of_layers = len(nn_architecture)
    nn_params_values = {}
    
    for idx, layer in enumerate(nn_architecture):

        layer_idx = idx + 1
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        W_name = 'W' + str(layer_idx)
        b_name = 'b' + str(layer_idx)
        #nn_params_values[W_name] = np.random.randn(layer_output_size, layer_input_size) * 0.01
        #nn_params_values[b_name] = np.random.randn(layer_output_size, 1) * 0.01
        nn_params_values[W_name] = np.random.uniform(size=(layer_output_size, layer_input_size))
        nn_params_values[b_name] = np.random.uniform(size=(layer_output_size, 1))

        if (debug):
            print(f"{W_name}.shape: {nn_params_values[W_name].shape}")
            print(f"{b_name}.shape: {nn_params_values[b_name].shape}")
            print('-')
        
    return nn_params_values

In [4]:
# definindo as camadas da rede
nn_architecture = [
    {"input_dim": 120, "output_dim": 84, "activation": "relu"},
    {"input_dim": 84, "output_dim": 10, "activation": "softmax"}
]
print(nn_architecture)

[{'input_dim': 120, 'output_dim': 84, 'activation': 'relu'}, {'input_dim': 84, 'output_dim': 10, 'activation': 'softmax'}]


In [5]:
# gera os pesos e bias das camadas da rede aleatoriamente
nn_params_values = init_layers(nn_architecture, debug=True)

W1.shape: (84, 120)
b1.shape: (84, 1)
-
W2.shape: (10, 84)
b2.shape: (10, 1)
-


## Passagem "para frente" da rede

In [6]:
def nn_forward(X, nn_params_values, nn_architecture, debug=False):
    
    if (debug):
        print(f"X.shape: {X.shape}")
        print('-')
    
    # Guarda a passagem em cache para ser utilizado no backpropagation
    cache = {}
    
    A_curr = X
    cache["A0"] = A_curr
    
    # itera sobre as camadas da rede
    for idx, layer in enumerate(nn_architecture):
        
        # nomeamos as camadas a partir de 1
        layer_idx = idx + 1
        
        # valores de entrada passam a ser a saida da camada anterior
        A_prev = A_curr
        if (debug):
            print(f'A{str(idx)}.shape: {A_prev.shape}')
        
        # funcao de ativacao a ser executada na camada atual
        activ_function_curr = layer["activation"]
        
        # pesos da camada atual
        W_curr = nn_params_values["W" + str(layer_idx)]
        
        # bias da camada atual
        b_curr = nn_params_values["b" + str(layer_idx)]
        
        # executa uma passagem na camada atual
        Z_curr = np.dot(W_curr, A_prev) + b_curr
        A_curr = activation_function(Z_curr, func=activ_function_curr, derivative=False)
        
        # guarda a passagem em cache
        cache["W" + str(layer_idx)] = W_curr
        cache["b" + str(layer_idx)] = b_curr
        cache["Z" + str(layer_idx)] = Z_curr
        cache["A" + str(layer_idx)] = A_curr
    
    cache["A" + str(len(nn_architecture))] = A_curr
    if (debug):
        print(f'A{len(nn_architecture)}.shape: {A_curr.shape}')
        print('-')
    
    # retorna a saida da ultima camada da rede e um cache com os valores de cada camada
    return A_curr, cache

In [7]:
# Metodos para auxiliar o debug

def print_cache(cache, nn_architecture):
    
    print('------------ cache ------------')
    print(f"X/A0: \n{cache['A0']}")
    for layer_idx, layer in enumerate(nn_architecture, 1):
    
        print('-')
        print(f"W{layer_idx}: \n{cache['W' + str(layer_idx)]}")
        print(f"b{layer_idx}: \n{cache['b' + str(layer_idx)]}")
        print(f"Z{layer_idx}: \n{cache['Z' + str(layer_idx)]}")
        print(f"A{layer_idx}: \n{cache['A' + str(layer_idx)]}")
        print(f"Activation Function: {layer['activation']}")
    print('-------------------------------')

def print_params(nn_params_values, nn_architecture):
    
    print('------------ params ------------')
    for layer_idx, layer in enumerate(nn_architecture, 1):
        print('-')
        print(f"W{layer_idx}: \n{nn_params_values['W' + str(layer_idx)]}")
        print(f"b{layer_idx}: \n{nn_params_values['b' + str(layer_idx)]}")
    print('--------------------------------')
              
def print_delta(delta_values, nn_architecture):
    
    print('------------ delta ------------')
    for layer_idx, layer in enumerate(nn_architecture, 1):
        print('-')
        print(f"dZ{layer_idx}: \n{delta_values['dZ' + str(layer_idx)]}")
        print(f"dW{layer_idx}: \n{delta_values['dW' + str(layer_idx)]}")
        print(f"db{layer_idx}: \n{delta_values['db' + str(layer_idx)]}")
    print('--------------------------------')

## Execução de uma passagem na rede

In [8]:
nn_architecture = [
    {"input_dim": 2, "output_dim": 1, "activation": "sigmoid"},
    {"input_dim": 1, "output_dim": 1, "activation": "sigmoid"}
]
nn_params_values = init_layers(nn_architecture)
print(f'nn_architecture: \n{nn_architecture}')
print()
    
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]]).T
print(f'X.shape: {X.shape} \n{X}')
print()

y = np.array([[0, 
               1, 
               1, 
               0]])
print(f"y.shape: {y.shape} \n{y}")

y_hat, cache = nn_forward(X, nn_params_values, nn_architecture)
print()
print(f"y_hat: {y_hat.shape} \n{y_hat}")
print()
print_cache(cache, nn_architecture)

nn_architecture: 
[{'input_dim': 2, 'output_dim': 1, 'activation': 'sigmoid'}, {'input_dim': 1, 'output_dim': 1, 'activation': 'sigmoid'}]

X.shape: (2, 4) 
[[0 0 1 1]
 [0 1 0 1]]

y.shape: (1, 4) 
[[0 1 1 0]]

y_hat: (1, 4) 
[[ 0.57393661  0.58665059  0.58151636  0.59283222]]

------------ cache ------------
X/A0: 
[[0 0 1 1]
 [0 1 0 1]]
-
W1: 
[[ 0.417022    0.72032449]]
b1: 
[[ 0.00011437]]
Z1: 
[[  1.14374817e-04   7.20438868e-01   4.17136380e-01   1.13746087e+00]]
A1: 
[[ 0.50002859  0.67270365  0.60279781  0.75721315]]
Activation Function: sigmoid
-
W2: 
[[ 0.30233257]]
b2: 
[[ 0.14675589]]
Z2: 
[[ 0.29793082  0.35013612  0.3290013   0.37568609]]
A2: 
[[ 0.57393661  0.58665059  0.58151636  0.59283222]]
Activation Function: sigmoid
-------------------------------


## Funções de custo/perda

In [9]:
def mean_square_error(Y, Y_hat):
    """
    Calcula função de custo por meio do erro quadrado médio entre as previsões e as amostras de treinamento.
    
    Arguments:
    Y -- Conjunto de amostras de treinamento.
    Y_hat -- Previsões realizadas pela rede.
        
    Returns:
    mse -- Valor escalar indicando o quão distante as previsões foram das amostras de treinamento.
    """
    mse = np.square(np.subtract(Y, Y_hat)).mean()
    return mse


def cross_entropy(Y, Y_hat):
    """
    Calcula a função de custo por meio da entropia cruzada.

    Arguments:
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)
    Y_hat -- probability vector corresponding to your label predictions, shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    
    m = Y.shape[1]

    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
    
    cost = np.squeeze(cost)      # garante q o valor retornado sera escalar
    assert(cost.shape == ())
    
    return cost

## Atualização dos pessos (backpropagation)

In [10]:
def nn_backward_propagation(y_hat, y, cache, nn_architecture):

    dZ = y_hat - y 
    delta_values = {}
    
    for layer_idx, layer in reversed(list(enumerate(nn_architecture, len(nn_architecture)-1))):
        
        # rede de uma unica camada
        if (len(nn_architecture) == 1):
            layer_idx = 1
        
        A_prev = cache["A" + str(layer_idx-1)]
        m = A_prev.shape[1]
        
        A_curr = cache["A" + str(layer_idx)]
        Z_curr = cache["Z" + str(layer_idx)]
        W_curr = cache["W" + str(layer_idx)]
        b_curr = cache["b" + str(layer_idx)]
             
        # matriz de pesos
        dW = np.dot(dZ, A_prev.T) * (1 / m)
        
        # bias
        db = np.sum(dZ, axis=1, keepdims=True) * (1 / m)
        
        # valores de ajuste a ser descontado dos pesos
        delta_values["dZ" + str(layer_idx)] = dZ
        delta_values["dW" + str(layer_idx)] = dW
        delta_values["db" + str(layer_idx)] = db
        
        # funcao de ativacao
        dZ = np.dot(W_curr.T, dZ) * activation_function(A_prev, func='sigmoid', derivative=True)
    
    return delta_values

In [11]:
def nn_update_parameters(nn_params_values, delta_values, nn_architecture, learning_rate):

    for layer_idx, layer in enumerate(nn_architecture, 1):
        
        nn_params_values["W" + str(layer_idx)] -= learning_rate * delta_values["dW" + str(layer_idx)]        
        nn_params_values["b" + str(layer_idx)] -= learning_rate * delta_values["db" + str(layer_idx)]

    return nn_params_values;

## Treinamento da rede

In [12]:
def convert_prob_into_class(y_hat):
    probs_ = np.copy(y_hat)
    probs_[probs_ > 0.5] = 1
    probs_[probs_ <= 0.5] = 0
    return probs_

In [13]:
def get_accuracy_value(Y, Y_hat):
    Y_hat_ = convert_prob_into_class(Y_hat)
    return (Y_hat_ == Y).all(axis=0).mean()

In [14]:
def train(X, y, nn_architecture, epochs, learning_rate):
    
    nn_params_values = init_layers(nn_architecture)
    print()
    print_params(nn_params_values, nn_architecture)
    
    print(f"\nTreinando com taxa de aprendizado = {learning_rate} e {epochs} épocas:")

    for i in range(epochs):

        y_hat, cache = nn_forward(X, nn_params_values, nn_architecture)
        
        if ((i+1)% 5000) == 0:
            #error = cross_entropy(y, y_hat)
            error = mean_square_error(y, y_hat)
            acuracia = get_accuracy_value(y, y_hat)
            print("-> Iteração: {:05} - Erro: {:.5f} - Acurácia: {:.5f}".format((i+1), error, acuracia))
            if (error < 0.06):
                break
        
        delta_values = nn_backward_propagation(y_hat, y, cache, nn_architecture)
        nn_params_values = nn_update_parameters(nn_params_values, delta_values, nn_architecture, learning_rate)
    
    return nn_params_values

In [15]:
nn_architecture = [
    {"input_dim": 2, "output_dim": 3, "activation": "tanh"},
    {"input_dim": 3, "output_dim": 1, "activation": "sigmoid"}
]
print(f'nn_architecture: \n{nn_architecture}')
print()

X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]]).T
print(f'X.shape: {X.shape} \n{X}')
print()

y = np.array([[0, 
               1, 
               1, 
               0]])
print(f"y.shape: {y.shape} \n{y}")

epochs = 100000
learning_rate = 0.01
nn_params_values = train(X, y, nn_architecture, epochs, learning_rate)

y_hat, cache = nn_forward(X, nn_params_values, nn_architecture)
print()
print(f"y_hat: {y_hat.shape} \n{convert_prob_into_class(y_hat)}")
print()
print_cache(cache, nn_architecture)

nn_architecture: 
[{'input_dim': 2, 'output_dim': 3, 'activation': 'tanh'}, {'input_dim': 3, 'output_dim': 1, 'activation': 'sigmoid'}]

X.shape: (2, 4) 
[[0 0 1 1]
 [0 1 0 1]]

y.shape: (1, 4) 
[[0 1 1 0]]

------------ params ------------
-
W1: 
[[  4.17022005e-01   7.20324493e-01]
 [  1.14374817e-04   3.02332573e-01]
 [  1.46755891e-01   9.23385948e-02]]
b1: 
[[ 0.18626021]
 [ 0.34556073]
 [ 0.39676747]]
-
W2: 
[[ 0.53881673  0.41919451  0.6852195 ]]
b2: 
[[ 0.20445225]]
--------------------------------

Treinando com taxa de aprendizado = 0.01 e 100000 épocas:
-> Iteração: 05000 - Erro: 0.22644 - Acurácia: 0.75000
-> Iteração: 10000 - Erro: 0.18131 - Acurácia: 0.75000
-> Iteração: 15000 - Erro: 0.06075 - Acurácia: 1.00000
-> Iteração: 20000 - Erro: 0.01400 - Acurácia: 1.00000

y_hat: (1, 4) 
[[ 0.  1.  1.  0.]]

------------ cache ------------
X/A0: 
[[0 0 1 1]
 [0 1 0 1]]
-
W1: 
[[ 2.15406403  2.19923199]
 [ 6.15093223  6.05570446]
 [ 0.09240721 -0.26326205]]
b1: 
[[ 0.02537967]
 

## Execução com a rede treinada

In [16]:
X = np.array([[0, 1]]).T
print(f'X.shape: {X.shape} \n{X.T}')
print()

y = np.array([[1]])
print(f'y.shape: {y.shape} \n{y.T}')
print()

y_hat, cache = nn_forward(X, nn_params_values, nn_architecture)
y_hat_prob = convert_prob_into_class(y_hat)

print(f"y_hat: {y_hat.shape} \n{y_hat_prob.T}")
print()
print_cache(cache, nn_architecture)

X.shape: (2, 1) 
[[0 1]]

y.shape: (1, 1) 
[[1]]

y_hat: (1, 1) 
[[ 1.]]

------------ cache ------------
X/A0: 
[[0]
 [1]]
-
W1: 
[[ 2.15406403  2.19923199]
 [ 6.15093223  6.05570446]
 [ 0.09240721 -0.26326205]]
b1: 
[[ 0.02537967]
 [-7.06675931]
 [ 0.22235046]]
Z1: 
[[ 2.22461166]
 [-1.01105485]
 [-0.04091159]]
A1: 
[[ 0.97689476]
 [-0.76619796]
 [-0.04088878]]
Activation Function: tanh
-
W2: 
[[ 4.74632681 -2.74882614 -0.16767705]]
b2: 
[[-4.36008083]]
Z2: 
[[ 2.38958207]]
A2: 
[[ 0.91602943]]
Activation Function: sigmoid
-------------------------------
