In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
from typing import List
from scipy.stats import truncnorm

In [2]:
# The first CSV column is an unnamed running row index (it shows up as "Unnamed: 0"
# with values 0, 1, 2, ... in the preview). Load it as the DataFrame index so that
# later `df.drop("gt", axis=1)` calls do not hand it to the model as a feature.
df = pd.read_csv('dataset/AGE_PREDICTION.csv', index_col=0)
df.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_24,feat_25,feat_26,feat_27,feat_28,feat_29,feat_30,feat_31,feat_32,gt
0,2.686191,-0.989465,-0.920503,1.607427,-0.896248,1.118974,-0.969456,1.811707,2.560955,3.803463,...,-0.862891,-0.909545,-0.915361,-0.952061,-0.989461,1.911855,1.409705,2.303997,-0.98184,54
1,-0.887917,4.915272,-0.939446,-0.343677,-0.964685,-0.478649,4.342395,-0.33287,-0.768041,-0.815375,...,-0.939201,-0.965917,-0.969461,-0.934799,5.304822,0.93479,-0.410701,0.28469,4.919212,18
2,-0.923215,2.746968,-0.918085,0.047804,-0.908587,-0.451752,2.984481,0.535007,-0.591029,-0.324043,...,-0.809726,-0.929934,-0.891814,-0.881796,3.415373,1.044108,-0.442615,0.033648,2.628199,26
3,-0.268866,-0.408416,-0.935145,0.7318,-0.922438,0.221781,-0.046606,1.149634,0.592136,1.357959,...,-0.834968,-0.937475,-0.917737,-0.929519,-0.226282,1.608048,0.276169,1.246468,-0.363367,33
4,0.529231,-0.829957,-0.897425,0.92128,-0.865304,0.331018,-0.64494,1.296097,1.166863,2.036034,...,-0.775411,-0.881967,-0.864018,-0.908001,-0.784495,1.329586,0.547925,1.195395,-0.810089,35


In [None]:
class NeuralNetwork:
    """Fully connected feed-forward network for regression.

    Hidden layers apply the configured activation; the output layer is linear.
    Parameters are learned by gradient descent on a halved-MSE loss with an
    L2 penalty on the weights.
    """

    def __init__(self, neurons, act='tanh'):
        """
        neurons: list with the number of neurons in each layer (input included),
                      e.g. [512, 16, 8, 1]
        act: string, 'tanh' or 'sigmoid' (hidden-layer activation)
        """
        self.neurons = neurons
        self.layers = len(neurons) - 1  # number of layers (excluding input)
        self.act = act

        # Initialize weights (small Gaussian) and biases (zeros)
        self.weights = []
        self.biases = []
        for i in range(self.layers):
            in_dim = neurons[i]
            out_dim = neurons[i + 1]
            W = np.random.randn(in_dim, out_dim) * 0.01
            b = np.zeros((1, out_dim))
            self.weights.append(W)
            self.biases.append(b)

    def activation(self, x):
        """Apply the hidden-layer activation element-wise.

        Raises ValueError if `self.act` is neither 'tanh' nor 'sigmoid'.
        """
        if self.act == 'tanh':
            return np.tanh(x)
        elif self.act == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        else:
            raise ValueError("Unsupported activation function")

    def derivate_activation(self, x):
        """Derivative of the activation evaluated at the pre-activation `x`.

        Raises ValueError for an unsupported activation, mirroring `activation`.
        """
        if self.act == 'tanh':
            return 1-(self.activation(x)**2)
        elif self.act == 'sigmoid':
            return self.activation(x)*(1-self.activation(x))
        else:
            raise ValueError("Unsupported activation function")

    def forward(self, X):
        """
        Performs a forward pass through the network.
        X: input data of shape (n_samples, n_features)
        Returns:
            output: prediction of the linear output layer
            activations: [X, a_1, ..., a_last_hidden]
            zs: pre-activations of every layer (the last entry is `output`)
        """
        a = X
        zs = []
        activations = [X]  # Store input as the first activation

        for i in range(self.layers - 1):  # Hidden layers
            z = np.dot(a, self.weights[i]) + self.biases[i]
            a = self.activation(z)
            zs.append(z)
            activations.append(a)

        # Output layer (no activation)
        output = np.dot(a, self.weights[-1]) + self.biases[-1]
        zs.append(output)

        return output, activations, zs

    def get_params_vector(self):
        """Returns all weights and biases flattened into a single vector (W0, b0, W1, b1, ...)."""
        params = []
        for W, b in zip(self.weights, self.biases):
            params.append(W.flatten())
            params.append(b.flatten())
        return np.concatenate(params)

    def set_params_vector(self, flat_params):
        """Set the weights and biases from a flat parameter vector laid out as in `get_params_vector`."""
        idx = 0
        self.weights = []
        self.biases = []
        for i in range(self.layers):
            in_dim = self.neurons[i]
            out_dim = self.neurons[i + 1]
            w_size = in_dim * out_dim
            b_size = out_dim

            W = flat_params[idx:idx + w_size].reshape((in_dim, out_dim))
            idx += w_size
            b = flat_params[idx:idx + b_size].reshape(1, out_dim)
            idx += b_size

            self.weights.append(W)
            self.biases.append(b)

    def mse_loss(self, y_real, y_pred, alpha):
        """
        Halved mean squared error plus `alpha` times the sum of squared weights.

        y_real: true target values
        y_pred: predicted values
        alpha: regularization parameter
        """
        weights = self.weights

        loss = np.mean((y_real - y_pred)**2)/2
        reg = sum([np.sum(w**2) for w in weights])
        return loss + alpha * reg

    # mape and dloss_dypred take no `self`; they must be static so that calls
    # through an instance (e.g. `self.dloss_dypred(y, y_pred)`) do not pass the
    # instance as an extra positional argument.
    @staticmethod
    def mape(y_real, y_pred):
        """
        Mean absolute percentage error, in percent.

        y_real: true target values
        y_pred: predicted values
        """
        return np.mean(np.abs((y_real - y_pred) / (y_real + 1e-8))) * 100

    @staticmethod
    def dloss_dypred(y_real, y_pred):
        """Gradient of the halved-MSE data term with respect to the predictions."""
        return (y_pred-y_real)/y_real.shape[0]

    def backward(self, X, y, alpha = 0.5):
        """Back-propagate the regularized loss.

        Returns (grads_W, grads_b), one entry per layer. Weight gradients
        include the L2 term 2*alpha*W; biases are not regularized.
        """
        y_pred, activations, zs = self.forward(X)
        deltas = [None] * self.layers
        grads_W, grads_b = [], []

        # delta output layer
        dL_dy = self.dloss_dypred(y, y_pred)  # shape (batch, out_dim)
        deltas[-1] = dL_dy  # last layer: no activation

        # hidden layers backwards
        for layer in reversed(range(self.layers - 1)):
            da_dz = self.derivate_activation(zs[layer])
            deltas[layer] = (deltas[layer+1] @ self.weights[layer+1].T) * da_dz

        # grad W, b
        for layer in range(self.layers):
            a_prev = activations[layer]
            grad_W = (a_prev.T @ deltas[layer])+2*alpha*self.weights[layer]
            grads_W.append(grad_W)
            grads_b.append(np.sum(deltas[layer], axis=0))

        return grads_W, grads_b

    def update(self, grads_W, grads_b, lr=1e-3):
        """Apply one in-place gradient-descent step with learning rate `lr`."""
        for i in range(self.layers):
            self.weights[i] -= lr * grads_W[i]
            self.biases[i]  -= lr * grads_b[i]

    def _train_step(self, X_batch, y_batch, lr, alpha):
        """Run one update on the given samples and return the loss measured before the step."""
        y_pred, _, _ = self.forward(X_batch)
        loss = self.mse_loss(y_batch, y_pred, alpha)
        grads_W, grads_b = self.backward(X_batch, y_batch, alpha)
        self.update(grads_W, grads_b, lr)
        return loss

    def train(model, X, y, epochs, lr, method="Batch", alpha = 0.5, batch_size=None, X_val = None, y_val = None):
        """
        Train the network and print progress.

        model: the network instance (plays the role of `self`; name kept for compatibility)
        X, y: training inputs and targets, indexable by an integer array
        epochs: number of passes over the training data
        lr: learning rate
        method: "Batch" (one update per epoch on all data), "SGD" (one update per
                sample) or "Mini Batch" (one update per `batch_size` samples)
        alpha: L2 regularization parameter
        batch_size: required when method is "Mini Batch"
        X_val, y_val: optional validation data, evaluated after every epoch

        Raises ValueError for an unknown method or a missing batch_size.
        """
        if method not in ("Batch", "Mini Batch", "SGD"):
            raise ValueError('Select a method between: "Batch", "Mini Batch" and "SGD".')
        print(f"\nTraining started using '{method}' method for {epochs} epochs.")
        if method == "Mini Batch":
            if batch_size is None:
                raise ValueError("You must specify a batch_size for mini-batch training.")
            print(f"Mini-batch size: {batch_size}")

        final_loss = None  # training loss of the last completed epoch

        for epoch in range(epochs):
            print(f"Epoch {epoch + 1}/{epochs}")

            if method == "Batch":
                # Use the entire dataset
                final_loss = model._train_step(X, y, lr, alpha)
                print(f"[Batch] Loss: {final_loss:.6f}")

            elif method == "SGD":
                # Shuffle the data
                indices = np.random.permutation(len(X))
                X_shuffled = X[indices]
                y_shuffled = y[indices]

                total_loss = 0
                for i in range(len(X_shuffled)):
                    xi = X_shuffled[i].reshape(1, -1)
                    yi = y_shuffled[i].reshape(1, -1)

                    loss = model._train_step(xi, yi, lr, alpha)
                    total_loss += loss

                    print(f"[SGD] Sample {i+1}/{len(X_shuffled)} - Loss: {loss:.6f}")

                final_loss = total_loss / len(X_shuffled)
                print(f"[SGD] Average Loss: {final_loss:.6f}")

            elif method == "Mini Batch":
                # Shuffle the data
                indices = np.random.permutation(len(X))
                X_shuffled = X[indices]
                y_shuffled = y[indices]

                total_loss = 0
                num_batches = 0

                for i in range(0, len(X_shuffled), batch_size):
                    X_batch = X_shuffled[i:i+batch_size]
                    y_batch = y_shuffled[i:i+batch_size]

                    loss = model._train_step(X_batch, y_batch, lr, alpha)

                    total_loss += loss
                    num_batches += 1

                    print(f"[Mini-batch] Batch {num_batches} - Loss: {loss:.6f}")

                final_loss = total_loss / num_batches
                print(f"[Mini-batch] Average Loss: {final_loss:.6f}")

            if X_val is not None and y_val is not None:
                y_pred_val = model.forward(X_val)[0]
                val_loss = model.mse_loss(y_val, y_pred_val, alpha=alpha)
                print(f"Validation Loss: {val_loss:.4f}")

        if final_loss is None:
            # No epoch ran, so there is no loss to report
            print("Training completed!\nFinal training loss: n/a")
        else:
            print(f"Training completed!\nFinal training loss: {final_loss:.6f}")

## Prova

In [7]:
def mape(y_real, y_pred):
    """
    Mean absolute percentage error, expressed in percent.

    y_real: true target values
    y_pred: predicted values
    """
    # 1e-8 keeps the division finite when a target is exactly zero
    relative_error = (y_real - y_pred) / (y_real + 1e-8)
    return np.mean(np.abs(relative_error)) * 100
    
def dloss_dypred(y_real, y_pred):
    """Prediction error divided by the size of the first axis of `y_real`."""
    n_rows = y_real.shape[0]
    residual = y_pred - y_real
    return residual / n_rows

In [None]:
class NeuralNetwork:
    """Feed-forward regression network with a linear output layer, trained by gradient descent.

    Uses the module-level helpers `dloss_dypred` (in `backward`) and `mape` (in `train`).
    """

    def __init__(self, neurons: List, act='tanh'):
        """
        Create the layer parameters.

        neurons: layer widths from input to output, e.g. [512, 16, 8, 1]
        act: hidden-layer non-linearity, 'tanh' or 'sigmoid'
        """
        self.neurons = neurons
        self.layers = len(neurons) - 1  # one weight matrix per layer after the input
        self.act = act

        # Per layer: a small-Gaussian weight matrix and a zero bias row
        self.weights = []
        self.biases = []
        for k in range(self.layers):
            fan_in, fan_out = neurons[k], neurons[k + 1]
            W = np.random.randn(fan_in, fan_out) * 0.01
            b = np.zeros((1, fan_out))
            self.weights.append(W)
            self.biases.append(b)

    def activation(self, x):
        """Element-wise hidden-layer non-linearity; ValueError for an unknown name."""
        if self.act == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        if self.act == 'tanh':
            return np.tanh(x)
        raise ValueError("Unsupported activation function")

    def derivate_activation(self, a):
        """Activation derivative written in terms of the activation output `a`.

        Returns None when `self.act` is neither 'tanh' nor 'sigmoid'.
        """
        if self.act == 'sigmoid':
            return a * (1 - a)
        if self.act == 'tanh':
            return 1 - a**2
        return None

    def forward(self, X):
        """
        Run the network on X of shape (n_samples, n_features).

        Returns:
            output: linear output of the last layer
            activations: [X, hidden activation 1, ...]
            zs: pre-activations of every layer, `output` last
        """
        layer_inputs = [X]
        pre_activations = []
        signal = X

        # Hidden layers: affine map followed by the non-linearity
        for W, b in zip(self.weights[:-1], self.biases[:-1]):
            z = np.dot(signal, W) + b
            signal = self.activation(z)
            pre_activations.append(z)
            layer_inputs.append(signal)

        # Output layer stays linear
        output = np.dot(signal, self.weights[-1]) + self.biases[-1]
        pre_activations.append(output)

        return output, layer_inputs, pre_activations

    def get_params_vector(self):
        """Concatenate all parameters into one vector, ordered W0, b0, W1, b1, ..."""
        pieces = [
            part.flatten()
            for layer_params in zip(self.weights, self.biases)
            for part in layer_params
        ]
        return np.concatenate(pieces)

    def set_params_vector(self, flat_params):
        """Rebuild weights and biases from a vector laid out as in `get_params_vector`."""
        self.weights = []
        self.biases = []
        cursor = 0
        for k in range(self.layers):
            rows = self.neurons[k]
            cols = self.neurons[k + 1]
            w_end = cursor + rows * cols
            b_end = w_end + cols

            W = flat_params[cursor:w_end].reshape((rows, cols))
            b = flat_params[w_end:b_end].reshape(1, cols)
            cursor = b_end

            self.weights.append(W)
            self.biases.append(b)

    def mse_loss(self, y_real, y_pred, alpha):
        """
        Half the mean squared error plus alpha times the sum of squared weights.

        y_real: true target values
        y_pred: predicted values
        alpha: regularization parameter
        """
        data_term = np.mean((y_real - y_pred)**2)/2
        penalty = 0
        for W in self.weights:
            penalty = penalty + np.sum(W**2)
        return data_term + alpha * penalty

    def backward(self, X, y, alpha = 0.5):
        """Back-propagate through the network; returns (grads_W, grads_b), one entry per layer."""
        y_pred, activations, _ = self.forward(X)
        n_layers = self.layers
        deltas = [None] * n_layers

        # Output layer is linear, so its delta is the loss gradient itself
        deltas[-1] = dloss_dypred(y, y_pred)

        # Walk the hidden layers from last to first
        for k in range(n_layers - 2, -1, -1):
            slope = self.derivate_activation(activations[k + 1])
            deltas[k] = (deltas[k + 1] @ self.weights[k + 1].T) * slope

        # Weight gradients carry the L2 term; bias gradients do not
        grads_W = [
            (activations[k].T @ deltas[k]) + 2*alpha*self.weights[k]
            for k in range(n_layers)
        ]
        grads_b = [np.sum(deltas[k], axis=0, keepdims=True) for k in range(n_layers)]
        return grads_W, grads_b

    def update(self, grads_W, grads_b, lr=1e-3):
        """Take one in-place gradient-descent step of size `lr`."""
        for k in range(self.layers):
            weight_step = lr * grads_W[k]
            self.weights[k] -= weight_step
            bias_step = lr * grads_b[k]
            self.biases[k] -= bias_step

    def _fit_on(self, X_part, y_part, lr, alpha):
        """Measure the loss on the given samples, then apply one update; returns that loss."""
        prediction, _, _ = self.forward(X_part)
        loss = self.mse_loss(y_part, prediction, alpha)
        grads_W, grads_b = self.backward(X_part, y_part, alpha)
        self.update(grads_W, grads_b, lr)
        return loss

    def train(self, X, y, epochs, lr, method="Batch", alpha=0.5, batch_size=None, X_val=None, y_val=None):
        """
        Fit the network, printing the loss after every epoch.

        method: "Batch" (all data per update), "SGD" (one sample per update) or
                "Mini Batch" (`batch_size` samples per update)
        X_val, y_val: optional validation data scored after each epoch

        Raises ValueError for an unknown method or a missing batch_size.
        """
        if method not in ("Batch", "Mini Batch", "SGD"):
            raise ValueError('Select a method between: "Batch", "Mini Batch", and "SGD".')

        print(f"\nTraining started using '{method}' method for {epochs} epochs.")

        if batch_size is None and method == "Mini Batch":
            raise ValueError("You must specify a batch_size for mini-batch training.")

        has_validation = X_val is not None and y_val is not None

        for epoch_no in range(1, epochs + 1):
            print(f"\nEpoch {epoch_no}/{epochs}")

            if method == "Batch":
                loss = self._fit_on(X, y, lr, alpha)
                print(f"[Batch] Loss: {loss:.6f}")
            else:
                # Both stochastic modes visit the samples in a fresh random order
                order = np.random.permutation(len(X))
                X_shuffled, y_shuffled = X[order], y[order]
                step_losses = []

                if method == "SGD":
                    for row in range(len(X_shuffled)):
                        xi = X_shuffled[row].reshape(1, -1)
                        yi = y_shuffled[row].reshape(1, -1)
                        step_losses.append(self._fit_on(xi, yi, lr, alpha))

                    avg_loss = sum(step_losses) / len(X_shuffled)
                    print(f"[SGD] Average Loss: {avg_loss:.6f}")
                else:
                    for start in range(0, len(X_shuffled), batch_size):
                        stop = start + batch_size
                        step_losses.append(
                            self._fit_on(X_shuffled[start:stop], y_shuffled[start:stop], lr, alpha)
                        )

                    avg_loss = sum(step_losses) / len(step_losses)
                    print(f"[Mini-batch] Average Loss: {avg_loss:.6f}")

            # Validation
            if has_validation:
                y_pred_val, _, _ = self.forward(X_val)
                val_loss = self.mse_loss(y_val, y_pred_val, alpha)
                val_mape = mape(y_val, y_pred_val)
                print(f"Validation Loss: {val_loss:.6f} | Validation MAPE: {val_mape:.2f}%")

        print("Training completed!")


In [14]:
import numpy as np
import pandas as pd

# Take a random subset of 1000 rows
df_subset = df.sample(n=1000, random_state=42)

# Split features and target. "Unnamed: 0" looks like the row index saved with the
# CSV (0, 1, 2, ... in df.head()), not a feature, so it is dropped too when present.
X = df_subset.drop(columns=["gt", "Unnamed: 0"], errors="ignore").values
y = df_subset["gt"].values.reshape(-1, 1)

# Train/val split (80/20)
split = int(0.8 * len(X))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

# Seed NumPy so weight initialization and mini-batch shuffling are reproducible
np.random.seed(42)

# Define and train model
nn = NeuralNetwork([X.shape[1], 16, 8, 1], act="tanh")

nn.train(X_train, y_train, 
      lr=0.01, 
      epochs= 5000,
      method="Mini Batch", 
      batch_size=32, 
      alpha=0.001, 
      X_val=X_val, y_val=y_val)



Training started using 'Mini Batch' method for 5000 epochs.

Epoch 1/5000
[Mini-batch] Average Loss: 535.018783
Validation Loss: 187.068362 | Validation MAPE: 30.91%

Epoch 2/5000
[Mini-batch] Average Loss: 138.651510
Validation Loss: 119.830979 | Validation MAPE: 34.03%

Epoch 3/5000
[Mini-batch] Average Loss: 127.633110
Validation Loss: 120.849981 | Validation MAPE: 36.84%

Epoch 4/5000
[Mini-batch] Average Loss: 104.471568
Validation Loss: 109.521299 | Validation MAPE: 34.14%

Epoch 5/5000
[Mini-batch] Average Loss: 92.088698
Validation Loss: 87.391974 | Validation MAPE: 28.04%

Epoch 6/5000
[Mini-batch] Average Loss: 83.096730
Validation Loss: 77.917483 | Validation MAPE: 28.65%

Epoch 7/5000
[Mini-batch] Average Loss: 73.049479
Validation Loss: 62.965233 | Validation MAPE: 24.11%

Epoch 8/5000
[Mini-batch] Average Loss: 67.935506
Validation Loss: 63.607420 | Validation MAPE: 25.66%

Epoch 9/5000
[Mini-batch] Average Loss: 66.263740
Validation Loss: 65.647953 | Validation MAPE: 26

## Prova ADAM

In [3]:
def mape(y_real, y_pred):
    """
    Mean absolute percentage error, expressed in percent.

    y_real: true target values
    y_pred: predicted values
    """
    # 1e-8 keeps the division finite when a target is exactly zero
    relative_error = (y_real - y_pred) / (y_real + 1e-8)
    return np.mean(np.abs(relative_error)) * 100
    
def dloss_dypred(y_real, y_pred):
    """Prediction error divided by the size of the first axis of `y_real`."""
    n_rows = y_real.shape[0]
    residual = y_pred - y_real
    return residual / n_rows

def mae(y_real, y_pred):
    """
    Mean absolute error between predictions and targets.

    y_real: true target values
    y_pred: predicted values
    """
    absolute_gap = np.abs(y_pred - y_real)
    return np.mean(absolute_gap)

In [11]:
class NeuralNetwork:
    """Fully connected feed-forward network for regression with a linear output layer.

    Supports several weight-initialization schemes and either plain gradient
    descent or Adam for the parameter updates. `backward` and `train` rely on
    the module-level helpers `dloss_dypred`, `mape` and `mae`.
    """

    _INIT_METHODS = ("glorot_unif", "glorot_normal", "he_unif", "he_normal")
    _OPTIMIZERS = ("sgd", "adam")

    def __init__(self, neurons: List, act='tanh', init=None):
        """
        neurons: list with the number of neurons in each layer (input included),
                      e.g. [512, 16, 8, 1]
        act: string, 'tanh' or 'sigmoid' (hidden-layer activation)
        init: weight initialization: None (Gaussian with std 0.01), "glorot_unif",
              "glorot_normal", "he_unif" or "he_normal"

        Raises ValueError for an unknown `init`.
        """
        # Validate once, before any parameter is allocated
        if init is not None and init not in self._INIT_METHODS:
            raise ValueError('Select a initialization method between: None, "glorot_unif", "glorot_normal", "he_unif" and "he_normal".')

        self.neurons = neurons
        self.layers = len(neurons) - 1  # number of layers (excluding input)
        self.act = act

        # Initialize weights and biases
        self.weights = []
        self.biases = []
        for i in range(self.layers):
            in_dim = neurons[i]
            out_dim = neurons[i + 1]
            self.weights.append(self._init_weights(in_dim, out_dim, init))
            self.biases.append(np.zeros((1, out_dim)))

    @staticmethod
    def _init_weights(in_dim, out_dim, init):
        """Draw one (in_dim, out_dim) weight matrix with the requested scheme.

        Glorot schemes scale by fan_in + fan_out, He schemes by fan_in only:
          glorot_unif   U(-l, l),  l   = sqrt(6 / (fan_in + fan_out))
          glorot_normal N(0, s^2), s   = sqrt(2 / (fan_in + fan_out))
          he_unif       U(-l, l),  l   = sqrt(6 / fan_in)
          he_normal     N(0, s^2), s   = sqrt(2 / fan_in)
        The normal variants are truncated at +/- 2 standard deviations, so the
        realized spread is somewhat smaller than `s`.
        """
        shape = (in_dim, out_dim)
        if init is None:
            return np.random.randn(in_dim, out_dim) * 0.01
        if init == "glorot_unif":
            limit = np.sqrt(6.0 / (in_dim + out_dim))
            return np.random.uniform(-limit, limit, size=shape)
        if init == "glorot_normal":
            std = np.sqrt(2.0 / (in_dim + out_dim))
            return truncnorm.rvs(-2, 2, loc=0, scale=std, size=shape)
        if init == "he_unif":
            limit = np.sqrt(6.0 / in_dim)
            return np.random.uniform(-limit, limit, size=shape)
        # he_normal (the caller has already validated `init`)
        std = np.sqrt(2.0 / in_dim)
        return truncnorm.rvs(-2, 2, loc=0, scale=std, size=shape)

    def activation(self, x):
        """Apply the hidden-layer activation element-wise.

        Raises ValueError if `self.act` is neither 'tanh' nor 'sigmoid'.
        """
        if self.act == 'tanh':
            return np.tanh(x)
        elif self.act == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        else:
            raise ValueError("Unsupported activation function")

    def derivate_activation(self, a):
        """Derivative of the activation expressed through its output `a`.

        Raises ValueError for an unsupported activation, mirroring `activation`.
        """
        if self.act == 'tanh':
            return 1 - a**2
        elif self.act == 'sigmoid':
            return a * (1 - a)
        else:
            raise ValueError("Unsupported activation function")

    def forward(self, X):
        """
        Performs a forward pass through the network.
        X: input data of shape (n_samples, n_features)
        Returns:
            output: prediction of the linear output layer
            activations: [X, a_1, ..., a_last_hidden]
            zs: pre-activations of every layer (the last entry is `output`)
        """
        a = X
        zs = []
        activations = [X]  # Store input as the first activation

        for i in range(self.layers - 1):  # Hidden layers
            z = np.dot(a, self.weights[i]) + self.biases[i]
            a = self.activation(z)
            zs.append(z)
            activations.append(a)

        # Output layer (no activation)
        output = np.dot(a, self.weights[-1]) + self.biases[-1]
        zs.append(output)

        return output, activations, zs

    def get_params_vector(self):
        """Returns all weights and biases flattened into a single vector (W0, b0, W1, b1, ...)."""
        params = []
        for W, b in zip(self.weights, self.biases):
            params.append(W.flatten())
            params.append(b.flatten())
        return np.concatenate(params)

    def set_params_vector(self, flat_params):
        """Set the weights and biases from a flat parameter vector laid out as in `get_params_vector`."""
        # Work on a private copy: `update` modifies weights in place, and slices
        # of the caller's array would let training overwrite the caller's data.
        flat_params = np.array(flat_params)

        idx = 0
        self.weights = []
        self.biases = []
        for i in range(self.layers):
            in_dim = self.neurons[i]
            out_dim = self.neurons[i + 1]
            w_size = in_dim * out_dim
            b_size = out_dim

            W = flat_params[idx:idx + w_size].reshape((in_dim, out_dim))
            idx += w_size
            b = flat_params[idx:idx + b_size].reshape(1, out_dim)
            idx += b_size

            self.weights.append(W)
            self.biases.append(b)

    def mse_loss(self, y_real, y_pred, alpha):
        """
        Halved mean squared error plus `alpha` times the sum of squared weights.

        y_real: true target values
        y_pred: predicted values
        alpha: regularization parameter
        """
        weights = self.weights

        loss = np.mean((y_real - y_pred)**2)/2
        reg = sum([np.sum(w**2) for w in weights])
        return loss + alpha * reg

    def backward(self, X, y, alpha = 0.5):
        """Back-propagate the regularized loss.

        Returns (grads_W, grads_b), one entry per layer. Weight gradients
        include the L2 term 2*alpha*W; biases are not regularized.
        """
        y_pred, activations, zs = self.forward(X)
        deltas = [None] * self.layers
        grads_W, grads_b = [], []

        # delta output layer
        dL_dy = dloss_dypred(y, y_pred)  # shape (batch, out_dim)
        deltas[-1] = dL_dy  # last layer: no activation

        # hidden layers backwards
        for layer in reversed(range(self.layers - 1)):
            a = activations[layer + 1]
            da_dz = self.derivate_activation(a)
            deltas[layer] = (deltas[layer+1] @ self.weights[layer+1].T) * da_dz

        # grad W, b
        for layer in range(self.layers):
            a_prev = activations[layer]
            grad_W = (a_prev.T @ deltas[layer])+2*alpha*self.weights[layer]
            grads_W.append(grad_W)
            grads_b.append(np.sum(deltas[layer], axis=0, keepdims=True))

        return grads_W, grads_b

    def update(self, grads_W, grads_b, lr=1e-3, method="sgd", beta1=0.9, beta2=0.999, eps=1e-8, t=1):
        """
        Update weights and biases in place using either SGD or Adam.

        grads_W: list of gradients for weights
        grads_b: list of gradients for biases
        lr: learning rate
        method: "sgd" or "adam"
        beta1, beta2: Adam hyperparameters
        eps: numerical stability constant
        t: current timestep (needed for Adam bias correction)

        Raises ValueError for an unknown `method`.
        """
        if method not in self._OPTIMIZERS:
            raise ValueError("Unsupported update method. Choose 'sgd' or 'adam'.")

        if not hasattr(self, "m_weights"):
            # Initialize Adam moment estimates if not already done
            self.m_weights = [np.zeros_like(W) for W in self.weights]
            self.v_weights = [np.zeros_like(W) for W in self.weights]
            self.m_biases = [np.zeros_like(b) for b in self.biases]
            self.v_biases = [np.zeros_like(b) for b in self.biases]

        for i in range(self.layers):
            if method == "sgd":
                # Standard gradient descent
                self.weights[i] -= lr * grads_W[i]
                self.biases[i]  -= lr * grads_b[i]
                continue

            # Adam: exponential moving averages of the gradient and its square
            self.m_weights[i] = beta1 * self.m_weights[i] + (1 - beta1) * grads_W[i]
            self.v_weights[i] = beta2 * self.v_weights[i] + (1 - beta2) * (grads_W[i]**2)

            self.m_biases[i] = beta1 * self.m_biases[i] + (1 - beta1) * grads_b[i]
            self.v_biases[i] = beta2 * self.v_biases[i] + (1 - beta2) * (grads_b[i]**2)

            # Bias correction
            m_hat_W = self.m_weights[i] / (1 - beta1**t)
            v_hat_W = self.v_weights[i] / (1 - beta2**t)
            m_hat_b = self.m_biases[i] / (1 - beta1**t)
            v_hat_b = self.v_biases[i] / (1 - beta2**t)

            # Update parameters
            self.weights[i] -= lr * m_hat_W / (np.sqrt(v_hat_W) + eps)
            self.biases[i]  -= lr * m_hat_b / (np.sqrt(v_hat_b) + eps)

    def _train_step(self, X_batch, y_batch, lr, alpha, optimizer, beta1, beta2, eps):
        """Run one update on the given samples and return the loss measured before the step."""
        y_pred, _, _ = self.forward(X_batch)
        loss = self.mse_loss(y_batch, y_pred, alpha)

        grads_W, grads_b = self.backward(X_batch, y_batch, alpha)
        self.update(grads_W, grads_b, lr, method=optimizer, beta1=beta1, beta2=beta2, eps=eps, t=self._adam_t)
        if optimizer == "adam":
            # Count only the steps that actually fed the Adam moment estimates
            self._adam_t += 1
        return loss

    def train(self, X, y, epochs, lr, method="Batch", alpha=0.5, batch_size=None, 
          X_val=None, y_val=None, optimizer="sgd", beta1=0.9, beta2=0.999, eps=1e-8,
          early_stopping=None):
        """
        Train the network and print progress after every epoch.

        X, y: training inputs and targets, indexable by an integer array
        epochs: maximum number of passes over the training data
        lr: learning rate
        method: "Batch" (one update per epoch on all data), "SGD" (one update per
                sample) or "Mini Batch" (one update per `batch_size` samples)
        alpha: L2 regularization parameter
        batch_size: required when method is "Mini Batch"
        X_val, y_val: optional validation data, evaluated after every epoch
        optimizer: "sgd" or "adam"
        beta1, beta2, eps: Adam hyperparameters
        early_stopping: number of consecutive epochs without validation-loss
                        improvement after which training stops (None disables it;
                        only evaluated when validation data is given)

        Raises ValueError for an unknown method/optimizer or a missing batch_size.
        """
        if method not in ["Batch", "Mini Batch", "SGD"]:
            raise ValueError('Select a method between: "Batch", "Mini Batch", and "SGD".')

        if optimizer not in self._OPTIMIZERS:
            raise ValueError('Select optimizer between: "sgd" and "adam".')

        print(f"\nTraining started using '{method}' method for {epochs} epochs with {optimizer.upper()} optimizer.")

        if method == "Mini Batch" and batch_size is None:
            raise ValueError("You must specify a batch_size for mini-batch training.")

        # The Adam moments live on the instance and survive across train() calls,
        # so the bias-correction timestep must survive with them instead of
        # restarting at 1 on every call.
        if not hasattr(self, "_adam_t"):
            self._adam_t = 1

        step_args = (lr, alpha, optimizer, beta1, beta2, eps)

        train_loss = None   # training loss of the last completed epoch
        val_metrics = None  # (loss, MAPE, MAE) of the last validation pass

        best_val_loss = float("inf")
        patience_counter = 0

        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")

            if method == "Batch":
                train_loss = self._train_step(X, y, *step_args)
                print(f"[Batch] Loss: {train_loss:.6f}")

            elif method == "SGD":
                indices = np.random.permutation(len(X))
                X_shuffled, y_shuffled = X[indices], y[indices]

                total_loss = 0
                for i in range(len(X_shuffled)):
                    xi = X_shuffled[i].reshape(1, -1)
                    yi = y_shuffled[i].reshape(1, -1)
                    total_loss += self._train_step(xi, yi, *step_args)

                train_loss = total_loss / len(X_shuffled)
                print(f"[SGD] Average Loss: {train_loss:.6f}")

            elif method == "Mini Batch":
                indices = np.random.permutation(len(X))
                X_shuffled, y_shuffled = X[indices], y[indices]

                total_loss = 0
                num_batches = 0

                for i in range(0, len(X_shuffled), batch_size):
                    X_batch = X_shuffled[i:i+batch_size]
                    y_batch = y_shuffled[i:i+batch_size]

                    total_loss += self._train_step(X_batch, y_batch, *step_args)
                    num_batches += 1

                train_loss = total_loss / num_batches
                print(f"[Mini-batch] Average Loss: {train_loss:.6f}")

            # --- Validation & Early Stopping ---
            if X_val is not None and y_val is not None:
                y_pred_val, _, _ = self.forward(X_val)
                val_loss = self.mse_loss(y_val, y_pred_val, alpha)
                val_mape = mape(y_val, y_pred_val)
                val_mae = mae(y_val, y_pred_val)
                val_metrics = (val_loss, val_mape, val_mae)
                print(f"Validation Loss: {val_loss:.6f} | Validation MAPE: {val_mape:.2f}% | Validation MAE: {val_mae:.2f} years")

                if early_stopping is not None:
                    if val_loss < best_val_loss - 1e-6:  # improvement
                        best_val_loss = val_loss
                        patience_counter = 0
                    else:  # no improvement
                        patience_counter += 1
                        if patience_counter >= early_stopping:
                            print(f"\nEarly stopping triggered at epoch {epoch+1}!")
                            break

        # Report only the metrics that were actually computed: there is no
        # validation block without validation data, and nothing at all if no
        # epoch ran.
        summary = ["\nTraining completed!"]
        if train_loss is not None:
            summary.append(f"Final training loss: {train_loss:.6f}")
        if val_metrics is not None:
            val_loss, val_mape, val_mae = val_metrics
            summary.append(f"Final validation loss: {val_loss:.6f} ")
            summary.append(f"Final validation MAPE: {val_mape:.2f}% ")
            summary.append(f"Final validation MAE: {val_mae:.2f}")
        print("\n".join(summary))

In [19]:
# Split features and target. "Unnamed: 0" looks like the row index saved with the
# CSV (0, 1, 2, ... in df.head()), not a feature, so it is dropped too when present.
X = df.drop(columns=["gt", "Unnamed: 0"], errors="ignore").values
y = df["gt"].values.reshape(-1, 1)

# Train/val split (80/20)
# NOTE(review): rows are split in file order without shuffling — confirm the CSV
# is not sorted (e.g. by age), otherwise the validation set is not representative.
split = int(0.8 * len(X))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

# Seed NumPy so weight initialization and mini-batch shuffling are reproducible
np.random.seed(42)

# Define and train model
nn = NeuralNetwork([X.shape[1], 32, 32, 32, 32, 1], act="sigmoid", init = "he_unif")

nn.train(X_train, y_train, 
      lr=0.005, 
      epochs= 5000,
      method="Mini Batch", 
      batch_size=128, 
      alpha=0.005, 
      X_val=X_val, y_val=y_val, optimizer="adam", beta1=0.9, beta2=0.999, eps=1e-8, early_stopping=20)


Training started using 'Mini Batch' method for 5000 epochs with ADAM optimizer.

Epoch 1/5000
[Mini-batch] Average Loss: 544.334014
Validation Loss: 382.528939 | Validation MAPE: 51.49% | Validation MAE: 22.15 years

Epoch 2/5000
[Mini-batch] Average Loss: 274.949872
Validation Loss: 211.428729 | Validation MAPE: 32.00% | Validation MAE: 14.16 years

Epoch 3/5000
[Mini-batch] Average Loss: 168.763953
Validation Loss: 152.118469 | Validation MAPE: 35.69% | Validation MAE: 12.95 years

Epoch 4/5000
[Mini-batch] Average Loss: 141.161317
Validation Loss: 142.254798 | Validation MAPE: 39.85% | Validation MAE: 13.22 years

Epoch 5/5000
[Mini-batch] Average Loss: 113.309671
Validation Loss: 97.333664 | Validation MAPE: 27.40% | Validation MAE: 9.96 years

Epoch 6/5000
[Mini-batch] Average Loss: 85.879784
Validation Loss: 78.959053 | Validation MAPE: 24.85% | Validation MAE: 8.94 years

Epoch 7/5000
[Mini-batch] Average Loss: 73.048093
Validation Loss: 68.974621 | Validation MAPE: 24.32% | Va