# IMPORT LIBRARY

In [257]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
import pickle

# MODEL

In [258]:
class Layer:
    def __init__(self, n_neurons, init='zero', activation='linear', init_params=None, weights=None, biases=None):
        """
        Initialize a neural network layer
        
        Parameters:
        -----------
        n_neurons : int
            Number of neurons in the layer
        init : str, optional (default='zero')
            Initialization method. Options:
            - 'zero': Zero initialization
            - 'uniform': Uniform random distribution
            - 'normal': Normal (Gaussian) random distribution
        activation : str, optional (default='linear')
            Activation function to use
        init_params : dict, optional
            Additional parameters for initialization:
            - For 'uniform': 
                * 'lower': lower bound (default: -1)
                * 'upper': upper bound (default: 1)
                * 'seed': random seed (optional)
            - For 'normal':
                * 'mean': mean of distribution (default: 0)
                * 'variance': variance of distribution (default: 1)
                * 'seed': random seed (optional)
        """
        self.n_neurons = n_neurons
        self.init = init
        self.activation = activation
        self.init_params = init_params or {}
        
        if self.init == 'uniform':
            self.init_params.setdefault('lower', -1)
            self.init_params.setdefault('upper', 1)
        elif self.init == 'normal':
            self.init_params.setdefault('mean', 0)
            self.init_params.setdefault('variance', 1)
        
        self.weights = weights
        self.biases = biases
    
    def initialize(self, input_dim):
        if 'seed' in self.init_params:
            np.random.seed(self.init_params['seed'])

        self.biases = np.zeros((1, self.n_neurons))
        if self.init == 'zero':
            self.weights = np.zeros((input_dim, self.n_neurons))
        
        elif self.init == 'uniform':
            lower = self.init_params['lower']
            upper = self.init_params['upper']
            self.weights = np.random.uniform(low=lower, high=upper, size=(input_dim, self.n_neurons))
        
        elif self.init == 'normal':
            mean = self.init_params['mean']
            variance = self.init_params['variance']
            self.weights = np.random.normal(loc=mean, scale=np.sqrt(variance), size=(input_dim, self.n_neurons))
        
        elif self.init == 'xavier_uniform':
            limit = np.sqrt(6 / (input_dim + self.n_neurons))
            self.weights = np.random.uniform(-limit, limit, (input_dim, self.n_neurons))
        
        elif self.init == 'xavier_normal':
            std = np.sqrt(2 / (input_dim + self.n_neurons))
            self.weights = np.random.normal(0, std, (input_dim, self.n_neurons))
        
        elif self.init == 'he_normal':
            std = np.sqrt(2 / input_dim)
            self.weights = np.random.normal(0, std, (input_dim, self.n_neurons))
        
        elif self.init == 'he_uniform':
            limit = np.sqrt(6 / input_dim)
            self.weights = np.random.uniform(-limit, limit, (input_dim, self.n_neurons))
        
        else:
            raise ValueError(
                f"Unknown initialization type: {self.init}\n"
                "Available types: zero, uniform, normal, xavier_uniform, xavier_normal, he_normal, he_uniform"
            )
        
        return self
        
    def activate(self, x):
        if self.activation == 'linear':
            return x
        elif self.activation == 'relu':
            return np.maximum(0, x)
        elif self.activation == 'leaky_relu':
            return np.where(x > 0, x, 0.01 * x)
        elif self.activation == 'elu':
            return np.where(x > 0, x, 0.01 * (np.exp(x) - 1))
        elif self.activation == 'sigmoid':
            return 1 / (1 + np.exp(-x))
        elif self.activation == 'tanh':
            return np.tanh(x)
        elif self.activation == 'softmax':
            exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
            return exp_x / np.sum(exp_x, axis=1, keepdims=True)
        else:
            raise ValueError(
                f"Unknown activation function: {self.activation}\n"
                "/n Activation function available: linear, relu,sigmoid, tanh, softmax, elu, leaky_relu"
            )
    
    def activation_derivative(self, x):
        if self.activation == 'linear':
            return np.ones_like(x)
        elif self.activation == 'relu':
            return (x > 0).astype(float)
        elif self.activation == 'leaky_relu':
            return np.where(x > 0, 1, 0.01)
        elif self.activation == 'elu':
            alpha = 0.01
            return np.where(x > 0, 1, alpha * np.exp(x))
        elif self.activation == 'sigmoid':
            s = self.activate(x)
            return s * (1 - s)
        elif self.activation == 'tanh':
            t = np.tanh(x)
            return 1 - t**2
        elif self.activation == 'softmax':
            return 1
        else:
            raise ValueError(
                f"Unknown activation function: {self.activation}/n"
                "Activation function available: linear, relu,sigmoid, tanh, softmax, elu, leaky_relu"
            )


In [259]:
'''
Note:
- if y_true.ndim == 1: y_true = y_true.reshape(-1, 1) -> Kalo array 1D ubah jadi array 2D
- if y_true.shape != y_pred.shape: y_true = np.eye(y_pred.shape[1])[y_true.flatten()] -> handle kalo y_true contain class label bukan one hot 
'''

def mse(y_true, y_pred):
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    return np.mean((y_true - y_pred) ** 2)

def mse_derivative(y_true, y_pred):
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    return 2 * (y_pred - y_true) / y_true.size

def bce(y_true, y_pred):
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def bce_derivative(y_true, y_pred):
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return (y_pred - y_true) / (y_pred * (1 - y_pred))

def cce(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    if y_true.ndim == 1 or (y_true.ndim == 2 and y_true.shape[1] == 1):
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

def cce_derivative(y_true, y_pred):
    if y_true.ndim == 1 or (y_true.ndim == 2 and y_true.shape[1] == 1):
        y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
    
    return y_pred - y_true

In [260]:
class FFNN:
    def __init__(self, loss='mse', batch_size=32, learning_rate=0.01, epochs=100, verbose=1):
        self.layers = []
        self.learning_rate = learning_rate
        self.loss = loss
        self.batch_size = batch_size
        self.epochs = epochs
        self.verbose = verbose
        
        if loss == 'mse':
            self.loss_func = mse
            self.loss_derivative = mse_derivative
        elif loss == 'bce':
            self.loss_func = bce
            self.loss_derivative = bce_derivative
        elif loss == 'cce':
            self.loss_func = cce
            self.loss_derivative = cce_derivative
        else:
            raise ValueError(
                f"Unknown loss function: {loss}\n"
                "Loss function available: mse, bce, cce"
            )

    '''
    LOSS FUNCTION
    Note:
    - if y_true.ndim == 1: y_true = y_true.reshape(-1, 1) -> Kalo array 1D ubah jadi array 2D
    - if y_true.shape != y_pred.shape: y_true = np.eye(y_pred.shape[1])[y_true.flatten()] -> handle kalo y_true contain class label bukan one hot 
    '''

    def mse(y_true, y_pred):
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        return np.mean((y_true - y_pred) ** 2)

    def mse_derivative(y_true, y_pred):
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        return 2 * (y_pred - y_true) / y_true.size

    def bce(y_true, y_pred):
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

    def bce_derivative(y_true, y_pred):
        if y_true.ndim == 1:
            y_true = y_true.reshape(-1, 1)
        if y_true.shape != y_pred.shape:
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return (y_pred - y_true) / (y_pred * (1 - y_pred))

    def cce(y_true, y_pred):
        epsilon = 1e-15
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        if y_true.ndim == 1 or (y_true.ndim == 2 and y_true.shape[1] == 1):
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        
        return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

    def cce_derivative(y_true, y_pred):
        if y_true.ndim == 1 or (y_true.ndim == 2 and y_true.shape[1] == 1):
            y_true = np.eye(y_pred.shape[1])[y_true.flatten()]
        
        return y_pred - y_true
    
    def build_layers(self, *layer_args):
        self.layers = list(layer_args)
    
    def _initialize_network(self, input_dim):
        prev_dim = input_dim
        for layer in self.layers:
            layer.initialize(prev_dim)
            prev_dim = layer.n_neurons
    
    def forward(self, X):
        if X.ndim == 1:
            X = X.reshape(1, -1)
        
        activations = [X]
        zs = []
        
        for layer in self.layers:
            z = activations[-1] @ layer.weights + layer.biases
            a = layer.activate(z)
            zs.append(z)
            activations.append(a)
        
        return zs, activations
    
    def backward(self, X, y, zs, activations):
        m = X.shape[0]
        y_pred = activations[-1]
        
        delta = self.loss_derivative(y, y_pred)
        
        for i in reversed(range(len(self.layers))):
            z = zs[i]
            a_prev = activations[i]
            
            grad_w = (a_prev.T @ delta) / m
            grad_b = np.sum(delta, axis=0, keepdims=True) / m

            self.layers[i].weights -= self.learning_rate * grad_w
            self.layers[i].biases -= self.learning_rate * grad_b
            
            if i > 0:
                delta = (delta @ self.layers[i].weights.T) * self.layers[i - 1].activation_derivative(zs[i - 1])
    
    def fit(self, X, y, X_val=None, y_val=None):
        self._initialize_network(X.shape[1])
        
        # Kalo gaada val data -> trainnya displit 
        if X_val is None or y_val is None:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
        else:
            X_train, y_train = X, y
        
        if self.verbose == 0:
            for epoch in range(self.epochs):
                indices = np.arange(X.shape[0])
                np.random.shuffle(indices)
                
                for start in range(0, X.shape[0], self.batch_size):
                    end = start + self.batch_size
                    batch_indices = indices[start:end]
                    X_batch = X[batch_indices]
                    y_batch = y[batch_indices]
                    
                    zs, activations = self.forward(X_batch)
                    self.backward(X_batch, y_batch, zs, activations)
        
        elif self.verbose == 1:
            for epoch in range(self.epochs):
                epoch_progress = tqdm(total=X.shape[0], desc=f"Epoch {epoch+1}/{self.epochs}", unit='sample')
                
                indices = np.arange(X.shape[0])
                np.random.shuffle(indices)
                
                for start in range(0, X.shape[0], self.batch_size):
                    end = start + self.batch_size
                    batch_indices = indices[start:end]
                    X_batch = X[batch_indices]
                    y_batch = y[batch_indices]
                    
                    zs, activations = self.forward(X_batch)
                    self.backward(X_batch, y_batch, zs, activations)

                    epoch_progress.update(len(X_batch))
                
                epoch_progress.close()
                
                y_train_pred = self.forward(X)[1][-1]
                train_loss = self.loss_func(y, y_train_pred)
                
                y_val_pred = self.forward(X_val)[1][-1]
                val_loss = self.loss_func(y_val, y_val_pred)
                
                print(f"Epoch {epoch+1}/{self.epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        
        else:
            raise ValueError(
                f"Invalid verbose value: {self.verbose}\n"
                "Verbose options: 0 (no output), 1 (progress bar)"
            )
    
    def predict(self, X):
        _, activations = self.forward(X)
        if self.loss == 'cce':  
            return np.argmax(activations[-1], axis=1)
        return np.round(activations[-1])
    
    def save(self, filename):
        model_state = {
            'layers': self.layers,
            'learning_rate': self.learning_rate,
            'loss': self.loss,
            'batch_size': self.batch_size,
            'epochs': self.epochs,
            'verbose': self.verbose
        }
        with open(filename, 'wb') as f:
            pickle.dump(model_state, f)

        print(f"Model saved to {filename}")
    
    @classmethod
    def load(cls, filename):
        with open(filename, 'rb') as f:
            model_state = pickle.load(f)
        
        model = cls(
            loss=model_state['loss'],
            batch_size=model_state['batch_size'],
            learning_rate=model_state['learning_rate'],
            epochs=model_state['epochs'],
            verbose=model_state['verbose']
        )
        
        model.layers = model_state['layers']
        print(f"Model loaded from {filename}")
        return model

# LOAD DATA

In [261]:
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
y = y.astype(np.uint8)

In [262]:
X.shape

(70000, 784)

In [263]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, train_size=1000, test_size=10, stratify=y, random_state=42)

In [264]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, train_size=0.8, stratify=y_train_full, random_state=42)

In [265]:
X_train = X_train / 255.0
X_val = X_val / 255.0
X_test = X_test / 255.0

# TRAIN MODEL

In [266]:
model_ffnn = FFNN(
    loss='cce',
    batch_size=1,
    learning_rate=0.1,
    epochs=20,
    verbose=0
)

In [267]:
model_ffnn.build_layers(
    Layer(n_neurons=2, init='zero', activation='linear'),
    Layer(n_neurons=2, init='uniform', activation='relu', init_params={'lower': -0.5, 'upper': 0.5}),
    Layer(n_neurons=2, init='normal', activation='tanh', init_params={'mean': 0, 'variance': 0.01}),
    Layer(n_neurons=2, init='xavier_normal', activation='softmax'),
    Layer(n_neurons=2, init='xavier_uniform', activation='leaky_relu'),
    Layer(n_neurons=2, init='he_normal', activation='elu'),
    Layer(n_neurons=10, init='he_uniform', activation='sigmoid')
)

In [268]:
model_ffnn.fit(X_train, y_train)

  return np.where(x > 0, x, 0.01 * (np.exp(x) - 1))
  return 1 / (1 + np.exp(-x))
  return np.where(x > 0, 1, alpha * np.exp(x))
  delta = (delta @ self.layers[i].weights.T) * self.layers[i - 1].activation_derivative(zs[i - 1])
  delta = (delta @ self.layers[i].weights.T) * self.layers[i - 1].activation_derivative(zs[i - 1])


In [269]:
y_pred = model_ffnn.predict(X_test)
for h in y_pred:
    max_index = np.argmax(h)
    print(max_index)

0
0
0
0
0
0
0
0
0
0


In [270]:
model_ffnn.save('my_model.pkl')

Model saved to my_model.pkl


In [271]:
loaded_model = FFNN.load('my_model.pkl')

Model loaded from my_model.pkl


In [272]:
y_pred = loaded_model.predict(X_test)
for h in y_pred:
    max_index = np.argmax(h)
    print(max_index)

0
0
0
0
0
0
0
0
0
0
