In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
class Advance_Neural_Network:
    def __init__(self, layers, learning_rate = 0.001, epochs = 100, regularization = 0.01,
                optimizer = 'adam', activation = 'relu', batch_size = 16):
        self.layers = layers
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.regularization = regularization
        self.optimizer = optimizer
        self.activation = activation
        self.batch_size = batch_size

        #Initializing weights and biases using Xavier initialiation
        self.weights = []
        self.biases = []

        for i in range(len(layers) - 1):
            xavier = np.sqrt(2.0 / (layers[i] + layers[i + 1]))
            weight_matrix = np.random.normal(0, xavier, (layers[i], layers[i + 1]))
            bias_vector = np.zeros((1, layers[i + 1]))

            self.weights.append(weight_matrix)
            self.biases.append(bias_vector)

        if self.optimizer == 'momentum':
            self.velocity_w = [np.zeros_like(w) for w in self.weights] 
            self.velocity_b = [np.zeros_like(b) for b in self.biases] 
            self.momentum = 0.9

        elif self.optimizer == 'adam':
            self.m_w = [np.zeros_like(w) for w in self.weights] #First moment
            self.v_w = [np.zeros_like(w) for w in self.weights] #Second moment
            self.m_b = [np.zeros_like(b) for b in self.biases]
            self.v_b = [np.zeros_like(b) for b in self.biases]
            self.beta1 = 0.9
            self.beta2 = 0.999
            self.epsilon = 1e-8
            self.t = 0 # Time setp


        self.loss_history = []
        self.accuracy_history = []

    def _softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  # For numerical stability
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def activation_function(self, z, derivative = False):
        if self.activation == 'relu':
            if derivative:
                return np.where(z > 0, 1, 0)
            return np.maximum(0, z)

        elif self.activation == 'sigmoid':
            sigmoid = 1 / (1 + np.exp(-np.clip(z, -250, 250))) # Numerical stability
            if derivative:
                return sigmoid * (1 - sigmoid)
            return sigmoid

        elif self.activation == 'tanh':
            tanh = np.tanh(z)
            if derivative:
                return 1 - tanh**2
            return tanh


    def forward_propagation(self, X):
        activations = [X]
        z_values = []

        for i in range(len(self.weights)):
            # linear transformation
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            z_values.append(z)

            #Applying activationo fuction 
            if i == len(self.weights) - 1 and self.layers[-1] > 1:
                a = self._softmax(z)
            else:
                a = self.activation_function(z)

            activations.append(a)

        return activations, z_values

    def backward_propagation(self, X, y, activations, z_values):
        m = X.shape[0]
        gradients_w = []
        gradients_b = []

        # Output layer error
        if self.layers[-1] == 1: # Regression
            delta = activations[-1] - y.reshape(-1,1)
        else: # Classification
            delta = activations[-1] - y

        # back propagation through all layers
        for i in reversed(range(len(self.weights))):
            # Gradient with respect to weights
            grad_w = np.dot(activations[i].T, delta) / m

            # adding L2 regularizations
            grad_w += self.regularization * self.weights[i]

            # Gradient with respect to bias
            grad_b = np.mean(delta, axis = 0, keepdims = True)

            gradients_w.insert(0, grad_w)
            gradients_b.insert(0, grad_b)

            # error to previous layer
            if i > 0:
                # applyinnh the activation derivative
                delta = np.dot(delta, self.weights[i].T) * self.activation_function(z_values[i-1], derivative=True)


        return gradients_w, gradients_b

    def update_weights(self, gradients_w, gradients_b):
        if self.optimizer == 'sgd':
            # Standard gradient descent
            for i in range(len(self.weights)):
                self.weights[i] -= self.learning_rate * gradients_w[i]
                self.biases[i] -= self.learning_rate * gradients_b[i]

        elif self.optimizer == 'momentum':
            # momentum-based gradient descent
            for i in range(len(self.weights)):
                self.velocity_w[i] = self.momentum * self.velocity_w[i] + self.learning_rate * gradients_w[i]
                self.velocity_b[i] = self.momentum * self.velocity_b[i] + self.learning_rate * gradients_b[i]

                self.weights[i] -= self.velocity_w[i]
                self.biases[i] -= self.velocity_b[i]

        elif self.optimizer == 'adam':
            # Adam optimizer with bias correction
            self.t += 1

            for i in range(len(self.weights)):
                # update biased first moment estimate
                self.m_w[i] = self.beta1 * self.m_w[i] + (1 - self.beta1) * gradients_w[i]
                self.m_b[i] = self.beta1 * self.m_b[i] + (1 - self.beta1) * gradients_b[i]

                # update biased second raw moment estimate
                self.v_w[i] = self.beta2 * self.v_w[i] + (1 - self.beta2) * (gradients_w[i] **2)
                self.v_b[i] = self.beta2 * self.v_b[i] + (1 - self.beta2) * (gradients_b[i] **2)

                # compute bias-corrected first moment estimate
                m_w_corrected = self.m_w[i] / (1 - self.beta1 ** self.t)
                m_b_corrected = self.m_b[i] / (1 - self.beta1 ** self.t)

                # compute bias-corrected second raw moment estimate
                v_w_corrected = self.v_w[i] / (1 - self.beta2 ** self.t)
                v_b_corrected = self.v_b[i] / (1 - self.beta2 ** self.t)

                # Updating parameters
                self.weights[i] -= self.learning_rate * m_w_corrected / (np.sqrt(v_w_corrected) + self.epsilon)
                self.biases[i] -= self.learning_rate * m_b_corrected / (np.sqrt(v_b_corrected) + self.epsilon)

    def compute_loss(self, y_true, y_pred):
        m = y_true.shape[0]

        if self.layers[-1] == 1: #Regression
            mse_loss = np.mean((y_true.reshape(-1,1) - y_pred) ** 2) / 2
            
        else: # Classification
            y_pred_clipped = np.clip(y_pred, 1e-15, 1 - 1e-15)
            log_likelihood = -np.sum(y_true * np.log(y_pred_clipped))
            loss = log_likelihood / m
            mse_loss = loss

        # l2 regularization using
        l2_penalty = 0
        for weights in self.weights:
            l2_penalty += np.sum(weights ** 2)
        l2_penalty *= self.regularization / 2
        return mse_loss + l2_penalty

    def fit(self, X, y):
        print(f"Starting Neural network Training...")
        print(f"Architecture: {self.layers}")
        print(f"Optimizer: {self.optimizer}")
        print(f"Activation: {self.activation}")
        print(f"Learning rate: {self.learning_rate}")
        print(f"Regularization: {self.regularization}")
        print("-" * 50)

        for epoch in range(self.epochs):
            # Shuffling for SGD
            indices = np.random.permutation(X.shape[0])
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            epoch_loss = 0
            num_batchs = 0

            # Batch trainig
            for i in range(0, X.shape[0], self.batch_size):
                batch_X = X_shuffled[i:i + self.batch_size]
                batch_y = y_shuffled[i:i + self.batch_size]

                # Forward propagation
                activations, z_values = self.forward_propagation(batch_X)

                # Compute loss
                batch_loss = self.compute_loss(batch_y, activations[-1])
                epoch_loss += batch_loss
                num_batchs += 1

                # Backward propagation
                gradients_w, gradients_b = self.backward_propagation(batch_X, batch_y, activations, z_values)

                # Update weights
                self.update_weights(gradients_w, gradients_b)\

            # Training matrices
            avg_loss = epoch_loss / num_batchs
            self.loss_history.append(avg_loss)

            # Classification accuracy
            if self.layers[-1] > 1:
                predictions = self.predict(X)
                accuracy = np.mean(predictions == np.argmax(y, axis = 1))
                self.accuracy_history.append(accuracy)

            # Progess
            if epoch % 100 == 0:
                if self.layers[-1] > 1:
                    print(f"Epoch {epoch:4d} | Loss: {avg_loss:.6f} | Accuracy: {accuracy:.4f}")
                else:
                    print(f"Epoch {epoch:4d} | Loss: {avg_loss:.6f}")

        print("Training Ccompleted!")

    def predict(self, X):
        activations, _ = self.forward_propagation(X)

        if self.layers[-1] == 1:
            return activations[-1].flatten()
        else:
            return np.argmax(activations[-1], axis = 1)

    def predict_probability(self, X):
        activations, _ = self.forward_propagation(X)
        return activations[-1]

def demonstrate_ml_model():
    print("Model Demonstration")
    print("=" * 80)

    #  dataset for demonstration
    print("\n1. DATASET GENERATION AND PREPROCESSING")
    print("-" * 50)

    X, y = make_classification(n_samples=2000, n_features=20, n_informative=15,
                               n_redundant=5, n_classes=3, n_clusters_per_class=2,
                               random_state=42)
    print(f"Dataset shape: {X.shape}")
    print(f"Number of classes: {len(np.unique(y))}")
    print(f"Feature statistics:")
    print(f" Mean: {np.mean(X):.4f}")
    print(f" Std: {np.std(X):.4f}")
    print(f" Min: {np.min(X):.4f}")
    print(f" Max: {np.max(X):.4f}")

    #Standarization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print(f"\nAfter standardization:")
    print(f" Mean: {np.mean(X_scaled):.4f}")
    print(f" Std: {np.std(X_scaled):.4f}")

    y_onehot = np.eye(3)[y]

    #Train test splitt
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_onehot, test_size=0.2,
                                                        random_state=42, stratify=y)
    print(f"\nTraining set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")

    # Archi design
    print("\n2. Neural Network architecture design")
    print("-" * 50)

    architecture = [20, 64, 32, 16, 3]
    print(f"Network architecture: {architecture}")

    # parameters
    total_params = 0
    for i in range(len(architecture) - 1):
        layer_params = architecture[i] * architecture[i + 1] + architecture[i + 1] # weights and bias
        total_params += layer_params
        print(f"Layer {i+1}: {architecture[i]} → {architecture[i+1]} | Parameters: {layer_params}")

        print(f"Total parameters: {total_params}")

        # hypter params
        print(f"\nMathematical hyperparameter selection:")
        print(f"Learning rate: 0.001 (balanced convergence speed)")
        print(f"Batch size: 64 (computational efficiency vs. gradient accuracy)")
        print(f"Regularization: 0.001 (prevent overfitting)")
        print(f"Optimizer: Adam (adaptive learning rates)")

        # Creating and training multiple models for comparison
        print("\n3. Model Training and Optimization")
        print("-" * 50)
        models = {
            'Adam + ReLU': Advance_Neural_Network(
                layers = architecture, learning_rate=0.001,epochs=500, regularization=0.001,
                optimizer='adam', activation='relu', batch_size=64), 
            
            'SGD + Sigmoid': Advance_Neural_Network(
                layers = architecture, learning_rate=0.01,epochs=500,regularization=0.001,
                optimizer='sgd', activation='sigmoid',batch_size=64),
            
            'Momentum + Tanh': Advance_Neural_Network(
                layers = architecture, learning_rate=0.005, epochs=500,regularization=0.001,
                optimizer='momentum', activation='tanh', batch_size=64)
        }

    results = {}

    for name, model in models.items():
        print(f"\nTraining {name} model:")
        model.fit(X_train, y_train)
        
        # Evaluate model
        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        train_accuracy = np.mean(train_predictions == np.argmax(y_train, axis=1))
        test_accuracy = np.mean(test_predictions == np.argmax(y_test, axis=1))
        
        results[name] = {
            'model': model,
            'train_accuracy': train_accuracy,
            'test_accuracy': test_accuracy,
            'final_loss': model.loss_history[-1]
        }

        print(f"Final Results - Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")

        # Mathematical analysis of results
        print("\n4. Mathematical analysis and results")
        print("-" * 50)
        
        print("Model Performance Comparison:")
        print(f"{'Model':<20} {'Train Acc':<12} {'Test Acc':<12} {'Final Loss':<12} {'Overfitting':<12}")
        print("-" * 70)
        
        for name, result in results.items():
            overfitting = result['train_accuracy'] - result['test_accuracy']
            print(f"{name:<20} {result['train_accuracy']:<12.4f} {result['test_accuracy']:<12.4f} "
                  f"{result['final_loss']:<12.6f} {overfitting:<12.4f}")
        
        # Finding best model
        best_model_name = max(results.keys(), key=lambda k: results[k]['test_accuracy'])
        best_model = results[best_model_name]['model']
        
        print(f"\nBest performing model: {best_model_name}")
        print(f"Test accuracy: {results[best_model_name]['test_accuracy']:.4f}")
        
        # Detailed mathematical analysis
        print("\n5. Detailed Mathematical Analysis")
        print("-" * 50)
        
        # Analyze weight distributions
        print("Weight Distribution Analysis:")
        
    for i, weight_matrix in enumerate(best_model.weights):
        weight_mean = np.mean(weight_matrix)
        weight_std = np.std(weight_matrix)
        weight_min = np.min(weight_matrix)
        weight_max = np.max(weight_matrix)
    
        print(f"Layer {i+1} weights: μ={weight_mean:.4f}, σ={weight_std:.4f}, "
              f"min={weight_min:.4f}, max={weight_max:.4f}")
    
    # Gradient analysis (approximate)
    print(f"\nLoss convergence analysis:")
    initial_loss = best_model.loss_history[0]
    final_loss = best_model.loss_history[-1]
    loss_reduction = (initial_loss - final_loss) / initial_loss * 100
    
    print(f"Initial loss: {initial_loss:.6f}")
    print(f"Final loss: {final_loss:.6f}")
    print(f"Loss reduction: {loss_reduction:.2f}%")
    
    # Mathematical complexity analysis
    print(f"\nComputational Complexity Analysis:")
    print(f"Forward pass complexity = O({total_params})")
    print(f"Backward pass complexity = O({total_params})")
    print(f"Memory complexity: O({total_params}) parameters")
        
    return best_model, results
            
        # Execute the comprehensive demonstration
if __name__ == "__main__":
    model, results = demonstrate_ml_model()



Model Demonstration

1. DATASET GENERATION AND PREPROCESSING
--------------------------------------------------
Dataset shape: (2000, 20)
Number of classes: 3
Feature statistics:
 Mean: 0.0831
 Std: 3.8041
 Min: -25.1647
 Max: 31.8324

After standardization:
 Mean: -0.0000
 Std: 1.0000

Training set: (1600, 20)
Test set: (400, 20)

2. NEURAL NETWORK ARCHITECTURE DESIGN
--------------------------------------------------
Network architecture: [20, 64, 32, 16, 3]
Layer 1: 20 → 64 | Parameters: 1344
Total parameters: 1344

Mathematical hyperparameter selection:
Learning rate: 0.001 (balanced convergence speed)
Batch size: 64 (computational efficiency vs. gradient accuracy)
Regularization: 0.001 (prevent overfitting)
Optimizer: Adam (adaptive learning rates)

3. MODEL TRAINING AND Optimization
--------------------------------------------------
Layer 2: 64 → 32 | Parameters: 2080
Total parameters: 3424

Mathematical hyperparameter selection:
Learning rate: 0.001 (balanced convergence speed)
