##**Random Numbers**

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

class MultiLayerANN:
    def __init__(self, input_size, hidden_sizes, output_size, learning_rate=0.01):
        # Initialize parameters
        self.learning_rate = learning_rate
        self.hidden_sizes = hidden_sizes
        self.weights = []
        self.biases = []

        # Initialize weights and biases for each layer
        layer_sizes = [input_size] + hidden_sizes + [output_size]
        for i in range(len(layer_sizes) - 1):
            weight = np.random.randn(layer_sizes[i + 1], layer_sizes[i]) * 0.01
            bias = np.zeros((layer_sizes[i + 1], 1))
            self.weights.append(weight)
            self.biases.append(bias)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, z):
        sig = self.sigmoid(z)
        return sig * (1 - sig)

    def softmax(self, z):
        exp_z = np.exp(z - np.max(z))
        return exp_z / exp_z.sum(axis=0, keepdims=True)

    def forward_propagation(self, X):
        self.A = [X]
        for i in range(len(self.weights) - 1):
            Z = np.dot(self.weights[i], self.A[-1]) + self.biases[i]
            A = self.sigmoid(Z)
            self.A.append(A)

        # Output layer
        Z_out = np.dot(self.weights[-1], self.A[-1]) + self.biases[-1]
        A_out = self.softmax(Z_out)
        self.A.append(A_out)

        return A_out

    def compute_cost(self, Y_hat, Y):
        m = Y.shape[1]
        cost = -np.sum(Y * np.log(Y_hat + 1e-8)) / m
        return np.squeeze(cost)

    def backward_propagation(self, X, Y):
        m = X.shape[1]

        # Output layer gradient
        dZ = self.A[-1] - Y  # Output layer
        self.dW = [1/m * np.dot(dZ, self.A[-2].T)]
        self.db = [1/m * np.sum(dZ, axis=1, keepdims=True)]

        # Backpropagate for each hidden layer
        for i in range(len(self.weights) - 2, -1, -1):  # Start from second-to-last layer
            dA = np.dot(self.weights[i + 1].T, dZ)  # Backpropagate the error
            dZ = dA * self.sigmoid_derivative(self.A[i + 1])  # Apply the derivative of the activation function
            self.dW.insert(0, 1/m * np.dot(dZ, self.A[i].T))  # Insert gradients at the start of the list
            self.db.insert(0, 1/m * np.sum(dZ, axis=1, keepdims=True))

        # Update weights and biases
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * self.dW[i]
            self.biases[i] -= self.learning_rate * self.db[i]

    def train(self, X_train, Y_train, X_val, Y_val, X_test, Y_test, epochs=10):
        for epoch in range(1, epochs + 1):  # Start epoch from 1 to print like 1, 2, 3,...
            Y_hat_train = self.forward_propagation(X_train)
            cost = self.compute_cost(Y_hat_train, Y_train)
            self.backward_propagation(X_train, Y_train)

            # Calculate training accuracy
            predictions_train = self.predict(X_train)
            accuracy_train = np.mean(predictions_train == np.argmax(Y_train, axis=0)) * 100

            # Calculate validation accuracy
            predictions_val = self.predict(X_val)
            accuracy_val = np.mean(predictions_val == np.argmax(Y_val, axis=0)) * 100

            # Print the accuracies and cost at each epoch (but not final test accuracy)
            print(f"Epoch {epoch}, Cost: {cost:.2f}, Train Accuracy: {accuracy_train:.2f}%, "
                  f"Validation Accuracy: {accuracy_val:.2f}%")

        # Final testing accuracy (only once at the end of training)
        predictions_test = self.predict(X_test)
        accuracy_test = np.mean(predictions_test == np.argmax(Y_test, axis=0)) * 100
        print(f"Test Accuracy: {accuracy_test:.2f}%")

    def predict(self, X):
        Y_hat = self.forward_propagation(X)
        predictions = np.argmax(Y_hat, axis=0)
        return predictions

# Example usage
if __name__ == "__main__":
    # Simulate some data (e.g., 2 features, 3 classes, 1000 examples)
    np.random.seed(1)
    X = np.random.randn(2, 1000)
    Y = np.random.randint(0, 3, (1, 1000))
    Y_one_hot = np.eye(3)[Y.flatten()].T

    # Split the data into training, validation, and testing sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X.T, Y_one_hot.T, test_size=0.4, random_state=42)
    X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

    # Initialize and train the network with 3 hidden layers
    ann = MultiLayerANN(input_size=2, hidden_sizes=[5, 4, 3], output_size=3, learning_rate=0.1)
    ann.train(X_train.T, Y_train.T, X_val.T, Y_val.T, X_test.T, Y_test.T, epochs=10)

Epoch 1, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 2, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 3, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 4, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 5, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 6, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 7, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 8, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 9, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Epoch 10, Cost: 1.10, Train Accuracy: 36.00%, Validation Accuracy: 38.00%
Test Accuracy: 36.50%


##**Make Classification Dataset**

In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

class MultiLayerNet:
    def __init__(self, input_size, hidden_sizes, output_size, activation_function, loss_function, reg_lambda=0.0):
        self.params = {}
        self.num_layers = len(hidden_sizes) + 1  # Number of hidden layers + output layer
        self.layer_sizes = [input_size] + hidden_sizes + [output_size]

        for i in range(1, self.num_layers + 1):
            self.params[f'W{i}'] = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) / np.sqrt(self.layer_sizes[i-1])
            self.params[f'b{i}'] = np.zeros((1, self.layer_sizes[i]))  # Shape (1, layer_size)

        self.activation_function = activation_function
        self.loss_function = loss_function
        self.reg_lambda = reg_lambda

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    def forward(self, X):
        layer_output = X
        self.layer_inputs = []
        self.layer_outputs = [X]

        for i in range(1, self.num_layers + 1):
            W, b = self.params[f'W{i}'], self.params[f'b{i}']
            layer_input = np.dot(layer_output, W) + b
            self.layer_inputs.append(layer_input)
            layer_output = self.sigmoid(layer_input)
            self.layer_outputs.append(layer_output)

        return layer_output

    def backward(self, X, y, output):
        delta = output - y  # delta should have shape (n_samples, 1)
        dW = {}
        db = {}
        delta = delta / X.shape[0]

        for i in reversed(range(1, self.num_layers + 1)):
            layer_input = self.layer_inputs[i-1]
            activation_derivative = self.sigmoid_derivative(layer_input)

            dW[f'W{i}'] = np.dot(self.layer_outputs[i-1].T, delta * activation_derivative) + self.reg_lambda * self.params[f'W{i}']
            db[f'b{i}'] = np.sum(delta * activation_derivative, axis=0, keepdims=True)  # Use keepdims to maintain shape

            delta = np.dot(delta * activation_derivative, self.params[f'W{i}'].T)

        return dW, db

    def loss(self, X, y, output):
        data_loss = self.loss_function(output, y)
        reg_loss = 0.0

        for i in range(1, self.num_layers + 1):
            reg_loss += 0.5 * self.reg_lambda * np.sum(self.params[f'W{i}'] ** 2)

        total_loss = data_loss + reg_loss
        return total_loss

    def train(self, X_train, y_train, X_val, y_val, num_epochs, learning_rate=0.1):
        for epoch in range(1, num_epochs + 1):  # Start from 1 instead of 0
            # Forward propagation
            output_train = self.forward(X_train)

            # Backward propagation
            dW, db = self.backward(X_train, y_train, output_train)

            # Update parameters
            for i in range(1, self.num_layers + 1):
                self.params[f'W{i}'] -= learning_rate * dW[f'W{i}']
                self.params[f'b{i}'] -= learning_rate * db[f'b{i}']

            # Print loss and accuracy for each epoch (no skipping)
            loss_train = self.loss(X_train, y_train, output_train)
            output_val = self.forward(X_val)
            loss_val = self.loss(X_val, y_val, output_val)

            # Training accuracy
            train_pred = np.round(output_train)
            train_accuracy = np.mean(train_pred == y_train)

            # Validation accuracy
            val_pred = np.round(output_val)
            val_accuracy = np.mean(val_pred == y_val)

            # Print results for every epoch
            # print(f"Epoch {epoch}, Loss (Train): {loss_train}, Loss (Validation): {loss_val}")
            print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.2f}, Validation Accuracy: {val_accuracy:.2f}")


    def evaluate(self, X, y):
        output = self.forward(X)
        predicted_classes = np.round(output)
        accuracy = np.mean(predicted_classes == y)
        return accuracy

# Define the binary cross-entropy loss function
def binary_crossentropy_loss(output, y):
    epsilon = 1e-7  # Prevent log(0) errors
    output = np.clip(output, epsilon, 1 - epsilon)
    return -np.mean(y * np.log(output) + (1 - y) * np.log(1 - output))

# Generate a toy classification dataset
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
y = y.reshape(-1, 1)  # Reshape y for binary classification

# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Normalize the input data
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
X_val = (X_val - mean) / std
X_test = (X_test - mean) / std

# Create a multi-layer neural network with 5 hidden layers
net = MultiLayerNet(input_size=10, hidden_sizes=[20, 15, 10, 5, 3], output_size=1,
                    activation_function='sigmoid', loss_function=binary_crossentropy_loss, reg_lambda=0.01)

# Train the network for 1000 epochs
net.train(X_train, y_train, X_val, y_val, num_epochs=10, learning_rate=0.01)

# Evaluate the trained network on the test set
test_accuracy = net.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 2, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 3, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 4, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 5, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 6, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 7, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 8, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 9, Train Accuracy: 0.51, Validation Accuracy: 0.54
Epoch 10, Train Accuracy: 0.51, Validation Accuracy: 0.54
Test Accuracy: 0.45


##**MNIST Dataset**

In [3]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

class MultiLayerNet:
    def __init__(self, input_size, hidden_sizes, output_size, activation_function, loss_function, reg_lambda=0.0):
        self.params = {}
        self.num_layers = len(hidden_sizes) + 1  # Number of hidden layers + output layer
        self.layer_sizes = [input_size] + hidden_sizes + [output_size]

        for i in range(1, self.num_layers + 1):
            self.params[f'W{i}'] = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) / np.sqrt(self.layer_sizes[i-1])
            self.params[f'b{i}'] = np.zeros((1, self.layer_sizes[i]))  # Shape (1, layer_size)

        self.activation_function = activation_function
        self.loss_function = loss_function
        self.reg_lambda = reg_lambda

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # for numerical stability
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def softmax_derivative(self, x):
        # Softmax derivative for cross-entropy loss
        s = self.softmax(x)
        return s * (1 - s)

    def forward(self, X):
        layer_output = X
        self.layer_inputs = []
        self.layer_outputs = [X]

        for i in range(1, self.num_layers + 1):
            W, b = self.params[f'W{i}'], self.params[f'b{i}']
            layer_input = np.dot(layer_output, W) + b
            self.layer_inputs.append(layer_input)
            if i == self.num_layers:  # Apply softmax activation on the output layer
                layer_output = self.softmax(layer_input)
            else:
                layer_output = self.sigmoid(layer_input)
            self.layer_outputs.append(layer_output)

        return layer_output

    def backward(self, X, y, output):
        delta = output - y  # delta should have shape (n_samples, 10)
        dW = {}
        db = {}
        delta = delta / X.shape[0]

        for i in reversed(range(1, self.num_layers + 1)):
            layer_input = self.layer_inputs[i-1]
            activation_derivative = self.softmax_derivative(layer_input) if i == self.num_layers else self.sigmoid_derivative(layer_input)

            dW[f'W{i}'] = np.dot(self.layer_outputs[i-1].T, delta * activation_derivative) + self.reg_lambda * self.params[f'W{i}']
            db[f'b{i}'] = np.sum(delta * activation_derivative, axis=0, keepdims=True)

            delta = np.dot(delta * activation_derivative, self.params[f'W{i}'].T)

        return dW, db

    def loss(self, X, y, output):
        data_loss = self.loss_function(output, y)
        reg_loss = 0.0

        for i in range(1, self.num_layers + 1):
            reg_loss += 0.5 * self.reg_lambda * np.sum(self.params[f'W{i}'] ** 2)

        total_loss = data_loss + reg_loss
        return total_loss

    def train(self, X_train, y_train, X_val, y_val, num_epochs, learning_rate=0.1):
        for epoch in range(1, num_epochs + 1):  # Start from 1 instead of 0
            # Forward propagation
            output_train = self.forward(X_train)

            # Backward propagation
            dW, db = self.backward(X_train, y_train, output_train)

            # Update parameters
            for i in range(1, self.num_layers + 1):
                self.params[f'W{i}'] -= learning_rate * dW[f'W{i}']
                self.params[f'b{i}'] -= learning_rate * db[f'b{i}']

            # Print loss and accuracy for each epoch (no skipping)
            loss_train = self.loss(X_train, y_train, output_train)
            output_val = self.forward(X_val)
            loss_val = self.loss(X_val, y_val, output_val)

            # Training accuracy
            train_pred = np.argmax(output_train, axis=1)
            train_accuracy = np.mean(train_pred == np.argmax(y_train, axis=1))

            # Validation accuracy
            val_pred = np.argmax(output_val, axis=1)
            val_accuracy = np.mean(val_pred == np.argmax(y_val, axis=1))

            # Print results for every epoch
            print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.2f}, Validation Accuracy: {val_accuracy:.2f}")

    def evaluate(self, X, y):
        output = self.forward(X)
        predicted_classes = np.argmax(output, axis=1)
        accuracy = np.mean(predicted_classes == np.argmax(y, axis=1))
        return accuracy

# Define the categorical cross-entropy loss function
def categorical_crossentropy_loss(output, y):
    epsilon = 1e-7  # Prevent log(0) errors
    output = np.clip(output, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y * np.log(output), axis=1))

# Load MNIST dataset from OpenML
mnist = fetch_openml('mnist_784', version=1)
X = mnist.data / 255.0  # Normalize pixel values to [0, 1]
y = mnist.target.astype(int)

# Convert y to one-hot encoding for multi-class classification
y_one_hot = np.zeros((y.size, 10))
y_one_hot[np.arange(y.size), y] = 1

# Split the dataset into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Create a multi-layer neural network with 5 hidden layers
net = MultiLayerNet(input_size=784, hidden_sizes=[128, 64, 32, 16, 8], output_size=10,
                    activation_function='sigmoid', loss_function=categorical_crossentropy_loss, reg_lambda=0.01)

# Train the network for 10 epochs
net.train(X_train, y_train, X_val, y_val, num_epochs=10, learning_rate=0.01)

# Evaluate the trained network on the test set
test_accuracy = net.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 2, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 3, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 4, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 5, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 6, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 7, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 8, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 9, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 10, Train Accuracy: 0.11, Validation Accuracy: 0.11
Test Accuracy: 0.11


In [4]:
import numpy as np
from keras.datasets import mnist
from sklearn.model_selection import train_test_split

class MultiLayerNet:
    def __init__(self, input_size, hidden_sizes, output_size, activation_function, loss_function, reg_lambda=0.0):
        self.params = {}
        self.num_layers = len(hidden_sizes) + 1  # Number of hidden layers + output layer
        self.layer_sizes = [input_size] + hidden_sizes + [output_size]

        for i in range(1, self.num_layers + 1):
            self.params[f'W{i}'] = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) / np.sqrt(self.layer_sizes[i-1])
            self.params[f'b{i}'] = np.zeros((1, self.layer_sizes[i]))  # Shape (1, layer_size)

        self.activation_function = activation_function
        self.loss_function = loss_function
        self.reg_lambda = reg_lambda

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))  # Subtract max for numerical stability
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def softmax_derivative(self, x):
        s = self.softmax(x)
        return s * (1 - s)  # Simplified derivative for softmax (for numerical stability)

    def forward(self, X):
        layer_output = X
        self.layer_inputs = []
        self.layer_outputs = [X]

        for i in range(1, self.num_layers + 1):
            W, b = self.params[f'W{i}'], self.params[f'b{i}']
            layer_input = np.dot(layer_output, W) + b
            self.layer_inputs.append(layer_input)
            if i == self.num_layers:
                layer_output = self.softmax(layer_input)  # Use softmax for the output layer
            else:
                layer_output = self.sigmoid(layer_input)
            self.layer_outputs.append(layer_output)

        return layer_output

    def backward(self, X, y, output):
        delta = output - y  # delta should have shape (n_samples, num_classes)
        dW = {}
        db = {}
        delta = delta / X.shape[0]

        for i in reversed(range(1, self.num_layers + 1)):
            layer_input = self.layer_inputs[i-1]
            if i == self.num_layers:
                activation_derivative = self.softmax_derivative(layer_input)
            else:
                activation_derivative = self.sigmoid_derivative(layer_input)

            dW[f'W{i}'] = np.dot(self.layer_outputs[i-1].T, delta * activation_derivative) + self.reg_lambda * self.params[f'W{i}']
            db[f'b{i}'] = np.sum(delta * activation_derivative, axis=0, keepdims=True)

            delta = np.dot(delta * activation_derivative, self.params[f'W{i}'].T)

        return dW, db

    def loss(self, X, y, output):
        data_loss = self.loss_function(output, y)
        reg_loss = 0.0

        for i in range(1, self.num_layers + 1):
            reg_loss += 0.5 * self.reg_lambda * np.sum(self.params[f'W{i}'] ** 2)

        total_loss = data_loss + reg_loss
        return total_loss

    def train(self, X_train, y_train, X_val, y_val, num_epochs, learning_rate=0.1):
        for epoch in range(1, num_epochs + 1):  # Start from 1 instead of 0
            # Forward propagation
            output_train = self.forward(X_train)

            # Backward propagation
            dW, db = self.backward(X_train, y_train, output_train)

            # Update parameters
            for i in range(1, self.num_layers + 1):
                self.params[f'W{i}'] -= learning_rate * dW[f'W{i}']
                self.params[f'b{i}'] -= learning_rate * db[f'b{i}']

            # Print loss and accuracy for each epoch (no skipping)
            loss_train = self.loss(X_train, y_train, output_train)
            output_val = self.forward(X_val)
            loss_val = self.loss(X_val, y_val, output_val)

            # Training accuracy
            train_pred = np.argmax(output_train, axis=1)  # Get class with max probability
            train_accuracy = np.mean(train_pred == np.argmax(y_train, axis=1))

            # Validation accuracy
            val_pred = np.argmax(output_val, axis=1)  # Get class with max probability
            val_accuracy = np.mean(val_pred == np.argmax(y_val, axis=1))

            # Print results for every epoch
            print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.2f}, Validation Accuracy: {val_accuracy:.2f}")

    def evaluate(self, X, y):
        output = self.forward(X)
        predicted_classes = np.argmax(output, axis=1)  # Get class with max probability
        accuracy = np.mean(predicted_classes == np.argmax(y, axis=1))
        return accuracy

# Define the categorical cross-entropy loss function
def categorical_crossentropy_loss(output, y):
    epsilon = 1e-7  # Prevent log(0) errors
    output = np.clip(output, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y * np.log(output), axis=1))  # Sum over classes for each example

# Load the MNIST dataset
(X_train_full, y_train_full), (X_test, y_test) = mnist.load_data()

# Preprocess the data
X_train_full = X_train_full.astype(np.float32) / 255.0  # Normalize to [0, 1]
X_test = X_test.astype(np.float32) / 255.0  # Normalize to [0, 1]

# Flatten the images from 28x28 to 784-dimensional vectors
X_train_full = X_train_full.reshape(X_train_full.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Convert labels to one-hot encoding
y_train_full = np.eye(10)[y_train_full]  # One-hot encoding for 10 classes
y_test = np.eye(10)[y_test]  # One-hot encoding for 10 classes

# Split the data into training, validation, and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1, random_state=42)

# Create a multi-layer neural network with 5 hidden layers
net = MultiLayerNet(input_size=784, hidden_sizes=[512, 256, 128, 64, 32], output_size=10,
                    activation_function='sigmoid', loss_function=categorical_crossentropy_loss, reg_lambda=0.01)

# Train the network for 10 epochs
net.train(X_train, y_train, X_val, y_val, num_epochs=10, learning_rate=0.01)

# Evaluate the trained network on the test set
test_accuracy = net.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 2, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 3, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 4, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 5, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 6, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 7, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 8, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 9, Train Accuracy: 0.11, Validation Accuracy: 0.11
Epoch 10, Train Accuracy: 0.11, Validation Accuracy: 0.11
Test Accuracy: 0.11


In [5]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

class MultiLayerNet:
    def __init__(self, input_size, hidden_sizes, output_size, activation_function, loss_function, reg_lambda=0.0):
        self.params = {}
        self.num_layers = len(hidden_sizes) + 1  # Number of hidden layers + output layer
        self.layer_sizes = [input_size] + hidden_sizes + [output_size]

        for i in range(1, self.num_layers + 1):
            self.params[f'W{i}'] = np.random.randn(self.layer_sizes[i-1], self.layer_sizes[i]) / np.sqrt(self.layer_sizes[i-1])
            self.params[f'b{i}'] = np.zeros((1, self.layer_sizes[i]))  # Shape (1, layer_size)

        self.activation_function = activation_function
        self.loss_function = loss_function
        self.reg_lambda = reg_lambda

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_derivative(self, x):
        sig = self.sigmoid(x)
        return sig * (1 - sig)

    def forward(self, X):
        layer_output = X
        self.layer_inputs = []
        self.layer_outputs = [X]

        for i in range(1, self.num_layers + 1):
            W, b = self.params[f'W{i}'], self.params[f'b{i}']
            layer_input = np.dot(layer_output, W) + b
            self.layer_inputs.append(layer_input)
            layer_output = self.sigmoid(layer_input)
            self.layer_outputs.append(layer_output)

        return layer_output

    def backward(self, X, y, output):
        delta = output - y  # delta should have shape (n_samples, 1)
        dW = {}
        db = {}
        delta = delta / X.shape[0]

        for i in reversed(range(1, self.num_layers + 1)):
            layer_input = self.layer_inputs[i-1]
            activation_derivative = self.sigmoid_derivative(layer_input)

            dW[f'W{i}'] = np.dot(self.layer_outputs[i-1].T, delta * activation_derivative) + self.reg_lambda * self.params[f'W{i}']
            db[f'b{i}'] = np.sum(delta * activation_derivative, axis=0, keepdims=True)  # Use keepdims to maintain shape

            delta = np.dot(delta * activation_derivative, self.params[f'W{i}'].T)

        return dW, db

    def loss(self, X, y, output):
        data_loss = self.loss_function(output, y)
        reg_loss = 0.0

        for i in range(1, self.num_layers + 1):
            reg_loss += 0.5 * self.reg_lambda * np.sum(self.params[f'W{i}'] ** 2)

        total_loss = data_loss + reg_loss
        return total_loss

    def train(self, X_train, y_train, X_val, y_val, num_epochs, learning_rate=0.1):
        for epoch in range(1, num_epochs + 1):  # Start from 1 instead of 0
            # Forward propagation
            output_train = self.forward(X_train)

            # Backward propagation
            dW, db = self.backward(X_train, y_train, output_train)

            # Update parameters
            for i in range(1, self.num_layers + 1):
                self.params[f'W{i}'] -= learning_rate * dW[f'W{i}']
                self.params[f'b{i}'] -= learning_rate * db[f'b{i}']

            # Print loss and accuracy for each epoch (no skipping)
            loss_train = self.loss(X_train, y_train, output_train)
            output_val = self.forward(X_val)
            loss_val = self.loss(X_val, y_val, output_val)

            # Training accuracy
            train_pred = np.round(output_train)
            train_accuracy = np.mean(train_pred == y_train)

            # Validation accuracy
            val_pred = np.round(output_val)
            val_accuracy = np.mean(val_pred == y_val)

            # Print results for every epoch
            print(f"Epoch {epoch}, Train Accuracy: {train_accuracy:.2f}, Validation Accuracy: {val_accuracy:.2f}")

    def evaluate(self, X, y):
        output = self.forward(X)
        predicted_classes = np.round(output)
        accuracy = np.mean(predicted_classes == y)
        return accuracy

# Define the categorical cross-entropy loss function
def categorical_crossentropy_loss(output, y):
    epsilon = 1e-7  # Prevent log(0) errors
    output = np.clip(output, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y * np.log(output), axis=1))

# Load the MNIST dataset using TensorFlow
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

# Preprocess the data
X_train_full = X_train_full.astype(np.float32) / 255.0  # Normalize to [0, 1]
X_test = X_test.astype(np.float32) / 255.0  # Normalize to [0, 1]

# Flatten the images from 28x28 to 784-dimensional vectors
X_train_full = X_train_full.reshape(X_train_full.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Convert labels to one-hot encoding
y_train_full = np.eye(10)[y_train_full]  # One-hot encoding for 10 classes
y_test = np.eye(10)[y_test]  # One-hot encoding for 10 classes

# Split the data into training, validation, and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1, random_state=42)

# Create a multi-layer neural network with 5 hidden layers
net = MultiLayerNet(input_size=784, hidden_sizes=[512, 256, 128, 64, 32], output_size=10,
                    activation_function='sigmoid', loss_function=categorical_crossentropy_loss, reg_lambda=0.01)

# Train the network for 10 epochs
net.train(X_train, y_train, X_val, y_val, num_epochs=10, learning_rate=0.01)

# Evaluate the trained network on the test set
test_accuracy = net.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

Epoch 1, Train Accuracy: 0.58, Validation Accuracy: 0.58
Epoch 2, Train Accuracy: 0.58, Validation Accuracy: 0.58
Epoch 3, Train Accuracy: 0.58, Validation Accuracy: 0.66
Epoch 4, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 5, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 6, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 7, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 8, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 9, Train Accuracy: 0.66, Validation Accuracy: 0.66
Epoch 10, Train Accuracy: 0.66, Validation Accuracy: 0.66
Test Accuracy: 0.66
