<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/ANN_OCT2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [238]:
import numpy as np

class NeuralNetwork:
    def __init__(self, layer_sizes, dropout=0.0):
        weight_shapes = [(a, b) for a, b in zip(layer_sizes[1:], layer_sizes[:-1])]

        # Xavier initialization
        self.weights = [np.random.randn(s[0], s[1]) * np.sqrt(2 / (s[0] + s[1])) for s in weight_shapes]

        #self.weights = [np.random.standard_normal(s)/s[1]**.5 for s in weight_shapes]
        self.biases = [np.zeros((s, 1)) for s in layer_sizes[1:]]
        self.layer_sizes = layer_sizes
        self.dropout = dropout

        # Initialize Adam optimizer parameters here
        self.m_w = [np.zeros(w.shape) for w in self.weights]
        self.v_w = [np.zeros(w.shape) for w in self.weights]
        self.m_b = [np.zeros(b.shape) for b in self.biases]
        self.v_b = [np.zeros(b.shape) for b in self.biases]


    def predict(self, a):
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            a = self.activation(np.dot(w, a) + b)
            if i < len(self.weights) - 1:  # Apply dropout to all layers except the output layer
                a *= np.random.binomial(1, 1 - self.dropout, size=a.shape) / (1 - self.dropout)
        return a

    #def activation(self, z):
        #return np.maximum(0, z)  # ReLU activation function

    #def activation(self, z):
        #return 1 / (1 + np.exp(-z))  # Sigmoid activation function


    def activation(self, z):
        return 1 / (1 + np.exp(-z))  # Sigmoid activation function

    def activation_derivative(self, z):
        return self.activation(z) * (1 - self.activation(z))  # Derivative of Sigmoid

    def backpropagate(self, x, y):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        activation = x
        activations = [x]  # list to store all the activations, layer by layer
        zs = []  # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self.activation(z)
            activations.append(activation)

        delta = self.cost_derivative(activations[-1], y) * self.activation_derivative(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Now you can print activations here:
        #print("Activations:", activations)
        #sys.stdout.flush()  # Ensure the output is sent to the console immediately

        for l in range(2, len(self.layer_sizes)):
            z = zs[-l]
            sp = self.activation_derivative(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)


        # Example: Print intermediate values
        print("Activations:", activations)
        print("Zs:", zs)
        print("Delta:", delta)

    def cost_derivative(self, output_activations, y):
        return (output_activations - y)

    def activation_derivative(self, z):
        return 1. * (z > 0)  # Derivative of ReLU

    #def train(self, training_data, epochs, initial_learning_rate, batch_size, l2_lambda=0.01):

    #def train(self, training_data, epochs, learning_rate, batch_size, l2_lambda=0.01):  # Add L2 regularization parameter

    def train(self, training_data, epochs, initial_learning_rate, batch_size, l2_lambda=0.01):



    #def train(self, training_data, epochs, learning_rate, batch_size, l2_lambda=0.01):
        training_data = list(training_data)
        n = len(training_data)

        for j in range(epochs):
            np.random.shuffle(training_data)
            mini_batches = [
                training_data[k:k+batch_size]
                for k in range(0, n, batch_size)]

            #for mini_batch in mini_batches:
            #    self.update_mini_batch(mini_batch, learning_rate, j, l2_lambda)  # Pass l2_lambda to update_mini_batch


            for mini_batch in mini_batches:
                learning_rate = self.adjust_learning_rate(initial_learning_rate, j)  # Adjust learning rate
                self.update_mini_batch(mini_batch, learning_rate, j, l2_lambda)


            #for mini_batch in mini_batches:
                #self.update_mini_batch(mini_batch, learning_rate, j, l2_lambda)  # Pass l2_lambda to update_mini_batch

                #self.update_mini_batch(mini_batch, learning_rate, j)
            #print(f"Epoch {j} complete.")


    def adjust_learning_rate(self, initial_learning_rate, epoch, decay_rate=0.95):
        """Reduces the learning rate over time."""
        return initial_learning_rate * decay_rate**(epoch / 10)  # Example: Exponential decay


    def update_mini_batch(self, mini_batch, learning_rate, epoch, l2_lambda):

    #def update_mini_batch(self, mini_batch, learning_rate, epoch, l2_lambda):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # Adam optimizer parameters - moved to the beginning of the function
        beta1 = 0.9
        beta2 = 0.999
        epsilon = 1e-8

        for i in range(len(self.weights)):
            self.m_w[i] = beta1 * self.m_w[i] + (1 - beta1) * nabla_w[i]
            self.v_w[i] = beta2 * self.v_w[i] + (1 - beta2) * (nabla_w[i] ** 2)
            m_w_hat = self.m_w[i] / (1 - beta1 ** (epoch + 1))
            v_w_hat = self.v_w[i] / (1 - beta2 ** (epoch + 1))

            # Moved the L2 regularization update here, after m_w_hat and v_w_hat are calculated
            #self.weights[i] = (1 - learning_rate * l2_lambda / len(mini_batch)) * self.weights[i] - (learning_rate / len(mini_batch)) * m_w_hat / (np.sqrt(v_w_hat) + epsilon)

            # Add L2 regularization term to the weight update
            self.weights[i] = (1 - learning_rate * l2_lambda / len(mini_batch)) * self.weights[i] - (learning_rate / len(mini_batch)) * m_w_hat / (np.sqrt(v_w_hat) + epsilon)


            # Add L2 regularization term to the weight update
            #self.weights[i] = (1 - learning_rate * l2_lambda / len(mini_batch)) * self.weights[i] - (learning_rate / len(mini_batch)) * m_w_hat / (np.sqrt(v_w_hat) + epsilon)

        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backpropagate(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

        # Adam optimizer
        #beta1 = 0.9
        #beta2 = 0.999
        #epsilon = 1e-8

        if not hasattr(self, 'm_w'):
            self.m_w = [np.zeros(w.shape) for w in self.weights]
            self.v_w = [np.zeros(w.shape) for w in self.weights]
            self.m_b = [np.zeros(b.shape) for b in self.biases]
            self.v_b = [np.zeros(b.shape) for b in self.biases]

        for i in range(len(self.weights)):
            self.m_w[i] = beta1 * self.m_w[i] + (1 - beta1) * nabla_w[i]
            self.v_w[i] = beta2 * self.v_w[i] + (1 - beta2) * (nabla_w[i] ** 2)
            m_w_hat = self.m_w[i] / (1 - beta1 ** (epoch + 1))
            v_w_hat = self.v_w[i] / (1 - beta2 ** (epoch + 1))
            self.weights[i] = self.weights[i] - (learning_rate / len(mini_batch)) * m_w_hat / (np.sqrt(v_w_hat) + epsilon)

            self.m_b[i] = beta1 * self.m_b[i] + (1 - beta1) * nabla_b[i]
            self.v_b[i] = beta2 * self.v_b[i] + (1 - beta2) * (nabla_b[i] ** 2)
            m_b_hat = self.m_b[i] / (1 - beta1 ** (epoch + 1))
            v_b_hat = self.v_b[i] / (1 - beta2 ** (epoch + 1))
            self.biases[i] = self.biases[i] - (learning_rate / len(mini_batch)) * m_b_hat / (np.sqrt(v_b_hat) + epsilon)

    def numerical_gradient(self, x, y, epsilon=1e-4):
        """Calculates the numerical gradient of the cost function."""
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        for i in range(len(self.weights)):
            for j in range(self.weights[i].shape[0]):
                for k in range(self.weights[i].shape[1]):
                    self.weights[i][j, k] += epsilon
                    cost_plus = self.cost(x, y)
                    self.weights[i][j, k] -= 2 * epsilon
                    cost_minus = self.cost(x, y)
                    self.weights[i][j, k] += epsilon
                    nabla_w[i][j, k] = (cost_plus - cost_minus) / (2 * epsilon)

        # Similar process for biases (omitted for brevity)

        return nabla_b, nabla_w

    def evaluate_regression(self, test_data):
        """Calculates the Mean Squared Error for regression."""
        squared_errors = [(self.predict(x)[0, 0] - y[0, 0])**2 for x, y in test_data]
        return np.mean(squared_errors)

    def evaluate(self, test_data, threshold=0.5):
        """Calculates the accuracy for binary classification."""
        correct_predictions = 0
        for x, y in test_data:
            prediction = self.predict(x)
            predicted_class = 1 if prediction[0, 0] > threshold else 0
            true_class = 1 if y[0, 0] > threshold else 0  # Make sure this aligns with your data
            if predicted_class == true_class:
                correct_predictions += 1
        return correct_predictions / len(test_data)

    # In your evaluate() function:
    def evaluate0(self, test_data, threshold=0.5):
        """Calculates the accuracy for binary classification."""
        correct_predictions = 0
        for x, y in test_data:
            prediction = self.predict(x)
            print(f"Raw Prediction: {prediction}")  # Print raw prediction
            predicted_class = 1 if prediction[0, 0] > threshold else 0
            true_class = 1 if y[0, 0] > threshold else 0
            print(f"Predicted Class: {predicted_class}, True Class: {true_class}")  # Print classes
            if predicted_class == true_class:
                correct_predictions += 1
                print("Correct Prediction!")
            else:
                print("Incorrect Prediction!")
        accuracy = correct_predictions / len(test_data)
        print(f"Correct Predictions: {correct_predictions}, Total: {len(test_data)}")
        print(f"Calculated Accuracy: {accuracy}")
        return accuracy

    #def evaluate(self, test_data):
    #    """Calculates the accuracy for test data."""
    #    test_results = [(np.argmax(self.predict(x)), np.argmax(y))
    #                    for (x, y) in test_data]
    #    return sum(int(x == y) for (x, y) in test_results) / len(test_data)

    def cost(self, x, y):
        """Calculates the cost for a single input."""
        return np.sum((self.predict(x) - y) ** 2) / 2.0  # Mean squared error


In [331]:
# Example usage:
#net = NeuralNetwork([2, 10, 5, 1], dropout=0.5)  # Network with dropout

# Example usage:
net = NeuralNetwork([2, 5, 1], dropout=0.0)

# Example usage:
#net = NeuralNetwork([2, 5, 1], dropout=0.0)  # Simpler network with one hidden layer

# Example usage:
#net = NeuralNetwork([2, 3, 1], dropout=0.0)  # Smaller network

# Sample training data (replace with your actual data)
training_data = [
    (np.array([[0.1], [0.2]]), np.array([[0.3]])),
    (np.array([[0.4], [0.5]]), np.array([[0.6]])),
    (np.array([[0.2], [0.1]]), np.array([[0.4]])),
    (np.array([[0.7], [0.8]]), np.array([[0.9]])),
    (np.array([[0.6], [0.3]]), np.array([[0.7]])),
    # ... add more training examples
]

# Normalize the training data
x_values = np.array([x for x, y in training_data])
mean = np.mean(x_values)
std = np.std(x_values)
training_data = [((x - mean) / std, y) for x, y in training_data]

# Train the network
#net.train(training_data, epochs=50, learning_rate=0.01, batch_size=10)

# Train the network with a different learning rate and batch size
#net.train(training_data, epochs=50, learning_rate=0.05, batch_size=5)


# Train the network with a learning rate schedule
net.train(training_data, epochs=100, initial_learning_rate=0.01, batch_size=10, l2_lambda=0.01)


# Train the network with a learning rate schedule
#net.train(training_data, epochs=25, learning_rate=0.01, batch_size=10, l2_lambda=0.01)


# Test the network (replace with your actual test data)
test_data = [
    (np.array([[0.2], [0.3]]), np.array([[0.5]])),
    (np.array([[0.7], [0.1]]), np.array([[0.8]])),
    # ... add more test examples
]

# Normalize the test data (using the same mean and std from training data)
test_data = [(((x - mean) / std), y) for x, y in test_data]



# Make predictions and evaluate
predictions = []
for x, y in test_data:
    prediction = net.predict(x)
    predictions.append(prediction)


# Calculate Mean Squared Error (MSE)
#mse = np.mean([(prediction - y)**2 for (_, y), prediction in zip(test_data, predictions)])
#print(f"MSE: {mse}")

# Evaluate using Mean Squared Error (MSE)
mse = np.mean([(net.predict(x) - y)**2 for x, y in test_data])
print(f"MSE: {mse}")


# Gradient checking (example for a single data point)
x, y = training_data[0]
analytical_nabla_b, analytical_nabla_w = net.backpropagate(x, y)
numerical_nabla_b, numerical_nabla_w = net.numerical_gradient(x, y)

# Compare the gradients (you can print the differences or calculate the relative error)
#print("Analytical gradient w:", analytical_nabla_w)
#print("Numerical gradient w:", numerical_nabla_w)


# Calculate accuracy (assuming binary classification)
correct_predictions = 0
for (x, y), prediction in zip(test_data, predictions):
    predicted_class = 1 if prediction[0, 0] > 0.5 else 0
    true_class = 1 if y[0, 0] > 0.5 else 0
    if predicted_class == true_class:
        correct_predictions += 1

accuracy = correct_predictions / len(test_data)
print(f"Accuracy: {accuracy}")

MSE: 0.06178975769895317
Accuracy: 0.5


In [372]:
# Sample data (replace with your actual data)
training_data = [
    (np.array([[0.1], [0.2]]), np.array([[0.3]])),
    (np.array([[0.4], [0.5]]), np.array([[0.6]])),
]

# Simplified test_data
test_data = [(np.array([[-0.79652144], [-0.37729963]]), np.array([[0.5]]))]

print("test-data: ", test_data)

# Evaluate using Mean Squared Error (MSE)
mse = np.mean([(net.predict(x) - y)**2 for x, y in test_data])
print(f"MSE: {mse}")

# Create and train the network
net = NeuralNetwork([2, 3, 1], dropout=0.1)
net.train(training_data, epochs=100, initial_learning_rate=0.1, batch_size=1, l2_lambda=0.01)

# Evaluate the network
accuracy = net.evaluate(test_data)
print(f"Accuracy: {accuracy}")

test-data:  [(array([[-0.79652144],
       [-0.37729963]]), array([[0.5]]))]
MSE: 0.0004245561904818655
Accuracy: 1.0
