In [1]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt


In [2]:
# Create toy dataset
X, Y = make_moons(n_samples=1000, noise=0.2, random_state=42)
Y = Y.reshape(Y.shape[0], 1)  # reshape to (1000, 1)

# Train-Test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Transpose for compatibility (features x samples)
X_train, Y_train = X_train.T, Y_train.T
X_test, Y_test = X_test.T, Y_test.T


In [3]:
def initialize_parameters(layer_dims):
    np.random.seed(1)
    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        parameters[f"W{l}"] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
        parameters[f"b{l}"] = np.zeros((layer_dims[l], 1))

    return parameters


In [4]:
def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def relu(Z):
    return np.maximum(0, Z)

def relu_derivative(Z):
    return Z > 0

def sigmoid_derivative(A):
    return A * (1 - A)


In [5]:
def forward_propagation(X, parameters):
    caches = {}
    A = X
    caches['A0'] = A  # <-- store input explicitly
    L = len(parameters) // 2

    for l in range(1, L):
        Z = parameters[f"W{l}"] @ A + parameters[f"b{l}"]
        A = relu(Z)
        caches[f"Z{l}"] = Z
        caches[f"A{l}"] = A

    ZL = parameters[f"W{L}"] @ A + parameters[f"b{L}"]
    AL = sigmoid(ZL)
    caches[f"Z{L}"] = ZL
    caches[f"A{L}"] = AL

    return AL, caches



In [6]:
def compute_cost(AL, Y):
    m = Y.shape[1]
    cost = -np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL)) / m
    return np.squeeze(cost)


In [7]:
def backward_propagation(AL, Y, caches, parameters):
    grads = {}
    L = len(parameters) // 2
    m = Y.shape[1]

    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    dZL = dAL * sigmoid_derivative(AL)

    grads[f"dW{L}"] = (1 / m) * dZL @ caches[f"A{L-1}"].T
    grads[f"db{L}"] = (1 / m) * np.sum(dZL, axis=1, keepdims=True)

    dA_prev = parameters[f"W{L}"].T @ dZL

    for l in reversed(range(1, L)):
        dZ = dA_prev * relu_derivative(caches[f"Z{l}"])
        grads[f"dW{l}"] = (1 / m) * dZ @ caches[f"A{l-1}"].T
        grads[f"db{l}"] = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
        dA_prev = parameters[f"W{l}"].T @ dZ

    return grads



In [8]:
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(1, L + 1):
        parameters[f"W{l}"] -= learning_rate * grads[f"dW{l}"]
        parameters[f"b{l}"] -= learning_rate * grads[f"db{l}"]
    return parameters


In [9]:
def model(X, Y, layer_dims, learning_rate=0.1, epochs=1000, print_cost=True):
    parameters = initialize_parameters(layer_dims)

    for i in range(epochs):
        AL, caches = forward_propagation(X, parameters)
        cost = compute_cost(AL, Y)
        grads = backward_propagation(AL, Y, caches, parameters)
        parameters = update_parameters(parameters, grads, learning_rate)

        if print_cost and i % 100 == 0:
            print(f"Cost after epoch {i}: {cost:.4f}")

    return parameters


In [10]:
def predict(X, parameters):
    AL, _ = forward_propagation(X, parameters)
    return (AL > 0.5).astype(int)


In [None]:
layer_dims = [2, 10, 5, 1]  # input layer, 2 hidden layers, output
parameters = model(X_train, Y_train, layer_dims, epochs=1000)

# Predict and evaluate
train_pred = predict(X_train, parameters)
test_pred = predict(X_test, parameters)

train_acc = np.mean(train_pred == Y_train) * 100
test_acc = np.mean(test_pred == Y_test) * 100

print(f"Train Accuracy: {train_acc:.2f}%")
print(f"Test Accuracy: {test_acc:.2f}%")


Cost after epoch 0: 0.6931
Cost after epoch 100: 0.6931
Cost after epoch 200: 0.6931
Cost after epoch 300: 0.6931
Cost after epoch 400: 0.6931
Cost after epoch 500: 0.6931
Cost after epoch 600: 0.6931
Cost after epoch 700: 0.6931
Cost after epoch 800: 0.6931
Cost after epoch 900: 0.6931
Train Accuracy: 78.25%
Test Accuracy: 77.00%
