# XOR with a Tiny Neural Network (1 hidden layer, 2 neurons)

This notebook builds and trains a minimal neural network **from scratch (NumPy only)** to solve the classic XOR problem.

**Architecture**
- Input: 2 features
- Hidden layer: 2 neurons (tanh activation)
- Output: 1 neuron (sigmoid)

We'll train with binary cross-entropy and gradient descent, then plot the loss curve and decision boundary.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Print arrays with 4 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=4)

# Reproducibility: one seeded generator; the weight initialisation draws from it.
rng = np.random.default_rng(seed=42)

In [None]:
# XOR truth table: each row of X is one (x1, x2) input pair, and the matching
# row of y holds its label x1 XOR x2.
X = np.array(
    [
        [0.0, 0.0],
        [0.0, 1.0],
        [1.0, 0.0],
        [1.0, 1.0],
    ],
    dtype=float,
)
y = np.array([0.0, 1.0, 1.0, 0.0], dtype=float).reshape(-1, 1)  # column vector, (4, 1)

# m = number of examples, d = number of input features
m, d = X.shape
print(f"X shape: {X.shape}, y shape: {y.shape}")

In [None]:
# Model hyperparameters
n_hidden = 2      # width of the hidden layer
lr = 0.1          # gradient-descent step size
epochs = 10_000   # number of passes over the full dataset

# Parameter initialization: weights ~ Normal(mean 0, std 0.5), biases start at zero.
# The draw order (W1, then W2) fixes which random numbers each matrix receives.
W1 = rng.normal(loc=0.0, scale=0.5, size=(d, n_hidden))
b1 = np.zeros(n_hidden)
W2 = rng.normal(loc=0.0, scale=0.5, size=(n_hidden, 1))
b2 = np.zeros(1)

def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)), elementwise.

    Parameters
    ----------
    z : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    NumPy scalar for scalar input, otherwise an array with the shape of ``z``;
    every value lies in [0, 1].

    Notes
    -----
    The naive form evaluates ``exp(-z)``, which overflows (RuntimeWarning) for
    large negative ``z``. Here the exponent is always ``-|z|``, so ``exp``
    cannot overflow; the two algebraically equivalent branches
    ``1 / (1 + e)`` (z >= 0) and ``e / (1 + e)`` (z < 0) are selected by sign.
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))  # always in [0, 1], whatever the magnitude of z
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result into a NumPy scalar and leaves
    # n-d arrays untouched, matching what the naive formula returned.
    return out[()]

def forward(X, W1, b1, W2, b2):
    """Forward pass: affine -> tanh -> affine -> sigmoid.

    Parameters
    ----------
    X : array, (m, d) batch of inputs
    W1, b1 : hidden-layer weights (d, n_hidden) and bias
    W2, b2 : output-layer weights (n_hidden, 1) and bias

    Returns
    -------
    yhat : array, (m, 1) sigmoid outputs
    cache : dict with keys "z1", "a1", "z2", "yhat" holding the intermediate
        values of this pass (the hidden activations "a1" are needed for
        backpropagation).
    """
    hidden_pre = X @ W1 + b1          # (m, n_hidden) pre-activations
    hidden_act = np.tanh(hidden_pre)  # (m, n_hidden) activations
    out_pre = hidden_act @ W2 + b2    # (m, 1) output logits
    yhat = sigmoid(out_pre)
    return yhat, dict(z1=hidden_pre, a1=hidden_act, z2=out_pre, yhat=yhat)

def bce_loss(yhat, y):
    """Mean binary cross-entropy between predicted probabilities and targets.

    Parameters
    ----------
    yhat : array-like
        Predicted probabilities.
    y : array-like
        Targets in [0, 1], broadcastable against ``yhat``.

    Returns
    -------
    Scalar loss; finite and non-negative for targets in [0, 1].

    Notes
    -----
    Probabilities are clipped to ``[eps, 1 - eps]`` before taking logs. This
    keeps ``log`` finite at exactly 0 or 1 and, unlike adding ``eps`` inside
    the log (where ``log(1 + eps) > 0``), can never produce a negative loss.
    """
    eps = 1e-8
    p = np.clip(yhat, eps, 1.0 - eps)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

# Full-batch gradient descent. Each epoch: forward pass, record the loss,
# backpropagate, then update all four parameter arrays in place.
loss_hist = []
for t in range(epochs):
    # ---- forward pass ----
    yhat, cache = forward(X, W1, b1, W2, b2)
    loss = bce_loss(yhat, y)
    loss_hist.append(loss)

    # ---- backward pass ----
    # For a sigmoid output trained with BCE, the chain-rule product
    # dL/dyhat * dyhat/dz2 collapses to (yhat - y); the 1/m factor comes from
    # the mean over the m examples.
    a1 = cache["a1"]
    dz2 = (yhat - y) / m                  # (m, 1)
    dW2 = a1.T @ dz2                      # (n_hidden, 1)
    db2 = dz2.sum(axis=0)                 # (1,)

    da1 = dz2 @ W2.T                      # (m, n_hidden)
    dz1 = da1 * (1 - np.square(a1))       # tanh'(z1) = 1 - tanh(z1)^2
    dW1 = X.T @ dz1                       # (d, n_hidden)
    db1 = dz1.sum(axis=0)                 # (n_hidden,)

    # ---- parameter update ----
    # `-=` on an ndarray modifies it in place, so W2, b2, W1, b1 themselves
    # are updated through the loop variable. All gradients were computed
    # above, before any parameter changes.
    for param, grad in ((W2, dW2), (b2, db2), (W1, dW1), (b1, db1)):
        param -= lr * grad

    # Progress report every 1000 epochs; yhat is the prediction made *before*
    # this epoch's update.
    if (t + 1) % 1000 == 0:
        preds = (yhat >= 0.5).astype(float)
        acc = (preds == y).mean()
        print(f"epoch {t+1:5d} | loss {loss:.5f} | acc {acc:.2f}")

print("\nFinal parameters:")
for heading, values in (("W1=\n", W1), ("b1=\n", b1), ("W2=\n", W2), ("b2=\n", b2)):
    print(heading, values)

In [None]:
# Loss curve (one chart only): the BCE value recorded at each training epoch.
fig, ax = plt.subplots()
ax.plot(loss_hist)
ax.set(
    title="Training Loss (Binary Cross-Entropy)",
    xlabel="Epoch",
    ylabel="Loss",
)
plt.show()

In [None]:
# Decision boundary (one chart only): evaluate the trained network on a dense
# 300 x 300 grid over [-0.5, 1.5] x [-0.5, 1.5] and shade by predicted probability.
axis_values = np.linspace(-0.5, 1.5, 300)
xx, yy = np.meshgrid(axis_values, axis_values)
grid = np.column_stack([xx.ravel(), yy.ravel()])   # (300*300, 2) grid points
probs, _ = forward(grid, W1, b1, W2, b2)
probs = probs.reshape(xx.shape)

fig, ax = plt.subplots()
cs = ax.contourf(xx, yy, probs, levels=50, alpha=0.6)
ax.contour(xx, yy, probs, levels=[0.5])            # the p = 0.5 class boundary

# Overlay the four training points, split by label.
labels = y.ravel()
X0 = X[labels == 0]
X1 = X[labels == 1]
ax.scatter(X0[:, 0], X0[:, 1], marker='o', label='class 0')
ax.scatter(X1[:, 0], X1[:, 1], marker='s', label='class 1')
ax.legend()
ax.set(
    title="Decision Boundary after Training",
    xlabel="x1",
    ylabel="x2",
    xlim=(-0.5, 1.5),
    ylim=(-0.5, 1.5),
)
plt.show()

In [None]:
# Final predictions on the four XOR inputs; probabilities are thresholded at 0.5.
yhat, _ = forward(X, W1, b1, W2, b2)
preds = np.where(yhat >= 0.5, 1, 0)

report = (
    ("Inputs:\n", X),
    ("Target:\n", y.astype(int)),
    ("Predicted prob:\n", yhat),
    ("Predicted class:\n", preds),
)
for heading, values in report:
    print(heading, values)