In [1]:
import numpy as np

"""

Architecture: 2 input neurons, 4 hidden neurons, 1 output neuron, MSE loss, sigmoid activations at each layer.

Example of solution weights found by training:

W1: [[ 5.22722431  0.98707488  0.68186502  6.01151874]
 [-3.16613113 -3.99951104  2.91152228  5.80878724]]

W2: [[-6.57254498]
 [ 4.63984181]
 [-3.9456002 ]
 [ 8.04506769]]

B1: [[ 1.24785544  0.50324162 -2.14896976 -2.053226  ]]

B2: [[-1.53894177]]

"""

# XOR dataset: the four combinations of two binary inputs, one per row,
# in the order (0,0), (0,1), (1,0), (1,1).
X = np.array([[first, second] for first in (0, 1) for second in (0, 1)])

# XOR labels as a (4, 1) column: 1 exactly when the two inputs differ.
y = np.bitwise_xor(X[:, :1], X[:, 1:])

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    The textbook form calls ``np.exp(-z)``, which overflows (and emits a
    RuntimeWarning) once ``z`` drops below roughly -709. Here the exponential
    is only ever taken of a non-positive number, so it stays within [0, 1].

    Args:
        z: Real-valued scalar or NumPy array of pre-activations.

    Returns:
        Element-wise sigmoid with the same shape as ``z``; a NumPy scalar
        when ``z`` is a scalar.
    """
    z = np.asarray(z)
    # exp(-|z|) lies in [0, 1] for every real z, so it can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Both expressions are the same function; each is used where it is stable.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return result[()]

def sigmoid_derivative(a):
    """Derivative of the sigmoid, expressed through its output.

    Args:
        a: Sigmoid activation(s), i.e. the value sigmoid(z) itself rather
            than the pre-activation z.

    Returns:
        a * (1 - a), computed element-wise.
    """
    complement = 1 - a
    return a * complement

# Seed NumPy's global RNG so every run draws the same initial weights.
np.random.seed(42)

# Parameters: standard-normal weights, zero biases.
# W1 is drawn before W2; swapping the two would change the random stream.
W1 = np.random.randn(2, 4)   # input -> hidden
b1 = np.zeros((1, 4))

W2 = np.random.randn(4, 1)   # hidden -> output
b2 = np.zeros((1, 1))

learning_rate = 0.1
epochs = 10000

for epoch in range(epochs):

    # ---- forward pass ----
    z1 = X @ W1 + b1
    a1 = sigmoid(z1)
    z2 = a1 @ W2 + b2
    y_hat = sigmoid(z2)

    # Reported metric: mean squared error over all samples.
    residual = y_hat - y
    loss = np.mean(residual ** 2)

    # ---- backward pass ----
    # The raw residual serves as dL/dy_hat, so the 2/N factor of the MSE
    # gradient is omitted; this only rescales the effective step size.
    delta_out = residual * sigmoid_derivative(y_hat)
    dW2 = a1.T @ delta_out
    db2 = delta_out.sum(axis=0, keepdims=True)

    # Back-propagate through W2 *before* W2 is updated below.
    delta_hidden = (delta_out @ W2.T) * sigmoid_derivative(a1)
    dW1 = X.T @ delta_hidden
    db1 = delta_hidden.sum(axis=0, keepdims=True)

    # ---- gradient-descent step, applied in place to each parameter ----
    for param, grad in ((W2, dW2), (b2, db2), (W1, dW1), (b1, db1)):
        param -= learning_rate * grad

    # Log the loss computed before this epoch's update, every 1000 epochs.
    if not epoch % 1000:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Dump the learned parameters in the order W1, W2, b1, b2.
for param in (W1, W2, b1, b2):
    print(param)

print("\nFinal predictions:")
# One more forward pass with the trained parameters.
hidden = sigmoid(X @ W1 + b1)
predictions = sigmoid(hidden @ W2 + b2)
print(predictions)

print("\nBinary output:")
# Threshold the probabilities at 0.5 to obtain hard 0/1 labels.
print((predictions > 0.5).astype(int))

Epoch 0, Loss: 0.2832
Epoch 1000, Loss: 0.2452
Epoch 2000, Loss: 0.2124
Epoch 3000, Loss: 0.1503
Epoch 4000, Loss: 0.0572
Epoch 5000, Loss: 0.0209
Epoch 6000, Loss: 0.0107
Epoch 7000, Loss: 0.0067
Epoch 8000, Loss: 0.0047
Epoch 9000, Loss: 0.0035
[[ 5.22722431  0.98707488  0.68186502  6.01151874]
 [-3.16613113 -3.99951104  2.91152228  5.80878724]]
[[-6.57254498]
 [ 4.63984181]
 [-3.9456002 ]
 [ 8.04506769]]
[[ 1.24785544  0.50324162 -2.14896976 -2.053226  ]]
[[-1.53894177]]

Final predictions:
[[0.0372988 ]
 [0.94914558]
 [0.94481534]
 [0.06424562]]

Binary output:
[[0]
 [1]
 [1]
 [0]]
