In [183]:
import numpy as np

# network goes R50 --> R20 --> R20 --> R10
w_1 = np.random.normal(0,0.05,(20, 50))
b_1 = np.random.normal(0,0.05,(20, 1))

w_2 = np.random.normal(0,0.05,(10, 20))
b_2 = np.random.normal(0,0.05,(10,1))

w_3 = np.random.normal(0,0.05,(1, 10))
b_3 = np.random.normal(0,0.05,(1,1))

X = np.random.normal(0, 1, (50, 10000))
y = np.array([np.sum(X[:, x]**2) for x in range(10000)])

X_train = X[:, :8000]
X_test = X[:, 8000:]

y_train = y[:8000]
y_test = y[8000:]

def loss(y, y_pred):
    return .5*np.square(y_pred-y)

def dloss(y, y_pred):
    return y_pred-y

def activation(z):
    return z * (z>0)

def dactivation(z):
    return 1 * (z>0)

test_losses = []
train_losses = []

def forward(X, params):
    w_1, b_1, w_2, b_2, w_3, b_3 = params
    z_1 = w_1 @ X + b_1
    a_1 = activation(z_1)

    z_2 = w_2 @ a_1 + b_2
    a_2 = activation(z_2)

    z_3 =  w_3 @ a_2 + b_3
    a_3 = activation(z_3)

    return a_3, a_2, a_1, z_3, z_2, z_1

epochs = 100
for _ in range(epochs):    
    a_3, a_2, a_1, z_3, z_2, z_1 = forward(X_train, (w_1, b_1, w_2, b_2, w_3, b_3))
    out_test, _, _, _, _, _ = forward(X_test, (w_1, b_1, w_2, b_2, w_3, b_3))
    
    l = loss(y_train, a_3)
    
    dl = dloss(y_train, a_3) # 1 x 8000 
    
    dZ3 = dl * dactivation(z_3) # 1 x 8000
    dW3 = np.dot(dZ3, a_2.T) / X_train.shape[1] # This should be 1 x 10
    db3 = np.sum(dZ3, axis=1, keepdims=True) / X_train.shape[1]
    
    dA2 = np.dot(w_3.T, dZ3)
    dZ2 = dA2 * dactivation(z_2)
    dW2 = np.dot(dZ2, a_1.T) / X_train.shape[1]
    db2 = np.sum(dZ2, axis=1, keepdims=True) / X_train.shape[1]
    
    dA1 = np.dot(w_2.T, dZ2)
    dZ1 = dA1 * dactivation(z_1)
    dW1 = np.dot(dZ1, X_train.T) / X_train.shape[1]
    db1 = np.sum(dZ1, axis=1, keepdims=True) / X_train.shape[1]

    w_3 -= 0.01*dW3
    w_2 -= 0.01*dW2
    w_1 -= 0.01*dW1

    b_3 -= 0.01*db3
    b_2 -= 0.01*db2
    b_1 -= 0.01*db1

    train_losses.append(np.mean(l))
    test_losses.append(np.mean(loss(y_test, out_test)))


ValueError: shapes (10,1) and (8000,8000) not aligned: 1 (dim 1) != 8000 (dim 0)

In [188]:
import numpy as np

# Generate random data
np.random.seed(42)
#X = np.random.rand(10000, 50)
#y = np.random.rand(10000, 1)

X = np.random.normal(0, 1, (100000,50))
y = np.array([np.sum(X[x,:]) for x in range(100000)]).reshape(100000,1)

# Split data into training and validation sets (80/20)
split_idx = int(0.8 * X.shape[0])
X_train, y_train = X[:split_idx], y[:split_idx]
X_val, y_val = X[split_idx:], y[split_idx:]

# Neural network architecture
input_size = 50
hidden_size1 = 20
hidden_size2 = 10
output_size = 1

# Initialize weights and biases
np.random.seed(42)
W1 = np.random.randn(input_size, hidden_size1)
b1 = np.zeros((1, hidden_size1))
W2 = np.random.randn(hidden_size1, hidden_size2)
b2 = np.zeros((1, hidden_size2))
W3 = np.random.randn(hidden_size2, output_size)
b3 = np.zeros((1, output_size))

# Training hyperparameters
learning_rate = 0.001
epochs = 1000

# Training loop
for epoch in range(epochs):
    # Forward pass
    z1 = X_train.dot(W1) + b1
    a1 = np.maximum(0, z1)  # ReLU activation
    z2 = a1.dot(W2) + b2
    a2 = np.maximum(0, z2)  # ReLU activation
    z3 = a2.dot(W3) + b3
    predictions = z3
    
    # Compute MSE loss
    loss = np.mean((predictions - y_train) ** 2)
    
    # Backward pass
    grad_loss = 2 * (predictions - y_train) / X_train.shape[0]
    grad_z3 = grad_loss
    grad_W3 = a2.T.dot(grad_z3)
    grad_b3 = np.sum(grad_z3, axis=0, keepdims=True)
    grad_a2 = grad_z3.dot(W3.T)
    grad_z2 = grad_a2 * (z2 > 0)  # ReLU gradient
    grad_W2 = a1.T.dot(grad_z2)
    grad_b2 = np.sum(grad_z2, axis=0, keepdims=True)
    grad_a1 = grad_z2.dot(W2.T)
    grad_z1 = grad_a1 * (z1 > 0)  # ReLU gradient
    grad_W1 = X_train.T.dot(grad_z1)
    grad_b1 = np.sum(grad_z1, axis=0, keepdims=True)
    
    # Update weights and biases
    W1 -= learning_rate * grad_W1
    b1 -= learning_rate * grad_b1
    W2 -= learning_rate * grad_W2
    b2 -= learning_rate * grad_b2
    W3 -= learning_rate * grad_W3
    b3 -= learning_rate * grad_b3
    
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# Validation
z1_val = X_val.dot(W1) + b1
a1_val = np.maximum(0, z1_val)
z2_val = a1_val.dot(W2) + b2
a2_val = np.maximum(0, z2_val)
z3_val = a2_val.dot(W3) + b3
predictions_val = z3_val
val_loss = np.mean((predictions_val - y_val) ** 2)
print(f"Validation Loss: {val_loss:.4f}")

Epoch 0, Loss: 1412.0016
Epoch 100, Loss: 49.9446
Epoch 200, Loss: 49.2804
Epoch 300, Loss: 48.9670
Epoch 400, Loss: 48.7071
Epoch 500, Loss: 48.4367
Epoch 600, Loss: 48.1156
Epoch 700, Loss: 47.7201
Epoch 800, Loss: 47.2291
Epoch 900, Loss: 46.6139
Validation Loss: 46.0358


In [171]:
grad_loss.shape

(8000, 1)

In [179]:
X.shape

(10000, 50)

In [185]:
y.shape

(10000,)

In [189]:
grad_loss.shape

(80000, 1)