In [15]:
import torch
import torch.nn.functional as F
import numpy as np

# --- Helper functions to simulate CSV reading ---
def read_csv(file_path):
    """Return a synthetic MNIST-shaped dataset in place of real CSV parsing.

    ``file_path`` is accepted for interface compatibility but is never opened;
    every call draws fresh random values from NumPy's global RNG.

    Returns:
        A ``(data, labels)`` tuple of float32 arrays. ``data`` has shape
        (1200, 784) and holds values produced by ``np.random.rand``;
        ``labels`` has shape (1200, 10) with exactly one 1.0 per row.
    """
    n_rows, n_pixels, n_digits = 1200, 784, 10

    # Features are drawn before the class ids so the order in which the
    # global RNG is consumed stays fixed.
    features = np.random.rand(n_rows, n_pixels).astype(np.float32)
    class_ids = np.random.randint(0, n_digits, size=n_rows)

    # Selecting rows of the identity matrix yields the one-hot encoding.
    one_hot = np.eye(n_digits, dtype=np.float32)[class_ids]

    return features, one_hot

# --- Load the dataset and convert it to tensors ---
data_np, labels_np = read_csv("data/mnist_dummy.csv")
# torch.tensor copies each NumPy array, so the tensors are independent of
# data_np / labels_np: inputs is (1200, 784) and labels is (1200, 10).
inputs, labels = map(torch.tensor, (data_np, labels_np))

# --- Set up trainable parameters ---
# weights1 maps the 784 input features to 256 hidden units and weights2 maps
# those 256 units to the 10 classes, so inputs.matmul(weights1) followed by
# .matmul(weights2) turns a (1200, 784) batch into (1200, 10) scores.
# Entries are drawn uniformly from [0, 1) and scaled by 0.01; requires_grad_()
# then marks each tensor as a leaf tracked by autograd.
weights1 = torch.rand(784, 256).mul(0.01).requires_grad_()
weights2 = torch.rand(256, 10).mul(0.01).requires_grad_()

# Step size for the manual gradient-descent update.
learning_rate = 0.01

# --- Training Loop ---
# Full-batch gradient descent on a two-layer network (linear -> ReLU -> linear)
# trained with softmax cross-entropy. Each epoch prints the mean loss, lets
# autograd compute the gradients, and then applies a manual SGD step.
for epoch in range(10):
    # Forward pass.
    a1 = inputs.matmul(weights1)   # (1200,784) @ (784,256) => (1200,256)
    z1 = a1.clamp(min=0)           # ReLU activation

    a2 = z1.matmul(weights2)       # (1200,256) @ (256,10) => (1200,10) logits

    # Take the log-softmax over the class dimension in one step. Exponentiating
    # the raw logits by hand can overflow to inf (and then NaN) for large
    # scores; log_softmax uses the log-sum-exp formulation instead, and it
    # removes the need for an epsilon fudge term that would bias the loss.
    log_z2 = F.log_softmax(a2, dim=1)   # (1200,10) log-probabilities
    z2 = log_z2.exp()                   # (1200,10) probabilities, rows sum to 1

    # Cross-entropy against the one-hot labels: sum over classes for each
    # example, then average over the batch.
    loss = -(log_z2 * labels).sum(dim=1).mean()   # scalar loss
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

    # Backward pass: populate weights1.grad and weights2.grad.
    loss.backward()

    # Manual gradient-descent step. no_grad keeps the in-place parameter
    # updates out of the autograd graph.
    with torch.no_grad():
        weights1 -= learning_rate * weights1.grad
        weights2 -= learning_rate * weights2.grad

        # backward() accumulates into .grad, so clear it before the next epoch.
        weights1.grad.zero_()
        weights2.grad.zero_()


Epoch 1, Loss: 2.3086204528808594
Epoch 2, Loss: 2.300020217895508
Epoch 3, Loss: 2.300004720687866
Epoch 4, Loss: 2.300002336502075
Epoch 5, Loss: 2.3000001907348633
Epoch 6, Loss: 2.2999982833862305
Epoch 7, Loss: 2.2999958992004395
Epoch 8, Loss: 2.2999939918518066
Epoch 9, Loss: 2.2999916076660156
Epoch 10, Loss: 2.299989938735962
