# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

class ChessCNN(nn.Module):
    """Convolutional classifier over 780-feature chess position vectors.

    The first 768 features of each input row are reshaped into a
    (12, 8, 8) stack of board planes and passed through two 3x3
    convolutions (padding 1, so the 8x8 spatial size is preserved). The
    remaining 12 features (en passant + castling) are concatenated onto the
    flattened convolution output before two fully connected layers produce
    one raw logit per class.

    Layer attribute names (``conv1``, ``conv2``, ``fc1``, ``fc2``) and their
    creation order are part of the contract: they determine ``state_dict``
    keys and the parameter initialisation order under a fixed seed.

    Args:
        num_classes: Number of output logits. Defaults to 3
            (win, draw, loss).
    """

    # Input layout constants.
    BOARD_PLANES = 12
    BOARD_SIZE = 8
    BOARD_FEATURES = BOARD_PLANES * BOARD_SIZE * BOARD_SIZE  # 768
    EXTRA_FEATURES = 12
    INPUT_FEATURES = BOARD_FEATURES + EXTRA_FEATURES  # 780

    def __init__(self, num_classes: int = 3):
        super().__init__()
        conv_out_channels = 64
        self.conv1 = nn.Conv2d(
            in_channels=self.BOARD_PLANES, out_channels=32, kernel_size=3, padding=1
        )
        self.conv2 = nn.Conv2d(
            in_channels=32, out_channels=conv_out_channels, kernel_size=3, padding=1
        )
        # padding=1 with a 3x3 kernel keeps the 8x8 board size, so the
        # flattened convolution output has 64 * 8 * 8 = 4096 features.
        conv_features = conv_out_channels * self.BOARD_SIZE * self.BOARD_SIZE
        self.fc1 = nn.Linear(conv_features + self.EXTRA_FEATURES, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute class logits for a batch (or a single row) of positions.

        Args:
            x: Tensor of shape ``(batch_size, 780)``, or ``(780,)`` for a
                single unbatched position.

        Returns:
            Raw (un-normalised) logits of shape ``(batch_size, num_classes)``,
            or ``(num_classes,)`` when the input was unbatched.

        Raises:
            RuntimeError: If ``x`` does not have one of the shapes above.
        """
        given_shape = tuple(x.shape)
        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)

        # Fail early with a readable message instead of an obscure error from
        # view/cat/Linear further down. RuntimeError is the type PyTorch
        # itself raises for shape mismatches, so existing handlers still work.
        if x.dim() != 2 or x.size(1) != self.INPUT_FEATURES:
            raise RuntimeError(
                f"ChessCNN expects input of shape (batch_size, {self.INPUT_FEATURES}) "
                f"or ({self.INPUT_FEATURES},), got {given_shape}"
            )

        batch_size = x.size(0)
        board = x[:, :self.BOARD_FEATURES]   # board-state features
        extras = x[:, self.BOARD_FEATURES:]  # en passant + castling features

        # Explicit batch size (rather than -1) keeps the reshape well defined
        # for any batch size; reshape also tolerates the non-contiguous slice.
        board = board.reshape(
            batch_size, self.BOARD_PLANES, self.BOARD_SIZE, self.BOARD_SIZE
        )
        x_board = F.relu(self.conv1(board))
        x_board = F.relu(self.conv2(x_board))

        # Flatten everything except the batch dimension.
        x_board = x_board.flatten(1)

        # Join convolutional features with the extra features.
        x_combined = torch.cat([x_board, extras], dim=1)
        x_combined = F.relu(self.fc1(x_combined))
        logits = self.fc2(x_combined)
        return logits.squeeze(0) if unbatched else logits

# Demonstration only: fits ChessCNN to randomly generated data.
if __name__ == "__main__":
    # Run on the GPU when CUDA is available, otherwise on the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)

    # Synthetic dataset: 1000 rows of 780 random features, each with a
    # random label in {0, 1, 2} (0 = loss, 1 = draw, 2 = win).
    num_samples = 1000
    X = torch.randn(num_samples, 780).to(device)
    y = torch.randint(0, 3, (num_samples,)).to(device)

    # Shuffled mini-batches of 32 samples.
    dataset = TensorDataset(X, y)
    dataloader = DataLoader(dataset, shuffle=True, batch_size=32)

    # Loss, network, and optimizer.
    criterion = nn.CrossEntropyLoss()
    model = ChessCNN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0

        for batch_x, batch_y in dataloader:
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)

            # Clear stale gradients, backpropagate, then update the weights.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size
            # so the epoch figure below is a per-sample average.
            running_loss += loss.item() * batch_x.shape[0]

        epoch_loss = running_loss / num_samples
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")
