In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2Model, GPT2Config

class CellularAutomataGPT2(nn.Module):
    """GPT-2 transformer stack sandwiched between two linear projections.

    The input projection lifts vectors of width ``input_dim`` to
    ``hidden_dim`` so they can be handed to ``GPT2Model`` as ready-made
    embeddings (``inputs_embeds``), bypassing its token-embedding lookup.
    The output projection maps the final hidden states back to
    ``input_dim`` values per position.

    Args:
        input_dim: Width of each input/output vector.
        hidden_dim: Embedding width used by the GPT-2 stack (``n_embd``).
        num_heads: Number of attention heads (``n_head``).
    """

    def __init__(self, input_dim, hidden_dim, num_heads):
        super().__init__()
        # Submodules are created in the same order as before so parameter
        # initialisation consumes the random stream identically.
        self.input_projection = nn.Linear(
            in_features=input_dim, out_features=hidden_dim
        )
        backbone_config = GPT2Config(n_embd=hidden_dim, n_head=num_heads)
        self.gpt2 = GPT2Model(backbone_config)
        self.output_projection = nn.Linear(
            in_features=hidden_dim, out_features=input_dim
        )

    def forward(self, x):
        """Return one ``input_dim``-wide output vector per input position.

        ``x`` is expected to have ``input_dim`` as its last dimension; the
        script in this file passes tensors shaped (1, steps, input_dim).
        The returned values are raw linear outputs (no activation applied).
        """
        embedded = self.input_projection(x)
        backbone_out = self.gpt2(inputs_embeds=embedded)
        return self.output_projection(backbone_out.last_hidden_state)

# Hyperparameters
input_dim = 100  # Cells per generation (width of the automaton)
hidden_dim = 240  # Embedding width of the transformer (a multiple of num_heads)
num_heads = 12  # Attention heads
sequence_length = 60  # Generations contained in each sequence

# Build the network and switch it to training mode
model = CellularAutomataGPT2(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_heads=num_heads,
)
model.train()

# Binary cross-entropy on raw model outputs, and Adam over all model parameters
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=2e-5)

# Generate training data by evolving random initial states under an
# elementary cellular automaton rule.
def generate_ca_data(rule, num_samples, sequence_length, input_dim):
    """Generate sequences of elementary cellular automaton generations.

    Each sample starts from a uniformly random binary row and is evolved
    with the given rule, so generation ``t + 1`` is a deterministic
    function of generation ``t``. (Returning independent random rows, as
    this function previously did, leaves a next-step predictor nothing to
    learn: its binary cross-entropy cannot drop below ln 2.)

    Args:
        rule: Wolfram rule number in ``[0, 255]``. Bit ``k`` of the number
            is the next state of a cell whose neighbourhood
            ``(left, center, right)`` encodes ``k = 4*left + 2*center + right``.
        num_samples: Number of independent sequences to generate.
        sequence_length: Number of generations per sequence, including the
            random initial one.
        input_dim: Number of cells in each generation.

    Returns:
        A float tensor of shape ``(num_samples, sequence_length, input_dim)``
        containing only 0.0 and 1.0.

    Raises:
        ValueError: If ``rule`` is outside ``[0, 255]``.
    """
    if not 0 <= rule <= 255:
        raise ValueError(
            f"rule must be an elementary CA rule number in [0, 255], got {rule}"
        )

    # Lookup table: entry k holds the next state for neighbourhood code k.
    rule_table = torch.tensor(
        [(rule >> k) & 1 for k in range(8)], dtype=torch.long
    )

    states = torch.zeros(
        (num_samples, sequence_length, input_dim), dtype=torch.long
    )
    if sequence_length > 0:
        states[:, 0] = torch.randint(0, 2, (num_samples, input_dim))

    for step in range(1, sequence_length):
        current = states[:, step - 1]
        # Periodic boundary: the row wraps around, so the first and last
        # cells are neighbours of each other.
        left = torch.roll(current, shifts=1, dims=-1)
        right = torch.roll(current, shifts=-1, dims=-1)
        states[:, step] = rule_table[4 * left + 2 * current + right]

    return states.to(torch.float)

# Training set: num_samples sequences requested from generate_ca_data for rule 30
num_samples = 1000
training_data = generate_ca_data(
    rule=30,
    num_samples=num_samples,
    sequence_length=sequence_length,
    input_dim=input_dim,
)

# Training loop: one optimizer step per sequence (effective batch size of 1)
epochs = 10
for epoch in range(epochs):
    step_losses = []
    for sequence in training_data:
        # Next-step prediction: every generation but the last is an input,
        # and the same rows shifted by one are the targets.
        clean_inputs = sequence[:-1]
        target_sequence = sequence[1:]

        # Perturb the inputs with Gaussian noise (std 0.1) for robustness,
        # then clip back into [0, 1].
        jitter = torch.normal(0, 0.1, clean_inputs.shape)
        input_sequence = (clean_inputs + jitter).clamp(0, 1)

        optimizer.zero_grad()

        # Forward pass on a batch of one sequence
        output = model(input_sequence.unsqueeze(0))
        loss = criterion(output.squeeze(0), target_sequence)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        step_losses.append(loss.item())

    # Report the mean per-sequence loss for this epoch
    total_loss = sum(step_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(training_data)}")

# Persist the learned weights (state dict only, not the module object)
weights_path = "ca_gpt2_model_2.pth"
torch.save(model.state_dict(), weights_path)

# Evaluation on one freshly generated sequence, with gradients disabled
model.eval()
with torch.no_grad():
    # Single sample; indexing with [0] drops the leading sample dimension
    test_sequence = generate_ca_data(30, 1, sequence_length, input_dim)[0]
    input_sequence, target_sequence = test_sequence[:-1], test_sequence[1:]

    # Run the model on a batch of one and threshold the sigmoid of its
    # outputs by rounding to 0/1
    output = model(input_sequence[None])
    predicted_sequence = output[0].sigmoid().round()

    report = (
        ("Input Sequence:", input_sequence),
        ("Target Sequence:", target_sequence),
        ("Predicted Sequence:", predicted_sequence),
    )
    for heading, tensor in report:
        print(heading)
        print(tensor)


Epoch 1, Loss: 0.6953051356077194
Epoch 2, Loss: 0.6937872629165649
Epoch 3, Loss: 0.6936452788114548
Epoch 4, Loss: 0.6935864339470863
Epoch 5, Loss: 0.6935521367788314
