In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [16]:
def generate_synthetic_data(num_samples=1000):
    data = []
    labels = []
    for _ in range(num_samples):
        # Generate random variables
        A = np.random.randn()
        B = 2 * A + np.random.randn() * 0.1  # A causes B
        C = np.random.randn()
        D = 3 * C + np.random.randn() * 0.1  # C causes D

        # Let's define variable pairs and their causal directions
        pairs = [('A', 'B', 0), ('B', 'A', 1), ('C', 'D', 0), ('D', 'C', 1)]
        # Label: 0 if first variable causes second, 1 otherwise

        # Markov blankets (for simplicity, include all variables)
        variables = {'A': A, 'B': B, 'C': C, 'D': D}

        for (X, Y, label) in pairs:
            features = []
            # Collect features for X, Y, and their Markov blankets
            for var in variables:
                features.append(variables[var])
            data.append(variables)
            labels.append(label)
    return np.array(data), np.array(labels)


In [17]:
# Generate data
X_data, y_data = generate_synthetic_data(num_samples=1)

# Split into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, random_state=42
)


In [18]:
X_data

array([{'A': -0.1471367936921451, 'B': -0.042388791395786884, 'C': -0.4147429364773032, 'D': -1.326614269113992},
       {'A': -0.1471367936921451, 'B': -0.042388791395786884, 'C': -0.4147429364773032, 'D': -1.326614269113992},
       {'A': -0.1471367936921451, 'B': -0.042388791395786884, 'C': -0.4147429364773032, 'D': -1.326614269113992},
       {'A': -0.1471367936921451, 'B': -0.042388791395786884, 'C': -0.4147429364773032, 'D': -1.326614269113992}],
      dtype=object)

In [5]:
class CausalDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [6]:
batch_size = 32

train_dataset = CausalDataset(X_train, y_train)
test_dataset = CausalDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [7]:
class CausalTransformer(nn.Module):
    def __init__(self, input_dim, model_dim, num_heads, num_layers, num_classes):
        super(CausalTransformer, self).__init__()
        self.embedding = nn.Linear(input_dim, model_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dim_feedforward=128
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(model_dim, num_classes)
    
    def forward(self, x):
        # x shape: (batch_size, sequence_length)
        # Add a sequence dimension
        x = x.unsqueeze(1)  # Now x shape: (batch_size, 1, sequence_length)
        x = self.embedding(x)  # Apply embedding
        x = x.permute(1, 0, 2)  # Transformer expects input as (sequence_length, batch_size, model_dim)
        transformer_output = self.transformer_encoder(x)
        # Take the output corresponding to the first (and only) token
        output = transformer_output[0]  # Shape: (batch_size, model_dim)
        output = self.fc_out(output)
        return output


In [8]:
input_dim = X_train.shape[1]  # Number of variables
model_dim = 64
num_heads = 4
num_layers = 2
num_classes = 2  # Causal directions: 0 or 1

model = CausalTransformer(input_dim, model_dim, num_heads, num_layers, num_classes)




In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [11]:
def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(loader)

def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            total_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == y_batch).sum().item()
    accuracy = correct / len(loader.dataset)
    return total_loss / len(loader), accuracy


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 20

for epoch in range(num_epochs):
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_accuracy = validate(model, test_loader, criterion, device)
    
    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


Epoch 1/20
Train Loss: 0.7199
Validation Loss: 0.6988, Validation Accuracy: 0.4650
Epoch 2/20
Train Loss: 0.7018
Validation Loss: 0.6995, Validation Accuracy: 0.4913
Epoch 3/20
Train Loss: 0.7020
Validation Loss: 0.6985, Validation Accuracy: 0.5075
Epoch 4/20
Train Loss: 0.6967
Validation Loss: 0.7159, Validation Accuracy: 0.4775
Epoch 5/20
Train Loss: 0.6963
Validation Loss: 0.6971, Validation Accuracy: 0.5050
Epoch 6/20
Train Loss: 0.6985
Validation Loss: 0.6988, Validation Accuracy: 0.4863
Epoch 7/20
Train Loss: 0.6974
Validation Loss: 0.6981, Validation Accuracy: 0.5088


KeyboardInterrupt: 