In [None]:
import os
import math
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Read only the first 5000 rows of the games file and keep just the
# 'Result' and 'AN' columns.
final_df = pd.read_csv('final_chess_games.csv', nrows=5000).loc[:, ['Result', 'AN']]

# Preprocess the data
def preprocess_data(df):
    """Split each row's 'AN' text into whitespace-separated tokens.

    Args:
        df: DataFrame with an 'AN' column of strings.

    Returns:
        A list with one list of tokens per row of ``df``, in row order.
    """
    return [move_text.strip().split() for move_text in df['AN']]

print("Preprocessing data...")
sequences = preprocess_data(final_df)
print("Data preprocessing completed.")

# Build vocabulary
# Tokens are sorted before being numbered. Iterating a set of strings gives a
# different order from one interpreter run to the next (string hashing is
# randomized), which would change every token id and make a saved checkpoint
# unusable with a vocabulary rebuilt in a later session.
all_moves = {move for seq in sequences for move in seq}
vocab = {move: idx for idx, move in enumerate(sorted(all_moves))}
vocab['<pad>'] = len(vocab)  # Add padding token (takes the last id)
vocab_size = len(vocab)

# Determine the maximum sequence length
# One less than the longest game: the longest sequence this value is meant to
# bound is a game with a single token dropped.
max_length = max(len(seq) for seq in sequences) - 1

# Define the dataset
class ChessDataset(Dataset):
    """Next-token training pairs built from tokenized games.

    Item ``idx`` is a pair ``(input_ids, target_ids)`` of 1-D ``torch.long``
    tensors, each exactly ``max_length`` long. The input is the game without
    its last token and the target is the game without its first token, so
    ``target_ids[i]`` is the token that follows ``input_ids[i]``.

    Tokens missing from ``vocab`` are encoded as the ``'<pad>'`` id. Sequences
    shorter than ``max_length`` are right-padded with that id; longer ones are
    truncated so every item has the same length and can be batched.

    Args:
        sequences: Indexable collection of token lists, one per game.
        vocab: Mapping from token to integer id; must contain ``'<pad>'``.
        max_length: Length of every returned tensor.
    """

    def __init__(self, sequences, vocab, max_length):
        self.sequences = sequences
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def _encode(self, tokens):
        """Map tokens to ids, then truncate/pad to exactly ``max_length``."""
        pad_id = self.vocab['<pad>']
        ids = [self.vocab.get(token, pad_id) for token in tokens[:self.max_length]]
        ids.extend([pad_id] * (self.max_length - len(ids)))
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        return self._encode(sequence[:-1]), self._encode(sequence[1:])

print("Creating datasets...")
# Hold out 20% of the games for validation; the fixed seed keeps the split
# the same on every run.
train_sequences, val_sequences = train_test_split(
    sequences,
    test_size=0.2,
    random_state=42,
)
train_dataset, val_dataset = (
    ChessDataset(split, vocab, max_length)
    for split in (train_sequences, val_sequences)
)
print("Datasets created.")

# Data loaders: only the training data is reshuffled each epoch.
batch_size = 8  # Adjust batch size if necessary
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
print("Data loaders initialized.")

# Define positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a tensor, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as a
    non-trainable buffer of shape ``(max_len, 1, d_model)``. ``forward`` takes
    the position from dimension 0 of its input, so the input is expected to be
    laid out as ``(seq_len, batch, d_model)`` with ``seq_len <= max_len``.

    Args:
        d_model: Size of the last (feature) dimension; may be even or odd.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(-math.log(10000.0) * torch.arange(0, d_model, 2).float() / d_model)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than there
        # are frequency terms, so drop the surplus term instead of failing on
        # a shape mismatch. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a singleton batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Define the Transformer model
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that maps token ids to vocabulary logits.

    Source and target ids share one embedding table. Embeddings are scaled by
    ``sqrt(d_model)`` and passed through ``PositionalEncoding`` before entering
    ``nn.Transformer``; a final linear layer projects decoder outputs to
    ``vocab_size`` scores. Tensors are sequence-first (``nn.Transformer`` is
    built with its default layout).

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / model width.
        nhead: Number of attention heads.
        num_layers: Layer count used for both the encoder and the decoder.
        dim_feedforward: Hidden width of the feed-forward sublayers.
        dropout: Dropout probability for positional encoding and transformer.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        transformer_config = {
            'd_model': d_model,
            'nhead': nhead,
            'num_encoder_layers': num_layers,
            'num_decoder_layers': num_layers,
            'dim_feedforward': dim_feedforward,
            'dropout': dropout,
        }
        self.transformer = nn.Transformer(**transformer_config)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: 0 on/below the diagonal, -inf above."""
        return torch.full((sz, sz), float('-inf')).triu(diagonal=1)

    def _embed(self, token_ids):
        """Embed ids, scale by sqrt(d_model) and add positional encodings."""
        scaled = self.embedding(token_ids) * math.sqrt(self.d_model)
        return self.positional_encoding(scaled)

    def forward(self, src, tgt):
        """Return logits for every target position.

        Args:
            src: Source token ids, sequence-first.
            tgt: Target (decoder input) token ids, sequence-first.

        Returns:
            Tensor whose last dimension has size ``vocab_size``.
        """
        src_emb = self._embed(src)
        tgt_emb = self._embed(tgt)

        # Causal mask: a target position may attend only to itself and earlier ones.
        causal_mask = self.generate_square_subsequent_mask(tgt_emb.size(0)).to(tgt_emb.device)
        decoded = self.transformer(src_emb, tgt_emb, tgt_mask=causal_mask)
        return self.fc_out(decoded)

# Hyperparameters
d_model = 512
nhead = 8
num_layers = 6
dim_feedforward = 2048
dropout = 0.1

print("Initializing model, loss function, and optimizer...")
model = TransformerModel(vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout)
# Target positions holding the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)
print("Model, loss function, and optimizer initialized.")

# Training parameters
num_epochs = 10
checkpoint_interval = 2  # Save checkpoint every 2 epochs
accumulation_steps = 4  # Number of batches to accumulate gradients

# Device configuration
device = torch.device("cpu")
print("Using device:", device)
model.to(device)

print("Starting training...")
for epoch in range(num_epochs):
    epoch_start_time = time.time()
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for batch_idx, (src, tgt) in enumerate(train_loader):
        batch_start_time = time.time()
        # The loader yields (batch, seq) tensors; the model takes position
        # from dimension 0, so swap the first two axes.
        src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)

        # Forward pass: the decoder is fed tgt without its last step and is
        # scored against tgt without its first step.
        # NOTE(review): src is the full padded input sequence and reaches the
        # encoder without any mask, so the encoder can see the tokens being
        # predicted -- confirm this is intended.
        output = model(src, tgt[:-1, :])
        loss = criterion(output.view(-1, vocab_size), tgt[1:, :].reshape(-1))
        # NOTE(review): this halves the training loss that is logged and
        # averaged below; the validation loss is not scaled, so the two
        # numbers are not directly comparable.
        loss = 0.5 * loss  # Scale the loss by half

        # Backward pass (divide so a full accumulation group averages its batches)
        (loss / accumulation_steps).backward()

        # Step after every full accumulation group, and also after the last
        # batch of the epoch: otherwise the gradients of a trailing partial
        # group would never be applied and would be wiped by the zero_grad()
        # at the start of the next epoch.
        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        batch_elapsed_time = time.time() - batch_start_time
        print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}, Batch Time: {batch_elapsed_time:.2f}s")

    epoch_elapsed_time = time.time() - epoch_start_time
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} training completed. Average Loss: {avg_loss:.4f}. Elapsed Time: {epoch_elapsed_time:.2f} seconds")

    # Validation step (no gradient tracking, dropout disabled via eval())
    validation_start_time = time.time()
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)

            output = model(src, tgt[:-1, :])
            loss = criterion(output.view(-1, vocab_size), tgt[1:, :].reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    validation_elapsed_time = time.time() - validation_start_time
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}. Validation Time: {validation_elapsed_time:.2f} seconds")

    if (epoch + 1) % checkpoint_interval == 0:
        checkpoint_path = f'transformer_model_epoch_{epoch+1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

# Save the final model
torch.save(model.state_dict(), 'transformer_model_final.pth')
print("Training completed and model saved.")

# Function to generate moves
def generate_moves(model, start_sequence, vocab, max_length=50):
    """Greedily extend a token sequence with the model's most likely next tokens.

    At each step the whole sequence so far is passed as the source and its
    last token as the decoder input; the highest-scoring token is appended.
    Generation stops after ``max_length`` new tokens or as soon as the model
    emits ``'<pad>'`` (which is still included in the returned list).

    Inference runs under ``torch.no_grad()`` and tensors are created on the
    device of the model's parameters (CPU if it has none).

    Args:
        model: Callable ``model(src, tgt)`` returning per-position scores over
            the vocabulary, sequence-first; also provides ``eval()`` and
            ``parameters()``.
        start_sequence: Non-empty sequence of tokens to continue; not modified.
            Tokens missing from ``vocab`` are encoded as the ``'<pad>'`` id.
        vocab: Mapping from token to integer id; must contain ``'<pad>'``.
        max_length: Maximum number of tokens to generate.

    Returns:
        A new list: the start tokens followed by the generated tokens.
    """
    model.eval()
    pad_id = vocab['<pad>']

    # Build the id -> token table once instead of scanning the vocabulary for
    # every generated token. setdefault keeps the first token seen for an id.
    id_to_move = {}
    for move, idx in vocab.items():
        id_to_move.setdefault(idx, move)

    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    generated_moves = list(start_sequence)
    input_seq = [vocab.get(move, pad_id) for move in generated_moves]

    # No gradients are needed for generation; skipping them saves memory and time.
    with torch.no_grad():
        for _ in range(max_length):
            input_tensor = torch.tensor(input_seq, dtype=torch.long, device=model_device).unsqueeze(1)
            tgt_input = input_tensor[-1:, :]
            output = model(input_tensor, tgt_input)
            next_move_idx = output.argmax(dim=-1)[-1, 0].item()
            next_move = id_to_move[next_move_idx]
            generated_moves.append(next_move)

            if next_move == '<pad>':
                break

            input_seq.append(next_move_idx)

    return generated_moves

# Restore the final saved weights, mapping stored tensors onto `device`.
model.load_state_dict(
    torch.load('transformer_model_final.pth', map_location=device)
)
model.to(device)

# Demo: ask the model to continue a short opening.
start_sequence = ['1.', 'e4', 'e5', '2.', 'Nf3', 'Nc6']
generated_moves = generate_moves(model=model, start_sequence=start_sequence, vocab=vocab)
# print's default separator is a single space, so unpacking the tokens
# produces the same line as joining them with ' '.
print("Generated moves:", *generated_moves)

Preprocessing data...
Data preprocessing completed.
Creating datasets...
Datasets created.
Data loaders initialized.
Initializing model, loss function, and optimizer...




Model, loss function, and optimizer initialized.
Using device: cpu
Starting training...


In [None]:
import os
import math
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Read only the first 5000 rows of the games file and keep just the
# 'Result' and 'AN' columns.
final_df = pd.read_csv('final_chess_games.csv', nrows=5000).loc[:, ['Result', 'AN']]

# Preprocess the data
def preprocess_data(df):
    """Split each row's 'AN' text into whitespace-separated tokens.

    Args:
        df: DataFrame with an 'AN' column of strings.

    Returns:
        A list with one list of tokens per row of ``df``, in row order.
    """
    return [move_text.strip().split() for move_text in df['AN']]

print("Preprocessing data...")
sequences = preprocess_data(final_df)
print("Data preprocessing completed.")

# Build vocabulary
# Tokens are sorted before being numbered. Iterating a set of strings gives a
# different order from one interpreter run to the next (string hashing is
# randomized), which would change every token id and make a saved checkpoint
# unusable with a vocabulary rebuilt in a later session.
all_moves = {move for seq in sequences for move in seq}
vocab = {move: idx for idx, move in enumerate(sorted(all_moves))}
vocab['<pad>'] = len(vocab)  # Add padding token (takes the last id)
vocab_size = len(vocab)

# Determine the maximum sequence length
# One less than the longest game: the longest sequence this value is meant to
# bound is a game with a single token dropped.
max_length = max(len(seq) for seq in sequences) - 1

# Define the dataset
class ChessDataset(Dataset):
    """Next-token training pairs built from tokenized games.

    Item ``idx`` is a pair ``(input_ids, target_ids)`` of 1-D ``torch.long``
    tensors, each exactly ``max_length`` long. The input is the game without
    its last token and the target is the game without its first token, so
    ``target_ids[i]`` is the token that follows ``input_ids[i]``.

    Tokens missing from ``vocab`` are encoded as the ``'<pad>'`` id. Sequences
    shorter than ``max_length`` are right-padded with that id; longer ones are
    truncated so every item has the same length and can be batched.

    Args:
        sequences: Indexable collection of token lists, one per game.
        vocab: Mapping from token to integer id; must contain ``'<pad>'``.
        max_length: Length of every returned tensor.
    """

    def __init__(self, sequences, vocab, max_length):
        self.sequences = sequences
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.sequences)

    def _encode(self, tokens):
        """Map tokens to ids, then truncate/pad to exactly ``max_length``."""
        pad_id = self.vocab['<pad>']
        ids = [self.vocab.get(token, pad_id) for token in tokens[:self.max_length]]
        ids.extend([pad_id] * (self.max_length - len(ids)))
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        return self._encode(sequence[:-1]), self._encode(sequence[1:])

print("Creating datasets...")
# Hold out 20% of the games for validation; the fixed seed keeps the split
# the same on every run.
train_sequences, val_sequences = train_test_split(
    sequences,
    test_size=0.2,
    random_state=42,
)
train_dataset, val_dataset = (
    ChessDataset(split, vocab, max_length)
    for split in (train_sequences, val_sequences)
)
print("Datasets created.")

# Data loaders: only the training data is reshuffled each epoch.
batch_size = 8  # Adjust batch size if necessary
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
print("Data loaders initialized.")

# Define positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a tensor, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as a
    non-trainable buffer of shape ``(max_len, 1, d_model)``. ``forward`` takes
    the position from dimension 0 of its input, so the input is expected to be
    laid out as ``(seq_len, batch, d_model)`` with ``seq_len <= max_len``.

    Args:
        d_model: Size of the last (feature) dimension; may be even or odd.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(-math.log(10000.0) * torch.arange(0, d_model, 2).float() / d_model)
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd-indexed column than there
        # are frequency terms, so drop the surplus term instead of failing on
        # a shape mismatch. For even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Insert a singleton batch axis so the table broadcasts over the batch.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``."""
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

# Define the Transformer model
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer that maps token ids to vocabulary logits.

    Source and target ids share one embedding table. Embeddings are scaled by
    ``sqrt(d_model)`` and passed through ``PositionalEncoding`` before entering
    ``nn.Transformer``; a final linear layer projects decoder outputs to
    ``vocab_size`` scores. Tensors are sequence-first (``nn.Transformer`` is
    built with its default layout).

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / model width.
        nhead: Number of attention heads.
        num_layers: Layer count used for both the encoder and the decoder.
        dim_feedforward: Hidden width of the feed-forward sublayers.
        dropout: Dropout probability for positional encoding and transformer.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, dropout)
        transformer_config = {
            'd_model': d_model,
            'nhead': nhead,
            'num_encoder_layers': num_layers,
            'num_decoder_layers': num_layers,
            'dim_feedforward': dim_feedforward,
            'dropout': dropout,
        }
        self.transformer = nn.Transformer(**transformer_config)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def generate_square_subsequent_mask(self, sz):
        """Return an ``(sz, sz)`` float mask: 0 on/below the diagonal, -inf above."""
        return torch.full((sz, sz), float('-inf')).triu(diagonal=1)

    def _embed(self, token_ids):
        """Embed ids, scale by sqrt(d_model) and add positional encodings."""
        scaled = self.embedding(token_ids) * math.sqrt(self.d_model)
        return self.positional_encoding(scaled)

    def forward(self, src, tgt):
        """Return logits for every target position.

        Args:
            src: Source token ids, sequence-first.
            tgt: Target (decoder input) token ids, sequence-first.

        Returns:
            Tensor whose last dimension has size ``vocab_size``.
        """
        src_emb = self._embed(src)
        tgt_emb = self._embed(tgt)

        # Causal mask: a target position may attend only to itself and earlier ones.
        causal_mask = self.generate_square_subsequent_mask(tgt_emb.size(0)).to(tgt_emb.device)
        decoded = self.transformer(src_emb, tgt_emb, tgt_mask=causal_mask)
        return self.fc_out(decoded)

# Hyperparameters
d_model = 512
nhead = 8
num_layers = 6
dim_feedforward = 2048
dropout = 0.1

print("Initializing model, loss function, and optimizer...")
model = TransformerModel(vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout)
# Target positions holding the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.001)
print("Model, loss function, and optimizer initialized.")

# Training parameters
num_epochs = 10
checkpoint_interval = 2  # Save checkpoint every 2 epochs
accumulation_steps = 4  # Number of batches to accumulate gradients
patience = 3  # Early stopping patience

# Device configuration
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using device:", device)
model.to(device)

print("Starting training...")
best_val_loss = float('inf')
counter = 0  # consecutive epochs without a validation improvement
for epoch in range(num_epochs):
    epoch_start_time = time.time()
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for batch_idx, (src, tgt) in enumerate(train_loader):
        batch_start_time = time.time()
        # The loader yields (batch, seq) tensors; the model takes position
        # from dimension 0, so swap the first two axes.
        src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)

        # Forward pass: the decoder is fed tgt without its last step and is
        # scored against tgt without its first step.
        # NOTE(review): src is the full padded input sequence and reaches the
        # encoder without any mask, so the encoder can see the tokens being
        # predicted -- confirm this is intended.
        output = model(src, tgt[:-1, :])
        loss = criterion(output.view(-1, vocab_size), tgt[1:, :].reshape(-1))
        # NOTE(review): this halves the training loss that is logged and
        # averaged below; the validation loss is not scaled, so the two
        # numbers are not directly comparable.
        loss = 0.5 * loss  # Scale the loss by half

        # Backward pass (divide so a full accumulation group averages its batches)
        (loss / accumulation_steps).backward()

        # Step after every full accumulation group, and also after the last
        # batch of the epoch: otherwise the gradients of a trailing partial
        # group would never be applied and would be wiped by the zero_grad()
        # at the start of the next epoch.
        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()

        total_loss += loss.item()
        batch_elapsed_time = time.time() - batch_start_time
        print(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}, Batch Time: {batch_elapsed_time:.2f}s")

    epoch_elapsed_time = time.time() - epoch_start_time
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} training completed. Average Loss: {avg_loss:.4f}. Elapsed Time: {epoch_elapsed_time:.2f} seconds")

    # Validation step (no gradient tracking, dropout disabled via eval())
    validation_start_time = time.time()
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.transpose(0, 1).to(device), tgt.transpose(0, 1).to(device)

            output = model(src, tgt[:-1, :])
            loss = criterion(output.view(-1, vocab_size), tgt[1:, :].reshape(-1))
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    validation_elapsed_time = time.time() - validation_start_time
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}. Validation Time: {validation_elapsed_time:.2f} seconds")

    # Early stopping check: keep the weights with the lowest validation loss
    # and stop after `patience` consecutive epochs without improvement.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print("Validation loss improved. Model saved.")
    else:
        counter += 1
        print(f"No improvement in validation loss for {counter} epoch(s).")
        if counter >= patience:
            print("Early stopping triggered.")
            break

    if (epoch + 1) % checkpoint_interval == 0:
        checkpoint_path = f'transformer_model_epoch_{epoch+1}.pth'
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

# Load the best model
# best_val_loss leaves its initial inf only when 'best_model.pth' was written
# during this run. If that never happened (e.g. the validation loss was NaN
# every epoch, or num_epochs is 0), loading would either fail or silently pick
# up a stale file from an earlier run, so keep the in-memory weights instead.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    print("Training completed and best model loaded.")
else:
    print("Training completed, but validation loss never improved; keeping current weights.")

# Function to generate moves
def generate_moves(model, start_sequence, vocab, max_length=50):
    """Greedily extend a token sequence with the model's most likely next tokens.

    At each step the whole sequence so far is passed as the source and its
    last token as the decoder input; the highest-scoring token is appended.
    Generation stops after ``max_length`` new tokens or as soon as the model
    emits ``'<pad>'`` (which is still included in the returned list).

    Inference runs under ``torch.no_grad()`` and tensors are created on the
    device of the model's parameters (CPU if it has none).

    Args:
        model: Callable ``model(src, tgt)`` returning per-position scores over
            the vocabulary, sequence-first; also provides ``eval()`` and
            ``parameters()``.
        start_sequence: Non-empty sequence of tokens to continue; not modified.
            Tokens missing from ``vocab`` are encoded as the ``'<pad>'`` id.
        vocab: Mapping from token to integer id; must contain ``'<pad>'``.
        max_length: Maximum number of tokens to generate.

    Returns:
        A new list: the start tokens followed by the generated tokens.
    """
    model.eval()
    pad_id = vocab['<pad>']

    # Build the id -> token table once instead of scanning the vocabulary for
    # every generated token. setdefault keeps the first token seen for an id.
    id_to_move = {}
    for move, idx in vocab.items():
        id_to_move.setdefault(idx, move)

    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    generated_moves = list(start_sequence)
    input_seq = [vocab.get(move, pad_id) for move in generated_moves]

    # No gradients are needed for generation; skipping them saves memory and time.
    with torch.no_grad():
        for _ in range(max_length):
            input_tensor = torch.tensor(input_seq, dtype=torch.long, device=model_device).unsqueeze(1)
            tgt_input = input_tensor[-1:, :]
            output = model(input_tensor, tgt_input)
            next_move_idx = output.argmax(dim=-1)[-1, 0].item()
            next_move = id_to_move[next_move_idx]
            generated_moves.append(next_move)

            if next_move == '<pad>':
                break

            input_seq.append(next_move_idx)

    return generated_moves

# Demo: ask the model to continue a short opening.
start_sequence = ['1.', 'e4', 'e5', '2.', 'Nf3', 'Nc6']
generated_moves = generate_moves(model=model, start_sequence=start_sequence, vocab=vocab)
# print's default separator is a single space, so unpacking the tokens
# produces the same line as joining them with ' '.
print("Generated moves:", *generated_moves)

Preprocessing data...
Data preprocessing completed.
Creating datasets...
Datasets created.
Data loaders initialized.
Initializing model, loss function, and optimizer...




Model, loss function, and optimizer initialized.
Using device: cpu
Starting training...
