In [1]:
import torch
import torch.nn as nn
import math
import re

In [2]:
def preprocess_game(game_str):
    """Turn one game into (move-prefix, next-move) training pairs.

    Args:
        game_str: Whitespace-separated move tokens for a single game.

    Returns:
        A list of ``(input_seq, output_move)`` tuples, one per move after the
        first. ``input_seq`` is the space-joined prefix of moves played so far
        and ``output_move`` is the move that follows it. Games with fewer than
        two tokens yield an empty list.
    """
    tokens = game_str.split()
    return [
        (' '.join(tokens[:cut]), tokens[cut])
        for cut in range(1, len(tokens))
    ]

def preprocess_file(file_path):
    """Collect training pairs from every game in a text file.

    Args:
        file_path: Path to a text file that is read line by line; each line is
            stripped and handed to ``preprocess_game`` as one game.

    Returns:
        A single flat list with the pairs of all games, in file order.
    """
    pairs = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            pairs += preprocess_game(raw_line.strip())
    return pairs

# Build the (move-prefix, next-move) training pairs from the grandmaster games file.
file_path = 'out/grandmaster.txt'
training_data = preprocess_file(file_path)
# Number of training pairs (shown as the cell output).
len(training_data)

60790649

In [None]:
# Peek at the first ten (move-prefix, next-move) pairs.
training_data[:10]

In [None]:
# Collect every distinct move token that appears in any of the game files.
vocab = set()

files = [
    'out/beginner.txt',
    'out/intermediate.txt',
    'out/master.txt',
    'out/grandmaster.txt',
]

for file_path in files:
    with open(file_path, 'r') as file:
        for line in file:
            # split() with no argument tokenizes exactly like preprocess_game
            # does and never yields '' for blank lines, trailing spaces or
            # repeated separators, so no empty-string token enters the vocab.
            vocab.update(line.split())

# Sort for a stable token -> index assignment across runs.
vocab = sorted(vocab)
print(len(vocab))

In [None]:
# Persist the vocabulary as one token per line (no trailing newline).
with open('vocab.txt', 'w') as vocab_file:
    vocab_file.write('\n'.join(vocab))

In [3]:
# Reload the saved vocabulary (one token per line). The context manager
# closes the file handle instead of leaving it to the garbage collector.
with open('vocab.txt', 'r') as file:
    vocab = file.read().splitlines()
vocab

['0-1',
 '1-0',
 '1/2-1/2',
 'B1a3',
 'B1a4',
 'B1b2',
 'B1b3',
 'B1c2',
 'B1c2+',
 'B1c3',
 'B1d2',
 'B1d3',
 'B1d4',
 'B1e2',
 'B1e3',
 'B1e4',
 'B1f2',
 'B1f3',
 'B1f3+',
 'B1g2',
 'B1g3',
 'B1h3',
 'B1xg2',
 'B2a3',
 'B2b3',
 'B2c3',
 'B2c4',
 'B2d3',
 'B2e3',
 'B2f3',
 'B2f5',
 'B2g3',
 'B2h3',
 'B2xf3',
 'B2xf3#',
 'B3b2',
 'B3b4',
 'B3b5',
 'B3c2',
 'B3c4',
 'B3c5',
 'B3d2',
 'B3d4',
 'B3e2',
 'B3e4',
 'B3f2',
 'B3f4',
 'B3g2',
 'B3g4',
 'B3h2',
 'B3h5',
 'B4b3',
 'B4c5',
 'B4d3',
 'B4d5',
 'B4e3',
 'B4e5',
 'B4e6',
 'B4f3',
 'B4f5',
 'B4f6',
 'B4g3',
 'B4g5',
 'B4xf5',
 'B5a4',
 'B5b4',
 'B5b6',
 'B5c3',
 'B5c4',
 'B5c6',
 'B5d4',
 'B5d6',
 'B5e4',
 'B5e6',
 'B5f4',
 'B5f6',
 'B5h4',
 'B5h6',
 'B6a4',
 'B6a5',
 'B6b5',
 'B6b7',
 'B6c5',
 'B6c7',
 'B6d4',
 'B6d5',
 'B6d7',
 'B6e5',
 'B6e7',
 'B6f4',
 'B6f5',
 'B6f5+',
 'B6f7',
 'B6g5',
 'B6g7',
 'B6h5',
 'B7a6',
 'B7b6',
 'B7c5+',
 'B7c6',
 'B7d4',
 'B7d5',
 'B7d6',
 'B7e6',
 'B7f5',
 'B7f6',
 'B7g5',
 'B7g6',
 'B7xf5',
 'B7xg6'

In [None]:
# Show the first 30 vocabulary entries.
vocab[:30]

In [4]:
# Lookup tables between move tokens and integer ids, built from the `vocab`
# list: a token's id is its position in that list.
move_to_index = dict(zip(vocab, range(len(vocab))))
index_to_move = dict(enumerate(vocab))

# Special tokens are appended after the real moves, so they receive the next
# two free ids in both tables (unknown first, then padding).
UNK_TOKEN, PAD_TOKEN = '<UNK>', '<PAD>'
for special_token in (UNK_TOKEN, PAD_TOKEN):
    move_to_index[special_token] = len(move_to_index)
    index_to_move[len(index_to_move)] = special_token

def encode_move(move):
    """Return the integer id of ``move``, or the ``<UNK>`` id if it is not in the table."""
    unknown_index = move_to_index[UNK_TOKEN]
    return move_to_index.get(move, unknown_index)

def decode_move(index):
    """Return the move token stored under ``index``, or ``<UNK>`` for ids with no entry."""
    if index in index_to_move:
        return index_to_move[index]
    return UNK_TOKEN

In [5]:
# Sanity check: look up the id assigned to a known move token.
encode_move('Qe4#')

8316

In [6]:
# Round-trip check: map the id from the previous cell back to its move token.
decode_move(8316)

'Qe4#'

In [7]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding for sequence-first inputs.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * freq)``, odd columns hold ``cos(pos * freq)``.

    Args:
        d_model: Size of the feature dimension (odd values are supported).
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Table of encodings, one row per position and one column per feature.
        pe = torch.zeros(max_len, d_model)

        # Positions 0..max_len-1 as a column vector so they broadcast against
        # the per-feature frequencies below.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # One frequency per sin/cos pair: 10000^(-2i / d_model), computed in
        # log space. This has ceil(d_model / 2) entries.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        # Sine on the even feature indices.
        pe[:, 0::2] = torch.sin(angles)

        # Cosine on the odd feature indices. There are only floor(d_model / 2)
        # odd columns, so for odd d_model the last frequency is dropped here;
        # assigning the full-width tensor would raise a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a singleton batch axis -> (max_len, 1, d_model), so the table
        # broadcasts over the batch dimension of a sequence-first input.
        pe = pe.unsqueeze(1)

        # A buffer is saved/loaded with the module and moved by .to(), but is
        # not a parameter and therefore not updated by the optimizer.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add positional encodings to ``x``.

        Args:
            x: Tensor of shape ``(seq_len, batch_size, d_model)``; position is
                taken from dimension 0.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Slice the table to the input's sequence length; the singleton
        # batch axis broadcasts across all samples.
        return x + self.pe[:x.size(0), :]

In [8]:
class ChessTransformer(nn.Module):
    """Transformer encoder that scores the next move for a sequence of move ids.

    The public interface is batch-first: ``forward`` takes ``(batch, seq_len)``
    token ids and returns ``(batch, seq_len, vocab_size)`` logits. This is how
    the training loop and ``predict_next_move`` index the output.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            logits per position.
        d_model: Embedding / hidden size.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers):
        super().__init__()

        # Token ids -> d_model-dimensional vectors.
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Adds position information; operates on sequence-first tensors.
        self.pos_encoder = PositionalEncoding(d_model)

        # One self-attention + feed-forward layer (sequence-first by default),
        # stacked num_encoder_layers times.
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)

        # Needed in forward() for the embedding scale factor.
        self.d_model = d_model

        # Projects encoder outputs back to vocabulary logits.
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_key_padding_mask=None):
        """Compute per-position next-move logits.

        Args:
            src: Integer tensor of token ids, shape ``(batch, seq_len)``.
            src_key_padding_mask: Optional boolean tensor of shape
                ``(batch, seq_len)`` that is ``True`` at padding positions to
                be ignored by attention. ``None`` (the default) attends to
                every position.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        # (batch, seq_len) -> (batch, seq_len, d_model), scaled by sqrt(d_model).
        src = self.embedding(src) * math.sqrt(self.d_model)

        # The positional encoder and the encoder stack both treat dimension 0
        # as the sequence axis. Without this transpose, positions would be
        # assigned per sample and attention would mix different samples of
        # the batch instead of the moves within one game.
        src = src.transpose(0, 1)  # (seq_len, batch, d_model)

        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_key_padding_mask=src_key_padding_mask)

        # Back to the batch-first layout that callers index into.
        output = output.transpose(0, 1)  # (batch, seq_len, d_model)

        return self.linear(output)

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

class ChessDataset(Dataset):
    """Dataset over (move-prefix, next-move) pairs with fixed-length inputs.

    Args:
        data: Indexable collection of ``(input_seq, target_move)`` pairs, where
            ``input_seq`` is either a space-separated string or a list of
            move tokens.
        max_seq_length: Length every encoded input sequence is brought to.
    """

    def __init__(self, data, max_seq_length=100):
        self.data = data
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target_tensor)`` for the pair at ``idx``."""
        history, next_move = self.data[idx]

        # Accept both the raw string form and an already tokenized list.
        if isinstance(history, str):
            history = history.split()

        window = self.max_seq_length
        if len(history) > window:
            # Too long: keep only the most recent moves.
            history = history[-window:]
        else:
            # Too short: pad on the left so the latest move stays last.
            padding = [PAD_TOKEN] * (window - len(history))
            history = padding + history

        # Final guard so the sequence is never longer than the window.
        history = history[:window]

        encoded_history = list(map(encode_move, history))
        input_tensor = torch.tensor(encoded_history)
        target_tensor = torch.tensor(encode_move(next_move))
        return input_tensor, target_tensor

In [10]:
# Wrap the training pairs in a Dataset (inputs fixed to 100 moves) and serve
# them in shuffled batches of 256 using 4 worker processes.
dataset = ChessDataset(data=training_data, max_seq_length=100)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=256,
    shuffle=True,
    num_workers=4,
)

In [11]:
# Training parameters
# The model needs at least len(vocab) + 2 ids, because <UNK> and <PAD> are
# appended after the real moves; the extra headroom leaves unused ids.
vocab_size = len(vocab) + 500  # Slightly larger than the actual vocabulary size
d_model = 512
nhead = 8
num_encoder_layers = 6

In [12]:
import torch.optim as optim
import gc

# Model definition
model = ChessTransformer(vocab_size, d_model, nhead, num_encoder_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Clear cache (might not need this)
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

print(f"CUDA available: {torch.cuda.is_available()}")
# The device queries below initialize CUDA and raise on CPU-only machines,
# so only run them when a GPU is actually present.
if torch.cuda.is_available():
    print(f"Current device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name()}")

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 3090


In [13]:
from tqdm import tqdm
import time
import gc

def train_model(model, dataloader, criterion, optimizer, num_epochs, device, log_interval=100, save_interval=1000):
    """Train ``model`` for next-move prediction and checkpoint periodically.

    For each batch the logits at the last sequence position
    (``outputs[:, -1, :]``) are compared against the target move ids.

    Args:
        model: Module mapping a batch of inputs to ``(batch, seq, vocab)`` logits.
        dataloader: Sized iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: Loss callable taking ``(logits, targets)``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the model and every batch are moved to.
        log_interval: Refresh the progress-bar loss every this many batches.
        save_interval: Write a checkpoint file every this many batches.

    Returns:
        None. Side effects: updates ``model``/``optimizer`` in place, prints a
        per-epoch summary and writes ``checkpoint_epoch{E}_step{S}.pth`` files
        to the current working directory.
    """
    model.to(device)
    total_steps = len(dataloader)
    # Only report GPU memory when training actually runs on a CUDA device.
    report_gpu_memory = torch.cuda.is_available() and torch.device(device).type == "cuda"

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        epoch_loss = 0.0
        start_time = time.time()

        progress_bar = tqdm(enumerate(dataloader), total=total_steps, desc=f"Epoch {epoch+1}/{num_epochs}")

        for i, (inputs, targets) in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            # Only the prediction after the final input move is trained on.
            loss = criterion(outputs[:, -1, :], targets)
            loss.backward()
            optimizer.step()

            # .item() forces a device synchronization; read it once per step.
            loss_value = loss.item()
            running_loss += loss_value
            epoch_loss += loss_value

            if (i + 1) % log_interval == 0:
                avg_loss = running_loss / log_interval
                progress_bar.set_postfix({'Loss': f'{avg_loss:.4f}'})
                running_loss = 0.0

            if (i + 1) % save_interval == 0:
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss,
                }, f'checkpoint_epoch{epoch+1}_step{i+1}.pth')

        epoch_time = time.time() - start_time
        # An empty dataloader would otherwise divide by zero.
        epoch_avg_loss = epoch_loss / total_steps if total_steps else float('nan')
        print(f"Epoch {epoch+1}/{num_epochs} completed in {epoch_time:.2f}s - Avg Loss: {epoch_avg_loss:.4f}")
        if report_gpu_memory:
            print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1e9}GB")
            # memory_cached() is deprecated; memory_reserved() is its replacement.
            print(f"GPU memory cached: {torch.cuda.memory_reserved()/1e9}GB")

In [None]:
num_epochs = 10
log_interval = 100  # Log every 100 batches
save_interval = 1000  # Save checkpoint every 1000 batches

print("Starting training loop...")
# Plain assignment: the previous `=-` stored the negated start time, which
# made the reported duration end + start instead of end - start.
total_start_time = time.perf_counter()

train_model(model, dataloader, criterion, optimizer, num_epochs, device, log_interval, save_interval)

total_end_time = time.perf_counter()
total_duration = total_end_time - total_start_time
print(f"Total training time: {total_duration:.2f} seconds")

Starting training loop...


Epoch 1/10:   0%|                                                                                                | 115/237464 [01:38<508:19:02,  7.71s/it, Loss=6.8377]

In [None]:
# Inspect one sample: the (input_tensor, target_tensor) pair returned by ChessDataset.__getitem__.
dataset[1]

In [None]:
# Inference function
def predict_next_move(model, move_sequence, max_seq_length=100):
    """Predict the most likely next move for a game given as a string.

    The input is windowed the same way ``ChessDataset.__getitem__`` prepares
    training samples: only the most recent ``max_seq_length`` moves are kept
    and shorter sequences are left-padded with ``PAD_TOKEN``. The model then
    sees the last move at the same position as during training.

    Args:
        model: Trained model returning ``(batch, seq, vocab)`` logits. It is
            switched to eval mode and left in it.
        move_sequence: Space-separated moves played so far.
        max_seq_length: Window length used at training time (the notebook
            builds its dataset with 100). Pass ``None`` to feed the sequence
            without truncation or padding.

    Returns:
        The decoded move token with the highest logit at the last position.

    Raises:
        ValueError: If ``max_seq_length`` is not positive, or if there is
            nothing to feed the model (empty sequence with padding disabled).
    """
    if max_seq_length is not None and max_seq_length <= 0:
        raise ValueError("max_seq_length must be a positive integer or None")

    moves = move_sequence.split()
    if max_seq_length is not None:
        if len(moves) > max_seq_length:
            moves = moves[-max_seq_length:]
        else:
            moves = [PAD_TOKEN] * (max_seq_length - len(moves)) + moves
    if not moves:
        raise ValueError("move_sequence must contain at least one move")

    # Run on whatever device the model lives on rather than a notebook global.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    model.eval()
    with torch.no_grad():
        encoded = [encode_move(m) for m in moves]
        # Explicit integer dtype: embedding lookups require integer ids.
        input_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(target_device)
        output = model(input_tensor)
        predicted_move_index = output[0, -1, :].argmax().item()
        return decode_move(predicted_move_index)

# Example usage: ask the model for the move following the single opening move "e4".
game_so_far = "e4"
next_move = predict_next_move(model, game_so_far)
print(f"Predicted next move: {next_move}")