In [1]:
!pip install transformers peft datasets accelerate torch pandas scikit-learn chess

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting chess
  Downloading chess-1.11.1.tar.gz (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.6 MB/s[0m eta [3

Supervised(Ignore)

In [2]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset

# Dataset Preparation
class ChessDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = item["input"]
        output_text = item["output"]
        inputs = self.tokenizer(
            input_text, truncation=True, padding="max_length", max_length=self.max_length
        )
        labels = self.tokenizer(
            output_text, truncation=True, padding="max_length", max_length=self.max_length
        ).input_ids

        inputs["labels"] = labels
        return {key: torch.tensor(val) for key, val in inputs.items()}

# Parsing the Dataset
def parse_chess_games(file_path):
    data = []
    count =0
    with open(file_path, "r") as file:
        lines = file.readlines()
    for line in lines:
        if "###" in line:
            if count >100:
                break
            count = count+1
            raw_moves = line.split("###")[1].strip()
            moves = re.findall(r"W\d+\.[^B]+ B\d+\.[^W]+", raw_moves)
            if len(moves) < 2:
                continue
            inputs = " ".join(moves[:-1])
            next_move = moves[-1]
            if not inputs or not next_move:  # Ensure non-empty inputs and outputs
                continue
            data.append({"input": inputs, "output": next_move})
    return pd.DataFrame(data)

# Load and Prepare Data
file_path = "/content/all_with_filtered_anotations_since1998 copy.txt"
raw_data = parse_chess_games(file_path)
print("i am here 1")
# Split into train and validation sets
train_data = raw_data.sample(frac=0.8, random_state=42)
val_data = raw_data.drop(train_data.index)
print(f"Dataset Size: {len(train_data)} training samples")
print("Example Data:", train_data.iloc[0])  # Check the structure of the first sample
print("i am here 2")
# Model and Tokenizer Setup
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(model_name)
print("i am here 3")
# LoRA Configuration
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["c_attn"], lora_dropout=0.1
)
model = get_peft_model(base_model, lora_config)

# Create Datasets
train_dataset = ChessDataset(train_data, tokenizer)
val_dataset = ChessDataset(val_data, tokenizer)
print("i am here 4")
# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
)

# Trainer Initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)
print("i am here 5")
# Fine-Tune the Model
trainer.train()
print("i am here 6")
# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Inference Example
def predict_next_move(model, tokenizer, input_moves):
    inputs = tokenizer(input_moves, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
input_moves = "W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3"
print(predict_next_move(model, tokenizer, input_moves))


i am here 1
Dataset Size: 80 training samples
Example Data: input     W1.e4 B1.c5  W2.Nf3 B2.d6  W5.c4 B5.Nc6  W6.Nc...
output                                      W61.Kf6 B61.d4 
Name: 83, dtype: object
i am here 2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

i am here 3
i am here 4


  trainer = Trainer(


i am here 5


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


i am here 6


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 Nf6? { (-1.67 → -1.67) Mistake. Best move was Nf6. } (Nf6, Nf6) .. { (-1.67 → -1.67) Mistake.


In [12]:
input_moves = "W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 B3.Nf6 W4."
print(predict_next_move(model, tokenizer, input_moves))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 B3.Nf6 W4.Nd3 B5.


In [10]:
def predict_next_move(model, tokenizer, input_moves):
    inputs = tokenizer(input_moves, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=6)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

UnSupervised(Ignore) no cuda

In [13]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset

# Dataset Preparation - Parse Chess Game Moves as Sequences
def parse_chess_games(file_path, limit=25000):
    data = []
    with open(file_path, "r") as file:
        lines = file.readlines()
    for count, line in enumerate(lines):
        if "###" in line:
            if count >= limit:
                break
            raw_moves = line.split("###")[1].strip()
            moves = re.findall(r"W\d+\.[^B]+ B\d+\.[^W]+", raw_moves)
            if not moves:  # Skip if no moves found
                continue
            # Combine all moves into a single sequence
            game_sequence = " ".join(moves).strip()
            if game_sequence:  # Ensure non-empty
                data.append(game_sequence)
    return pd.DataFrame(data, columns=["text"])

# Dataset Class for Language Modeling
class ChessDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]["text"]
        # Tokenize the entire sequence
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = tokenized.input_ids.squeeze()  # Remove batch dimension
        attention_mask = tokenized.attention_mask.squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

# Load and Prepare Data
file_path = "/content/all_with_filtered_anotations_since1998 copy.txt"
raw_data = parse_chess_games(file_path)
train_data = raw_data.sample(frac=0.8, random_state=42)
val_data = raw_data.drop(train_data.index)
print(f"Dataset Size: {len(train_data)} training samples")
print("Example Data:", train_data.iloc[0])
# Model and Tokenizer Setup
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA Configuration for Efficient Fine-Tuning
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["c_attn"], lora_dropout=0.1
)
model = get_peft_model(base_model, lora_config)

# Create Datasets
train_dataset = ChessDataset(train_data, tokenizer)
val_dataset = ChessDataset(val_data, tokenizer)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
)

# Trainer Initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Fine-Tune the Model
trainer.train()

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model_unsupe")
tokenizer.save_pretrained("./fine_tuned_model_unsupe")




  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Predicted moves: W1.d4 B1.d5 W2.c4 B2.e6 W3.Nc3 Nf6 4.Nf3 Nf


In [20]:
#80 samples 3 epochs
# Inference: Predict the Next Move Given a Sequence of Moves
def predict_next_move(model, tokenizer, input_moves, max_new_tokens=3):
    inputs = tokenizer(input_moves, return_tensors="pt").input_ids
    outputs = model.generate(inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
input_moves = "W1.d4 B1.Nf6 W2.Nf3 B2.g6 W3.g3 B3.Bg7 W4.Bg2 B4.O-O W5.O-O B5.d6 W6.Nbd2 B6."
predicted_moves = predict_next_move(model, tokenizer, input_moves)
print(f"Predicted moves: {predicted_moves}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Predicted moves: W1.d4 B1.Nf6 W2.Nf3 B2.g6 W3.g3 B3.Bg7 W4.Bg2 B4.O-O W5.O-O B5.d6 W6.Nbd2 B6.Nf3


In [1]:
!pip install transformers peft datasets accelerate torch pandas scikit-learn chess




Unsupervised **Cuda**

In [3]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator

# Dataset Preparation - Parse Chess Game Moves as Sequences
def parse_chess_games(file_path, limit=25000):
    data = []
    with open(file_path, "r") as file:
        lines = file.readlines()
    for count, line in enumerate(lines):
        if "###" in line:
            if count >= limit:
                break
            raw_moves = line.split("###")[1].strip()
            moves = re.findall(r"W\d+\.[^B]+ B\d+\.[^W]+", raw_moves)
            if not moves:  # Skip if no moves found
                continue
            # Combine all moves into a single sequence
            game_sequence = " ".join(moves).strip()
            if game_sequence:  # Ensure non-empty
                data.append(game_sequence)
    return pd.DataFrame(data, columns=["text"])

# Dataset Class for Language Modeling
class ChessDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]["text"]
        # Tokenize the entire sequence
        tokenized = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        input_ids = tokenized.input_ids.squeeze()  # Remove batch dimension
        attention_mask = tokenized.attention_mask.squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": input_ids}

# Load and Prepare Data
file_path = "/content/all_with_filtered_anotations_since1998 copy.txt"
raw_data = parse_chess_games(file_path)
train_data = raw_data.sample(frac=0.8, random_state=42)
val_data = raw_data.drop(train_data.index)
print(f"Dataset Size: {len(train_data)} training samples")
print("Example Data:", train_data.iloc[0])

# Model and Tokenizer Setup
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(model_name)

# LoRA Configuration for Efficient Fine-Tuning
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["c_attn"], lora_dropout=0.1
)
model = get_peft_model(base_model, lora_config)

# Create Datasets
train_dataset = ChessDataset(train_data, tokenizer)
val_dataset = ChessDataset(val_data, tokenizer)

# Accelerator setup (optional)
accelerator = Accelerator()

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
    # Enable mixed precision for T4 GPU
    fp16=True,
)

# Initialize Trainer (without the accelerator argument)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Use accelerator to prepare model and datasets for multi-device/multi-GPU setups
model, train_dataset, val_dataset = accelerator.prepare(
    model, train_dataset, val_dataset
)

# Fine-Tune the Model
trainer.train()

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model_unsupe")
tokenizer.save_pretrained("./fine_tuned_model_unsupe")


Dataset Size: 19927 training samples
Example Data: text    W1.e4 B1.c5  W2.Nf3 B2.Nc6  W3.Nc3 B3.e6  W4.d...
Name: 20883, dtype: object


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.718,No log
2,0.6604,No log
3,0.6323,No log
4,0.6178,No log
5,0.6071,No log
6,0.6001,No log
7,0.5942,No log


('./fine_tuned_model_unsupe/tokenizer_config.json',
 './fine_tuned_model_unsupe/special_tokens_map.json',
 './fine_tuned_model_unsupe/vocab.json',
 './fine_tuned_model_unsupe/merges.txt',
 './fine_tuned_model_unsupe/added_tokens.json',
 './fine_tuned_model_unsupe/tokenizer.json')

Prediction with Cuda

In [None]:
def predict_next_move(model, tokenizer, input_moves, max_new_tokens=100):
    # Move model to the GPU
    device = model.device

    # Tokenize the input moves
    inputs = tokenizer(input_moves, return_tensors="pt", padding=True, truncation=True)

    # Move the input tensors to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)  # Ensure attention mask is also on the correct device

    # Ensure pad_token_id is set correctly for open-ended generation
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    # Generate the next tokens
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id  # Set pad_token_id
    )

    # Decode and return the predicted moves
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
input_moves = "W1.e4 B1.c5 W2.Nf3 B2.e6 W3.Nc3 B3.a6 W4.d4 B4.cxd4 W5.Nxd4 B5.Nf6"
predicted_moves = predict_next_move(model, tokenizer, input_moves)
print(f"Predicted moves: {predicted_moves}")


# Loading Saved Model and predicting

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Path to the saved model directory
model_path = "/content/fine_tuned_model_unsupe"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Ensure pad_token_id is correctly set for open-ended generation
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)




GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace

In [15]:
# Function to predict the next move
def predict_next_move(model, tokenizer, input_moves, max_new_tokens=3):
    # Move model to the GPU
    device = model.device

    # Tokenize the input moves
    inputs = tokenizer(input_moves, return_tensors="pt", padding=True, truncation=True)

    # Move the input tensors to the same device as the model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)  # Ensure attention mask is also on the correct device

    # Ensure pad_token_id is set correctly for open-ended generation
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id

    # Generate the next tokens
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=pad_token_id  # Set pad_token_id
    )

    # Decode and return the predicted moves
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example Usage
input_moves = "W1.e4 B1.c5 W2.Nf3 B2.d6 W3.Ne5 B3.dxe5 W4."
predicted_moves = predict_next_move(model, tokenizer, input_moves)
print(f"Predicted moves: {predicted_moves}")

Predicted moves: W1.e4 B1.c5 W2.Nf3 B2.d6 W3.Ne5 B3.dxe5 W4.Nxe5
