In [1]:
%%capture
# Install cell: sets the Unsloth vLLM standby flag, then installs unsloth + vllm
# directly when the notebook is NOT running on Colab.
import os
os.environ["UNSLOTH_VLLM_STANDBY"] = "1" # [NEW] Extra 30% context lengths!
# Colab is detected by looking for "COLAB_" in the concatenated environment-variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install or uv pip install
    # NOTE(review): the later "Colab Extra Install" cell repeats this same install on
    # non-Colab machines, and neither install is version-pinned — confirm this is intended.
    !pip install unsloth vllm
else:
    pass # For Colab / Kaggle, we need extra instructions hidden below \/

In [2]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Chess tooling: the python-chess package (via pip) and the Stockfish engine (via apt).
# NOTE(review): neither install is version-pinned — confirm whether pinning is wanted
# for reproducibility.
!pip install python-chess
!apt-get install stockfish

In [3]:
#@title Colab Extra Install { display-mode: "form" }
%%capture
# Installs the training stack. Outside Colab a plain `pip install` is used; on Colab
# the versions are chosen from what is already installed and from the detected GPU.
import os
!pip install --upgrade -qqq uv
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm
else:
    # Request exactly the numpy / pillow versions that are already importable;
    # fall back to unpinned names when either import fails.
    # `except Exception` (instead of a bare `except`) keeps the best-effort fallback
    # without swallowing KeyboardInterrupt / SystemExit.
    try:
        import numpy, PIL
        get_numpy = f"numpy=={numpy.__version__}"
        get_pil = f"pillow=={PIL.__version__}"
    except Exception:
        get_numpy = "numpy"
        get_pil = "pillow"
    # A Tesla T4 is detected from the `nvidia-smi` output; if the command is missing
    # or fails, assume a non-T4 machine.
    try:
        import subprocess
        is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except Exception:
        is_t4 = False
    # T4 gets older vllm / triton pins; every other GPU gets vllm 0.10.2 and unpinned triton.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    !uv pip install -qqq --upgrade \
        unsloth {get_vllm} {get_numpy} {get_pil} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
# These two pins run in both branches (they sit outside the if/else).
!uv pip install transformers==4.56.2
!uv pip install --no-deps trl==0.22.2

In [4]:
# --- Environment & configuration
import os
from dotenv import load_dotenv
# Populate the process environment from a .env file before any settings are read.
# NOTE(review): python-dotenv is not installed by any of the install cells above —
# confirm it is available in the target runtime.
load_dotenv()

import yaml

# Path to your YAML config file
path = '/content/config.yaml'

def load_config(path: str):
    """Parse the YAML file at ``path`` and return its contents.

    Args:
        path: Filesystem path of the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file (for the config used in
        this notebook, a mapping keyed by ``*_config`` section names).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # Explicit UTF-8 so parsing does not depend on the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

config = load_config(path)
config_name = "qwen4b"
print("Selected config_name:", config_name)

# Maps each supported short name to the section key used inside the YAML file.
CONFIG_SECTIONS = {
    "llama": "llama_config",
    "phi": "PHI_config",
    "mistral": "mistral_config",
    "qwen7b": "qwen7b_config",
    "qwen4b": "qwen4b_config",
}

section_key = CONFIG_SECTIONS.get(config_name)
if section_key is None:
    raise ValueError("Check model name – perhaps the keyboard got excited.")

# From here on `config` is the selected model's section, not the whole file.
config = config[section_key]

# Stockfish path: taken from the STOCKFISH_PATH environment variable when it is set
# and non-empty, otherwise the location used so far in this notebook.
stockfish_path = os.getenv("STOCKFISH_PATH") or '/usr/games/stockfish'

print("STOCKFISH_PATH:", stockfish_path)

Selected config_name: qwen4b
STOCKFISH_PATH: /usr/games/stockfish


In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

# Load the base model and its tokenizer through Unsloth.
# NOTE(review): model_name and max_seq_length are hard-coded here, while the config
# cell selects the "qwen4b" section and the W&B cell logs config["model"] /
# config["max_seq_length"] — confirm which source is meant to be authoritative.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = False, # vLLM fast inference is disabled here; set True to enable it
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




INFO 10-26 20:11:26 [__init__.py:244] Automatically detected platform cuda.
ERROR 10-26 20:11:31 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.9: Fast Qwen2 patching. Transformers: 4.56.2. vLLM: 0.9.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Make sure the tokenizer has a padding token: reuse EOS when there is one,
# otherwise fall back to the literal "<|pad|>".
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token or "<|pad|>"

# Resize the embedding matrix to the tokenizer's current vocabulary size
# (runs unconditionally, whether or not a pad token was just assigned).
model.resize_token_embeddings(len(tokenizer))

# Projection layers that receive LoRA adapters (attention + MLP).
target_modules = [
    "q_proj","k_proj","v_proj","o_proj",
    "gate_proj","up_proj","down_proj",

]
# Wrap the model with LoRA adapters; `model` is rebound to the PEFT-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = target_modules,
    lora_alpha = lora_rank, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407,
)

Unsloth 2025.10.9 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [None]:
# System prompt shared by evaluation (build_prompt) and SFT data formatting.
# It fixes the <reasoning>/<answer> output format that ANSWER_RE / REASONING_RE parse.
# NOTE(review): the worked example describes d5 as "capturing" the e4 pawn, but a
# d7-d5 pawn push does not capture on e4 — consider rewording the example.
# NOTE(review): the "[file]x[target square]" capture rule only describes pawn
# captures in SAN (piece captures look like Nxe5) — confirm the intended wording.
SYSTEM_PROMPT = """You are a chess coach assistant. You will be given a board position in FEN format. Your job is to analyze the board and suggest the best legal move for the player whose turn it is.

Please follow this exact format in your response:

<reasoning>
(Brief explanation of what you see on the board — piece activity, threats, and candidate moves)
</reasoning>
<answer>
(best move written in correct SAN format, such as Nf3 or exd5)
</answer>

Do not invent illegal or impossible moves. The move must be legal in the given FEN position.
Do not use UCI format like e2e4 — only SAN notation like e4, Nf3, or O-O.
In case of taking a piece use the [file]x[target square] format
### Example:
FEN: rnbqkbnr/pppppppp/8/8/4P3/5N2/PPPP1PPP/RNBQKB1R b KQkq - 1 1

<reasoning>
White has just played e4 and developed the knight to f3. It’s Black’s turn. The e4 pawn is undefended. Capturing it with the pawn from d7 to d5 is a natural central counter.
</reasoning>
<answer>
d5
</answer>

Now solve the following position:
"""

def fen_color(fen: str) -> str:
    """Report whose turn it is according to a FEN string.

    The side to move is the second whitespace-separated field of the FEN.

    Args:
        fen: Position in FEN notation.

    Returns:
        "White" for field "w", "Black" for field "b", and "Unknown" for any
        other value or when the field cannot be read at all (e.g. too few
        fields, or ``fen`` is not a string).
    """
    try:
        fields = fen.strip().split()
        active = fields[1]
    except Exception:
        return "Unknown"
    return {"w": "White", "b": "Black"}.get(active, "Unknown")

# ---------- add this block after loading model/tokenizer ----------
import torch, re, pandas as pd
from datasets import load_dataset

# (optional but recommended) ensure a chat template is set for base models
# Only acts when the tokenizer has no chat template (None or empty string).
# NOTE(review): the requested template is "qwen3" while the model loaded above is
# "unsloth/Qwen2.5-3B-Instruct" — confirm the template name is the intended one.
try:
    from unsloth.chat_templates import get_chat_template
    if getattr(tokenizer, "chat_template", None) in (None, ""):
        tokenizer = get_chat_template(tokenizer, chat_template="qwen3")
except Exception:
    # Reached when the import or get_chat_template() raises; the error itself is discarded.
    # fallback: minimal qwen-like template
    if getattr(tokenizer, "chat_template", None) in (None, ""):
        # Jinja template: wraps each system/user/assistant message in
        # <|im_start|>role ... <|im_end|> and optionally opens an assistant turn.
        tokenizer.chat_template = (
            "{% for m in messages %}"
            "{% if m['role'] == 'system' %}<|im_start|>system\n{{ m['content'] }}<|im_end|>\n"
            "{% elif m['role'] == 'user' %}<|im_start|>user\n{{ m['content'] }}<|im_end|>\n"
            "{% elif m['role'] == 'assistant' %}<|im_start|>assistant\n{{ m['content'] }}<|im_end|>\n"
            "{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
        )

# Use the GPU when torch can see one, otherwise the CPU; inputs built in
# generate_answer() are moved to this same device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)
# generate() is given pad_token_id below, so make sure one exists (reuse EOS).
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# helpers
# Both patterns are case-insensitive, let "." span newlines, and capture the tag
# body with surrounding whitespace excluded.
ANSWER_RE = re.compile(r"<answer>\s*(.*?)\s*</answer>", re.DOTALL | re.IGNORECASE)
REASONING_RE = re.compile(r"<reasoning>\s*(.*?)\s*</reasoning>", re.DOTALL | re.IGNORECASE)

def extract_move(text: str):
    """Pull the move out of the first ``<answer>...</answer>`` pair in ``text``.

    Args:
        text: Raw model output; ``None`` or an empty string is treated as no text.

    Returns:
        The stripped tag body (possibly an empty string), or ``None`` when no
        answer tag pair is present.
    """
    found = ANSWER_RE.search(text if text else "")
    if found is None:
        return None
    return found.group(1).strip()

def to_top5_set(v):
    """Normalise a "top moves" field into a set of non-empty, stripped strings.

    Args:
        v: ``None``, a list/tuple/set of items, or any other value. Other values
            are converted with ``str`` and split on "," (with ";" treated as ",").

    Returns:
        A set of the stripped string items, with empty items dropped; an empty
        set when ``v`` is ``None``.
    """
    if v is None:
        return set()
    if isinstance(v, (list, tuple, set)):
        candidates = (str(item).strip() for item in v)
    else:
        candidates = (piece.strip() for piece in str(v).replace(";", ",").split(","))
    return {candidate for candidate in candidates if candidate}

def build_prompt(fen: str, color: str):
    """Render the chat prompt for one position as a single string.

    Args:
        fen: Position in FEN notation, inserted into the user turn.
        color: Side-to-move label inserted into the user turn.

    Returns:
        The result of applying the tokenizer's chat template to a system turn
        (SYSTEM_PROMPT) plus a user turn, untokenized and with the generation
        prompt appended.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": f"FEN: {fen}\nYou are with the following pieces: {color}",
    }
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

def generate_answer(the_model, prompt: str, max_new_tokens=128, temperature=0.7, top_p=0.9, top_k=50):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        the_model: Model exposing ``eval()`` and ``generate()``.
        prompt: Fully rendered prompt string (e.g. from ``build_prompt``).
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature. A value <= 0 selects greedy decoding.
        top_p: Nucleus-sampling threshold; only used when sampling.
        top_k: Top-k cutoff; only used when sampling.

    Returns:
        The decoded continuation, with the prompt tokens sliced off and special
        tokens kept (``skip_special_tokens=False``).
    """
    if temperature > 0:
        decode_kwargs = dict(do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k)
    else:
        # Greedy decoding: a temperature of 0 is not a valid sampling temperature and
        # the sampling flags are meaningless without do_sample, so unset them instead
        # of forwarding them to generate().
        decode_kwargs = dict(do_sample=False, temperature=None, top_p=None, top_k=None)

    the_model.eval()
    with torch.no_grad():
        inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
        out = the_model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
            **decode_kwargs,
        )
    # generate() returns prompt + continuation; keep only what follows the prompt.
    gen_ids = out[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(gen_ids, skip_special_tokens=False)

# dataset
original_dataset = load_dataset("czovekboti/sft_chess", split="train")

banner = "=" * 80
print(banner)
print("TESTING MODEL BEFORE TRAINING + SCORING")
print(banner)

num_test_examples = 50
rows = []

# Score the first `num_test_examples` positions with the not-yet-trained model.
for i in range(num_test_examples):
    example = original_dataset[i]
    fen = example["fen"]
    color = fen_color(fen)

    prompt = build_prompt(fen, color)
    # deterministic: temperature 0 switches generate_answer to greedy decoding
    output = generate_answer(
        model,
        prompt,
        max_new_tokens=128,
        temperature=0.0,
        top_p=1.0,
        top_k=0,
    )

    # Format checks on the raw output.
    has_reasoning = REASONING_RE.search(output) is not None
    has_answer_tag = ANSWER_RE.search(output) is not None
    move = extract_move(output)

    # Move-quality checks against the dataset's reference columns.
    top5 = to_top5_set(example.get("top_5_moves"))
    in_top5 = bool(move) and (move in top5)

    best_move = (example.get("best_move") or "").strip() or None
    equals_best = bool(move and best_move) and (move == best_move)

    # points (tweak as needed): 1 each for the two tags and a non-empty move,
    # 2 each for a top-5 hit and an exact best-move match
    score = (
        int(has_reasoning)
        + int(has_answer_tag)
        + int(bool(move))
        + 2 * int(in_top5)
        + 2 * int(equals_best)
    )

    print(f"\n{banner}")
    print(f"Example {i+1}/{num_test_examples}")
    print(banner)
    print(f"FEN: {fen}")
    print(f"Color: {color}")
    print(f"\nGenerated Answer:\n{output}")

    record = {
        "idx": i,
        "FEN": fen,
        "Color": color,
        "Move": move or "",
        "HasReasoningTag": has_reasoning,
        "HasAnswerTag": has_answer_tag,
        "InTop5": in_top5,
        "EqualsBest": equals_best,
        "Score": score,
        "Top5Moves": ", ".join(sorted(top5)) if top5 else "",
        "BestMove": best_move or "",
        "RawOutput": output,
    }
    rows.append(record)

# table: one row per evaluated example, best scores first
df = pd.DataFrame(rows).sort_values("Score", ascending=False).reset_index(drop=True)
shown = df[["idx","Move","InTop5","EqualsBest","HasReasoningTag","HasAnswerTag","Score"]].head(50)
# The header reports how many rows are really printed (the previous label said
# "top 10 rows" while up to 50 were shown).
print(f"\n=== SCORE TABLE (top {len(shown)} rows) ===")
print(shown)

# save: the full table (all columns) is what the comparison cell reads back later
df.to_csv("sft_answer_scoring_before_training.csv", index=False)
print("\nSaved: sft_answer_scoring_before_training.csv")
print("\nStarting training...\n")


TESTING MODEL BEFORE TRAINING + SCORING

Example 1/50
FEN: rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR b KQkq - 0 1
Color: Black

Generated Answer:
<reasoning>
The current position shows a typical middlegame setup with both sides having active pieces and pawns. Black's turn. The key piece activity is the knight on f6 which can potentially control key squares. However, there are no immediate threats or weaknesses that require an urgent response. A good move would be to develop the knight to g5, controlling h4 and potentially preparing to castle.
</reasoning>
<answer>
Ng5
</answer><|im_end|>

Example 2/50
FEN: rnbqkbnr/pppp1ppp/4p3/8/4P3/8/PPPP1PPP/RNBQKBNR w KQkq - 0 2
Color: White

Generated Answer:
<reasoning>
The current position shows that White has a strong pawn structure with pawns on c4 and d5. Black has a weak pawn on e6 and a knight on g5 which can potentially target the e5 pawn. However, there are no immediate threats or weaknesses for White to exploit. The best move would b

In [None]:
# --- Dataset & W&B init
from datasets import load_dataset

# NOTE(review): this loads the same repo/split that the evaluation cell already
# holds as `original_dataset`; `dataset` is later overwritten by the formatting cell.
dataset = load_dataset("czovekboti/sft_chess", split="train")

# Toggle W&B logging (set to False if you don't want to log)
USE_WANDB = True

if USE_WANDB:
    import wandb
    # W&B settings are provided through environment variables before login/init.
    os.environ["WANDB_LOG_MODEL"] = "end"
    os.environ["WANDB_PROJECT"] = "Chess_RL_Project"
    os.environ["WANDB_ENTITY"] = "czovekboti-budapesti-m-szaki-s-gazdas-gtudom-nyi-egyetem"
    wandb.login()
    # NOTE(review): WANDB_ENTITY is set above but entity="" is passed explicitly here —
    # confirm which of the two W&B ends up using.
    # NOTE(review): "model", "max_seq_length", "learning_rate" and "max_steps" are
    # logged from the YAML config, while the model-loading and SFTTrainer cells use
    # their own hard-coded values — the logged run config may not match the run.
    wandb.init(
        project="Chess_RL_Project",
        entity="",
        name=config.get("name", "run"),
        config={
            "model": config["model"],
            "max_seq_length": config['max_seq_length'],
            "lora_rank": lora_rank,
            "learning_rate": config["learning_rate"],
            "max_steps": config["max_steps"],
        }
    )
    print("W&B initialized.")
else:
    print("W&B disabled.")

In [None]:
from datasets import Dataset

# Build one system/user/assistant conversation per usable dataset row.
formatted_texts = []
for ex in dataset:
    fen = (ex.get("fen") or "").strip()
    ans = (ex.get("answer") or "").strip()
    # Rows without a position or without a target answer cannot be trained on.
    if not fen or not ans:
        continue
    color = fen_color(fen)
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        # Same user-turn text as build_prompt() (no leading newline), so the prompts
        # seen during training match the ones used at evaluation time.
        {"role": "user", "content": f"FEN: {fen}\nYou are with the following pieces: {color}"},
        {"role": "assistant", "content": ans},
    ]
    formatted_texts.append({"conversations": messages})

def formatting_prompts_func(examples):
    """Render a batch of conversations to training text.

    Args:
        examples: Batched mapping with a "conversations" column, each entry a
            list of role/content message dicts.

    Returns:
        A dict with a "text" list: each conversation passed through the
        tokenizer's chat template, untokenized, without a generation prompt.
    """
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

# formatted_texts is just a list of dicts
# NOTE: `dataset` is rebound here from the raw rows to the formatted conversations,
# so re-running this cell requires re-running the cell that loads the raw dataset.
dataset = Dataset.from_list(formatted_texts)

dataset = dataset.map(formatting_prompts_func, batched = True,)
dataset[5]["conversations"]

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq
# Supervised fine-tuning on the chat-templated "text" column built in the previous cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        # NOTE(review): the dataset only provides a single "text" field and a custom
        # collator is supplied — confirm completion_only_loss actually masks the
        # prompt tokens in this setup.
        completion_only_loss=True,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        # NOTE(review): each optimizer step consumes 2 x 4 = 8 examples per device,
        # so len(dataset) steps is roughly 8 passes over the data — confirm this is
        # intended rather than a single epoch.
        max_steps = len(dataset),
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # NOTE(review): "none" means the trainer reports to no tracker, even though
        # a W&B run is initialised in an earlier cell — confirm.
        report_to = "none", # Use TrackIO/WandB etc
    ),
)

In [None]:
# Run the fine-tuning loop configured in the previous cell.
trainer.train()

In [None]:
# Handle to the trainer's model, used by the post-training evaluation cell.
trained_model = trainer.model

In [None]:
# Start from an empty list: `rows` still holds the 50 before-training records from
# the earlier evaluation cell, and appending to it would write them into the
# "after" CSV as well (duplicate `idx` keys, inflated sums in the comparison cell).
rows = []

# Score the same first `num_test_examples` positions with the fine-tuned model.
for i in range(num_test_examples):
    example = original_dataset[i]
    fen = example["fen"]
    color = fen_color(fen)

    prompt = build_prompt(fen, color)
    output = generate_answer(trained_model, prompt, max_new_tokens=128, temperature=0.0, top_p=1.0, top_k=0)  # deterministic

    # Format checks on the raw output.
    move = extract_move(output)
    has_reasoning = bool(REASONING_RE.search(output))
    has_answer_tag = bool(ANSWER_RE.search(output))

    # Move-quality checks against the dataset's reference columns.
    top5 = to_top5_set(example.get("top_5_moves"))
    in_top5 = (move in top5) if move else False

    best_move = (example.get("best_move") or "").strip() or None
    equals_best = (move == best_move) if (move and best_move) else False

    # Same point scheme as the before-training evaluation.
    score = 0
    score += 1 if has_reasoning else 0
    score += 1 if has_answer_tag else 0
    score += 1 if move else 0
    score += 2 if in_top5 else 0
    score += 2 if equals_best else 0

    print(f"\n{'='*80}")
    print(f"Example {i+1}/{num_test_examples}")
    print(f"{'='*80}")
    print(f"FEN: {fen}")
    print(f"Color: {color}")
    print(f"\nGenerated Answer:\n{output}")

    rows.append({
        "idx": i,
        "FEN": fen,
        "Color": color,
        "Move": move or "",
        "HasReasoningTag": has_reasoning,
        "HasAnswerTag": has_answer_tag,
        "InTop5": in_top5,
        "EqualsBest": equals_best,
        "Score": score,
        "Top5Moves": ", ".join(sorted(top5)) if top5 else "",
        "BestMove": best_move or "",
        "RawOutput": output,
    })

# One row per evaluated example, best scores first.
df = pd.DataFrame(rows).sort_values("Score", ascending=False).reset_index(drop=True)
shown = df[["idx","Move","InTop5","EqualsBest","HasReasoningTag","HasAnswerTag","Score"]].head(50)
# The header reports how many rows are really printed (the old label said "top 10").
print(f"\n=== SCORE TABLE (top {len(shown)} rows) ===")
print(shown)
df.to_csv("sft_answer_scoring_after_training.csv", index=False)


In [None]:
import pandas as pd

# === CONFIG ===
before_path = "sft_answer_scoring_before_training.csv"
after_path  = "sft_answer_scoring_after_training.csv"
# Key columns that identify each sample
key_cols = ["idx"]  # or ["idx","FEN"] if both exist

# === LOAD ===
before, after = (pd.read_csv(csv_path) for csv_path in (before_path, after_path))

# === MERGE ON KEYS ===
# Columns present in both files get "_before" / "_after" suffixes.
merged = before.merge(after, on=key_cols, suffixes=("_before", "_after"))


def _signed(delta):
    """Format a difference with an explicit "+" for zero or positive values."""
    return f"{'+' if delta >= 0 else ''}{delta}"


# === COMPUTE COUNTS ===
metrics = ["InTop5", "EqualsBest"]  # adjust to your column names

print("=== Correct Predictions Summary ===\n")
for metric in metrics:
    col_before, col_after = f"{metric}_before", f"{metric}_after"
    # Metrics missing from either file are skipped silently.
    if col_before not in merged.columns or col_after not in merged.columns:
        continue
    before_sum = merged[col_before].sum()
    after_sum = merged[col_after].sum()
    print(f"{metric}:")
    print(f"  Before: {before_sum}")
    print(f"  After:  {after_sum}")
    print(f"  Change: {_signed(after_sum - before_sum)}\n")

# === TOTAL SCORE COMPARISON (if you have a 'Score' column) ===
if {"Score_before", "Score_after"}.issubset(merged.columns):
    total_before = merged["Score_before"].sum()
    total_after = merged["Score_after"].sum()
    print("Total Score:")
    print(f"  Before: {total_before}")
    print(f"  After:  {total_after}")
    print(f"  Change: {_signed(total_after - total_before)}")
