# Tako HRM - Evaluation

Evaluate trained models against baseline opponents.

## Evaluation Methods

- **Random baseline** - Win rate vs random play
- **Self-play** - Model vs older checkpoints
- **External engines** - Stockfish (chess), Edax (othello), etc.

---

## Setup (Run Once)

In [None]:
# Ensure we're in the repo directory.
# Later cells use paths relative to the repo root (scripts/eval.py, config/,
# checkpoints/), so the working directory has to be the repo itself.
import os

if not os.path.exists('scripts/eval.py'):
    # Not at the repo root. Fall back to a 'tako-v2' checkout sitting in the
    # current directory, but only if it is really there: an unguarded chdir
    # would abort the cell with a bare FileNotFoundError.
    if os.path.isdir('tako-v2'):
        os.chdir('tako-v2')
    else:
        print("⚠️  scripts/eval.py not found and no 'tako-v2' directory here - "
              "change into the repo before running the evaluation cells")

print(f"Working directory: {os.getcwd()}")

In [None]:
# Pick the evaluation device: 'cuda' when torch reports a usable GPU,
# otherwise 'cpu'. `device` is read by the eval cells further down.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cuda':
    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
else:
    print("ℹ️  Using CPU for evaluation")

---

## TicTacToe Evaluation

Test model against random play and perfect play.

In [None]:
# Locate the most recently modified TicTacToe checkpoint.
# Leaves `latest_ckpt` set to a Path, or to None when nothing is available.
from pathlib import Path

ckpt_dir = Path('checkpoints/tictactoe')
if not ckpt_dir.exists():
    print("⚠️  Checkpoint directory not found")
    latest_ckpt = None
else:
    # Order oldest -> newest by modification time; the last entry is the newest.
    checkpoints = list(ckpt_dir.glob('*.pt'))
    checkpoints.sort(key=lambda ckpt: ckpt.stat().st_mtime)
    if not checkpoints:
        print("⚠️  No checkpoints found. Train first.")
        latest_ckpt = None
    else:
        latest_ckpt = checkpoints[-1]
        print(f"Latest checkpoint: {latest_ckpt.name}")
        print(f"Path: {latest_ckpt}")

In [None]:
# Evaluate vs random opponent
# `latest_ckpt` and `device` are notebook globals assigned by earlier cells.
# `latest_ckpt` holds whatever the most recently executed "Find latest ...
# checkpoint" cell stored, so run the TicTacToe one right before this cell.
if latest_ckpt:
    print("Evaluating TicTacToe model vs random play...")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/tictactoe.yaml \
        --checkpoint {latest_ckpt} \
        --opponent random \
        --games 100 \
        --device {device}
    
    print("\n" + "="*80)
    print("Expected: >90% win rate after convergence")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

In [None]:
# Evaluate vs perfect minimax opponent (TicTacToe only)
# Reuses the notebook globals `latest_ckpt` and `device` set by earlier cells;
# `latest_ckpt` is whatever the last-run checkpoint-discovery cell stored.
if latest_ckpt:
    print("Evaluating TicTacToe model vs perfect play...")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/tictactoe.yaml \
        --checkpoint {latest_ckpt} \
        --opponent minimax \
        --games 50 \
        --device {device}
    
    print("\n" + "="*80)
    print("Expected: 0% losses (draws or wins only)")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

---

## Othello Evaluation

Test model against random play and Edax engine.

In [None]:
# Locate the most recently modified Othello checkpoint.
# Leaves `latest_ckpt` set to a Path, or to None when nothing is available.
from pathlib import Path

ckpt_dir = Path('checkpoints/othello')
if not ckpt_dir.exists():
    print("⚠️  Checkpoint directory not found")
    latest_ckpt = None
else:
    # Order oldest -> newest by modification time; the last entry is the newest.
    checkpoints = list(ckpt_dir.glob('*.pt'))
    checkpoints.sort(key=lambda ckpt: ckpt.stat().st_mtime)
    if not checkpoints:
        print("⚠️  No checkpoints found. Train first.")
        latest_ckpt = None
    else:
        latest_ckpt = checkpoints[-1]
        print(f"Latest checkpoint: {latest_ckpt.name}")
        print(f"Path: {latest_ckpt}")

In [None]:
# Evaluate vs random opponent
# `latest_ckpt` and `device` are notebook globals assigned by earlier cells.
# `latest_ckpt` holds whatever the most recently executed "Find latest ...
# checkpoint" cell stored, so run the Othello one right before this cell.
if latest_ckpt:
    print("Evaluating Othello model vs random play...")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/othello.yaml \
        --checkpoint {latest_ckpt} \
        --opponent random \
        --games 100 \
        --device {device}
    
    print("\n" + "="*80)
    print("Expected: >95% win rate after training")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

In [None]:
# Evaluate vs Edax engine (if available)
# Reuses the notebook globals `latest_ckpt` and `device` set by earlier cells;
# `latest_ckpt` is whatever the last-run checkpoint-discovery cell stored.
# NOTE(review): this cell does not check that Edax is installed; it only
# prints a reminder and leaves any failure handling to scripts/eval.py.
if latest_ckpt:
    print("Evaluating Othello model vs Edax level 3...")
    print("Note: Requires Edax installed")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/othello.yaml \
        --checkpoint {latest_ckpt} \
        --opponent edax \
        --opponent-level 3 \
        --games 50 \
        --device {device}
    
    print("\n" + "="*80)
    print("Target: Beat Edax level 3 (Phase 1 goal)")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

---

## Hex Evaluation

Test model against random play.

In [None]:
# Locate the most recently modified Hex checkpoint.
# Leaves `latest_ckpt` set to a Path, or to None when nothing is available.
from pathlib import Path

ckpt_dir = Path('checkpoints/hex')
if not ckpt_dir.exists():
    print("⚠️  Checkpoint directory not found")
    latest_ckpt = None
else:
    # Order oldest -> newest by modification time; the last entry is the newest.
    checkpoints = list(ckpt_dir.glob('*.pt'))
    checkpoints.sort(key=lambda ckpt: ckpt.stat().st_mtime)
    if not checkpoints:
        print("⚠️  No checkpoints found. Train first.")
        latest_ckpt = None
    else:
        latest_ckpt = checkpoints[-1]
        print(f"Latest checkpoint: {latest_ckpt.name}")
        print(f"Path: {latest_ckpt}")

In [None]:
# Evaluate vs random opponent
# `latest_ckpt` and `device` are notebook globals assigned by earlier cells.
# `latest_ckpt` holds whatever the most recently executed "Find latest ...
# checkpoint" cell stored, so run the Hex one right before this cell.
if latest_ckpt:
    print("Evaluating Hex model vs random play...")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/hex.yaml \
        --checkpoint {latest_ckpt} \
        --opponent random \
        --games 100 \
        --device {device}
    
    print("\n" + "="*80)
    print("Expected: >90% win rate after training")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

---

## Chess Evaluation

Test model against Stockfish at various levels.

In [None]:
# Locate the most recently modified Chess checkpoint.
# Leaves `latest_ckpt` set to a Path, or to None when nothing is available.
from pathlib import Path

ckpt_dir = Path('checkpoints/chess')
if not ckpt_dir.exists():
    print("⚠️  Checkpoint directory not found")
    latest_ckpt = None
else:
    # Order oldest -> newest by modification time; the last entry is the newest.
    checkpoints = list(ckpt_dir.glob('*.pt'))
    checkpoints.sort(key=lambda ckpt: ckpt.stat().st_mtime)
    if not checkpoints:
        print("⚠️  No checkpoints found. Train first.")
        latest_ckpt = None
    else:
        latest_ckpt = checkpoints[-1]
        print(f"Latest checkpoint: {latest_ckpt.name}")
        print(f"Path: {latest_ckpt}")

In [None]:
# Evaluate vs Stockfish level 5
# Reuses the notebook globals `latest_ckpt` and `device` set by earlier cells;
# `latest_ckpt` is whatever the last-run checkpoint-discovery cell stored.
# NOTE(review): this cell does not check that Stockfish is installed; it only
# prints a reminder and leaves any failure handling to scripts/eval.py.
if latest_ckpt:
    print("Evaluating Chess model vs Stockfish level 5...")
    print("Note: Requires Stockfish installed")
    print("\n" + "="*80)
    
    # IPython shell escape: {latest_ckpt} and {device} are expanded from the
    # notebook namespace before the command is handed to the shell. The path is
    # inserted unquoted, so a checkpoint path containing spaces would be split.
    !~/.cargo/bin/uv run python scripts/eval.py \
        --config config/chess.yaml \
        --checkpoint {latest_ckpt} \
        --opponent stockfish \
        --opponent-level 5 \
        --games 50 \
        --device {device}
    
    print("\n" + "="*80)
    # The two lines below are printed project targets, not values computed here.
    print("Phase 3 target: ~1700 Elo (after pretraining)")
    print("Phase 5 target: 2500+ Elo (GM level)")
else:
    # latest_ckpt is None: the discovery cell found nothing to evaluate.
    print("❌ No checkpoint available")

---

## Compare Multiple Checkpoints

In [None]:
# Compare progression across checkpoints
import matplotlib.pyplot as plt
from pathlib import Path
import re

GAME = "tictactoe"  # Change to othello, hex, chess

# Number of step checkpoints required before a five-point sample is drawn.
MIN_CHECKPOINTS = 5


def checkpoint_step(path):
    """Return the training step encoded in a checkpoint filename, or None.

    Args:
        path: Checkpoint path; only ``path.name`` is inspected.

    Returns:
        The integer ``N`` when the name contains ``step_<digits>``, otherwise
        None (for example ``step_final.pt``).
    """
    found = re.search(r'step_(\d+)', path.name)
    return int(found.group(1)) if found else None


def sample_checkpoints(checkpoints):
    """Pick five entries spread from the first to the last of a sequence.

    Args:
        checkpoints: Non-empty sequence, expected to be sorted by step.

    Returns:
        A list holding the items at positions 0, n//4, n//2, 3n//4 and the
        last position, where n is ``len(checkpoints)``.
    """
    count = len(checkpoints)
    positions = [0, count // 4, count // 2, 3 * count // 4, -1]
    return [checkpoints[i] for i in positions]


ckpt_dir = Path(f'checkpoints/{GAME}')
if ckpt_dir.exists():
    # The glob also matches names without a numeric step (e.g. step_final.pt);
    # skip those rather than crashing while extracting the sort key.
    checkpoints = sorted(
        (p for p in ckpt_dir.glob('step_*.pt') if checkpoint_step(p) is not None),
        key=checkpoint_step,
    )

    # ">=" so that exactly MIN_CHECKPOINTS checkpoints are accepted, matching
    # the "need at least 5" message printed otherwise.
    if len(checkpoints) >= MIN_CHECKPOINTS:
        sample_ckpts = sample_checkpoints(checkpoints)

        print(f"Comparing {len(sample_ckpts)} {GAME} checkpoints:")
        print("="*80)

        # Collected for a future win-rate-vs-step comparison; win_rates stays
        # empty until evaluation is wired into this loop.
        steps = []
        win_rates = []

        for ckpt in sample_ckpts:
            steps.append(checkpoint_step(ckpt))

            print(f"\nEvaluating checkpoint: {ckpt.name}")
            # Placeholder: running eval and parsing its output is not done here.
            print("  (Eval not implemented in comparison mode yet)")

        print("\n" + "="*80)
        print("Use individual eval cells above for detailed results")
    else:
        print(f"Found {len(checkpoints)} checkpoints - need at least {MIN_CHECKPOINTS} for comparison")
else:
    print(f"⚠️  No checkpoints found for {GAME}")