# Chess Puzzle Results Review

This notebook allows you to interactively review puzzle results and compare LLM responses against correct solutions.

In [1]:
import json
import chess
import chess.svg
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

In [2]:
# Load results
# The encoding is given explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system).
with open('results.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

accuracy_report = data['accuracy_report']
detailed_results = data['detailed_results']

# Every model is assumed to have answered the same puzzles in the same order,
# so the first model's responses are used as the reference puzzle list.
# NOTE(review): nothing here verifies that assumption - confirm upstream.
model_names = list(detailed_results.keys())
first_model = model_names[0]
puzzles = detailed_results[first_model]['responses']
total_puzzles = len(puzzles)

print(f"Loaded {total_puzzles} puzzles")
print(f"Models tested: {model_names}")

Loaded 1000 puzzles
Models tested: ['gpt-4o-mini', 'gpt-3.5-turbo', 'gpt-4o-mini-context', 'gpt-3.5-turbo-context', 'gpt-4o-mini-legal', 'gpt-3.5-turbo-legal']


In [3]:
# Print one summary block per model: how many answers were correct and how
# many were at least valid moves, each with its percentage.
banner = "=" * 80
print(banner)
print("ACCURACY SUMMARY")
print(banner)
for model, metrics in accuracy_report.items():
    total = metrics['total_puzzles']
    correct_line = f"  Correct: {metrics['correct_count']}/{total} ({metrics['correct_accuracy']:.1f}%)"
    valid_line = f"  Valid:   {metrics['valid_count']}/{total} ({metrics['valid_move_rate']:.1f}%)"
    print(f"\n{model}:")
    print(correct_line)
    print(valid_line)

ACCURACY SUMMARY

gpt-4o-mini:
  Correct: 25/1000 (2.5%)
  Valid:   301/1000 (30.1%)

gpt-3.5-turbo:
  Correct: 31/1000 (3.1%)
  Valid:   332/1000 (33.2%)

gpt-4o-mini-context:
  Correct: 943/1000 (94.3%)
  Valid:   999/1000 (99.9%)

gpt-3.5-turbo-context:
  Correct: 992/1000 (99.2%)
  Valid:   1000/1000 (100.0%)

gpt-4o-mini-legal:
  Correct: 66/1000 (6.6%)
  Valid:   976/1000 (97.6%)

gpt-3.5-turbo-legal:
  Correct: 83/1000 (8.3%)
  Valid:   993/1000 (99.3%)


In [4]:
def display_board(fen, size=400):
    """Build a displayable rendering of the position described by ``fen``.

    Args:
        fen: Position string passed to ``chess.Board``.
        size: Forwarded to ``chess.svg.board`` as its ``size`` argument.

    Returns:
        An ``HTML`` object wrapping the SVG produced by ``chess.svg.board``.
    """
    position = chess.Board(fen)
    return HTML(chess.svg.board(position, size=size))

def get_move_san(fen, uci_move):
    """Convert a UCI move to SAN notation for the position ``fen``.

    This is a best-effort conversion used only for display.

    Args:
        fen: Position string passed to ``chess.Board``.
        uci_move: Move string passed to ``chess.Move.from_uci``.

    Returns:
        The SAN string from ``board.san(move)``, or ``uci_move`` unchanged
        if building the board, parsing the move, or converting it raises
        an ordinary exception.
    """
    try:
        board = chess.Board(fen)
        move = chess.Move.from_uci(uci_move)
        return board.san(move)
    except Exception:
        # Deliberately broad: any conversion failure falls back to the raw
        # input. Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the kernel can still be interrupted.
        return uci_move

def display_puzzle(puzzle_idx):
    """Show one puzzle: header, board, reference solution and every model's answer.

    The current cell output is cleared first. ``puzzle_idx`` is the 0-based
    position in each model's ``responses`` list; the position, solution and
    puzzle ID are taken from the first model's entry.
    """
    clear_output(wait=True)

    # One response record per model for this puzzle.
    per_model = {
        name: results['responses'][puzzle_idx]
        for name, results in detailed_results.items()
    }

    # Position/solution fields are read from the first model's record.
    shared = per_model[list(per_model)[0]]
    fen = shared['fen']
    correct_move = shared['correct_move']
    puzzle_id = shared['puzzle_id']

    side_to_move = "White" if chess.Board(fen).turn == chess.WHITE else "Black"
    correct_san = get_move_san(fen, correct_move)
    heavy_rule = "=" * 80

    # Header (the trailing "" yields the blank line before the board)
    print(
        heavy_rule,
        f"Puzzle {puzzle_idx + 1} of {total_puzzles} (ID: {puzzle_id})",
        heavy_rule,
        f"Turn: {side_to_move} to move",
        f"FEN: {fen}",
        "",
        sep="\n",
    )

    # Board
    display(display_board(fen))

    # Reference solution, then the heading for the per-model section
    print(
        "",
        "✅ CORRECT SOLUTION",
        f"   UCI: {correct_move}",
        f"   SAN: {correct_san}",
        "",
        "🤖 MODEL RESPONSES",
        "-" * 80,
        sep="\n",
    )

    for model_name, entry in per_model.items():
        raw_response = entry['llm_response']
        extracted_move = entry['extracted_move']
        valid = entry['is_valid']
        correct = entry['is_correct']

        # Correct wins over valid; anything else is reported as invalid.
        status = "✅ CORRECT" if correct else (
            "⚠️  VALID (but wrong)" if valid else "❌ INVALID"
        )

        print(f"\n{model_name}: {status}")
        print(f"   Raw response: {raw_response}")

        if not extracted_move:
            detail = "None (could not parse)"
        else:
            detail = f"{extracted_move} (SAN: {get_move_san(fen, extracted_move)})"
        print(f"   Extracted: {detail}")

    print("", heavy_rule, sep="\n")

In [5]:
# Interactive puzzle navigation
current_idx = 0

# Puzzle rendering is captured in this Output widget. display_puzzle() calls
# clear_output(); capturing it here means only the puzzle view is cleared,
# not the navigation controls displayed alongside it.
puzzle_output = widgets.Output()


def show_current():
    """Render the puzzle at ``current_idx`` inside ``puzzle_output``."""
    with puzzle_output:
        display_puzzle(current_idx)


def _jump_to(idx):
    """Move to 0-based puzzle ``idx`` if it is in range and not already shown."""
    global current_idx
    if idx == current_idx or not 0 <= idx < total_puzzles:
        return
    current_idx = idx
    # Keep the "Go to" field in step with the buttons. If this assignment
    # changes the value it re-enters on_goto, which then stops at the
    # ``idx == current_idx`` guard above, so the puzzle is rendered once.
    goto_input.value = idx + 1
    show_current()


def on_next(button):
    """Button callback: advance one puzzle (no-op on the last one)."""
    _jump_to(current_idx + 1)


def on_prev(button):
    """Button callback: go back one puzzle (no-op on the first one)."""
    _jump_to(current_idx - 1)


def on_goto(change):
    """Observer for the "Go to" field; its value is a 1-based puzzle number."""
    try:
        idx = int(change['new']) - 1
    except (TypeError, ValueError):
        # Ignore values that cannot be read as an integer.
        return
    _jump_to(idx)


# Create navigation controls
prev_button = widgets.Button(description="◀ Previous")
next_button = widgets.Button(description="Next ▶")
# BoundedIntText is used because plain IntText has no min/max traits, so the
# bounds passed to it would not be enforced.
goto_input = widgets.BoundedIntText(value=1, description='Go to:', min=1, max=max(total_puzzles, 1))

prev_button.on_click(on_prev)
next_button.on_click(on_next)
goto_input.observe(on_goto, names='value')

nav_box = widgets.HBox([prev_button, next_button, goto_input])
display(widgets.VBox([nav_box, puzzle_output]))

# Show first puzzle
show_current()

Puzzle 1 of 1000 (ID: 0061g)
Turn: White to move
FEN: 6k1/pp3pp1/2p1q1Pp/3b4/8/6Q1/PB3Pp1/3r1NK1 w - - 0 28




✅ CORRECT SOLUTION
   UCI: g3b8
   SAN: Qb8+

🤖 MODEL RESPONSES
--------------------------------------------------------------------------------

gpt-4o-mini: ⚠️  VALID (but wrong)
   Raw response: Qh4
   Extracted: g3h4 (SAN: Qh4)

gpt-3.5-turbo: ❌ INVALID
   Raw response: Qh6
   Extracted: None (could not parse)

gpt-4o-mini-context: ✅ CORRECT
   Raw response: g3b8
   Extracted: g3b8 (SAN: Qb8+)

gpt-3.5-turbo-context: ✅ CORRECT
   Raw response: 1. g3b8 (Mate in 2)
   Extracted: g3b8 (SAN: Qb8+)

gpt-4o-mini-legal: ⚠️  VALID (but wrong)
   Raw response: g3g4
   Extracted: g3g4 (SAN: Qg4)

gpt-3.5-turbo-legal: ✅ CORRECT
   Raw response: g3b8
   Extracted: g3b8 (SAN: Qb8+)



## Filter by Results

Show only puzzles where specific conditions are met:

In [6]:
def find_puzzles(condition='all'):
    """Find puzzles matching a condition.

    Conditions:
    - 'all': All puzzles
    - 'all_wrong': All models got it wrong
    - 'all_correct': All models got it correct
    - 'context_better': At least one context model correct, all base models wrong
    - 'base_wrong': Base models (non-context) got it wrong

    A model counts as a "context" model when its name contains 'context';
    every other model (including the '-legal' variants) counts as "base".

    Returns:
        List of 0-based puzzle indices, in ascending order.

    Raises:
        ValueError: If ``condition`` is not one of the names listed above.
    """
    known_conditions = ('all', 'all_wrong', 'all_correct', 'context_better', 'base_wrong')
    if condition not in known_conditions:
        # Fail loudly: a misspelled condition used to yield an empty list,
        # which is indistinguishable from "no puzzle matched".
        raise ValueError(
            f"Unknown condition {condition!r}; expected one of {list(known_conditions)}"
        )

    # The model partition does not depend on the puzzle, so compute it once.
    model_names = list(detailed_results.keys())
    base_models = [m for m in model_names if 'context' not in m]
    context_models = [m for m in model_names if 'context' in m]

    matching = []
    for idx in range(total_puzzles):
        puzzle_results = {
            m: detailed_results[m]['responses'][idx]['is_correct']
            for m in model_names
        }

        if condition == 'all':
            is_match = True
        elif condition == 'all_wrong':
            is_match = not any(puzzle_results.values())
        elif condition == 'all_correct':
            is_match = all(puzzle_results.values())
        else:
            # Both remaining conditions require every base model to be wrong.
            is_match = all(not puzzle_results[m] for m in base_models)
            if is_match and condition == 'context_better':
                is_match = any(puzzle_results[m] for m in context_models)

        if is_match:
            matching.append(idx)

    return matching

# Example: puzzles where context helped. At most the first ten 1-based
# puzzle numbers are listed; longer lists are cut off with "...".
context_better = find_puzzles('context_better')
print(f"Puzzles where context models got it right but base models didn't: {len(context_better)}")
context_better_numbers = [i + 1 for i in context_better]
if len(context_better_numbers) > 10:
    print(f"Puzzle numbers: {context_better_numbers[:10]}...")
else:
    print(f"Puzzle numbers: {context_better_numbers}")

# Same report for puzzles that no model solved.
all_wrong = find_puzzles('all_wrong')
print(f"\nPuzzles where ALL models failed: {len(all_wrong)}")
all_wrong_numbers = [i + 1 for i in all_wrong]
if len(all_wrong_numbers) > 10:
    print(f"Puzzle numbers: {all_wrong_numbers[:10]}...")
else:
    print(f"Puzzle numbers: {all_wrong_numbers}")

Puzzles where context models got it right but base models didn't: 825
Puzzle numbers: [2, 3, 5, 6, 7, 8, 9, 10, 11, 12]...

Puzzles where ALL models failed: 0
Puzzle numbers: []


In [None]:
# Display a specific puzzle number (1-based indexing)
# Change this number and re-run to see a different puzzle
PUZZLE_NUMBER = 1

# display_puzzle() clears the cell output on every call, so calling it in a
# loop would leave only the last puzzle visible; show exactly one instead.
if not 1 <= PUZZLE_NUMBER <= total_puzzles:
    raise ValueError(
        f"PUZZLE_NUMBER must be between 1 and {total_puzzles}, got {PUZZLE_NUMBER}"
    )
display_puzzle(PUZZLE_NUMBER - 1)  # display_puzzle expects a 0-based index

Puzzle 1 of 1000 (ID: 0061g)
Turn: White to move
FEN: 6k1/pp3pp1/2p1q1Pp/3b4/8/6Q1/PB3Pp1/3r1NK1 w - - 0 28




✅ CORRECT SOLUTION
   UCI: g3b8
   SAN: Qb8+

🤖 MODEL RESPONSES
--------------------------------------------------------------------------------

gpt-4o-mini: ⚠️  VALID (but wrong)
   Raw response: Qh4
   Extracted: g3h4 (SAN: Qh4)

gpt-3.5-turbo: ❌ INVALID
   Raw response: Qh6
   Extracted: None (could not parse)

gpt-4o-mini-context: ✅ CORRECT
   Raw response: g3b8
   Extracted: g3b8 (SAN: Qb8+)

gpt-3.5-turbo-context: ✅ CORRECT
   Raw response: 1. g3b8 (Mate in 2)
   Extracted: g3b8 (SAN: Qb8+)

gpt-4o-mini-legal: ⚠️  VALID (but wrong)
   Raw response: g3g4
   Extracted: g3g4 (SAN: Qg4)

gpt-3.5-turbo-legal: ✅ CORRECT
   Raw response: g3b8
   Extracted: g3b8 (SAN: Qb8+)



## Export Filtered Results

Export specific puzzles to review:

In [None]:
def export_puzzles_to_text(puzzle_indices, filename='selected_puzzles.txt'):
    """Export selected puzzles to a text file for review.

    For each puzzle the file contains its number and ID, the FEN, the correct
    move (UCI and SAN), and one line per model with a ✓/✗ marker and the raw
    LLM response.

    Args:
        puzzle_indices: Iterable of 0-based puzzle indices to export.
        filename: Path of the text file to write (overwritten if it exists).
    """
    model_names = list(detailed_results.keys())
    rule = "=" * 80
    exported = 0

    # UTF-8 is set explicitly: the ✓/✗ markers and arbitrary LLM text are not
    # representable in every platform-default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        for idx in puzzle_indices:
            # Position and solution are read from the first model's record.
            puzzle = detailed_results[model_names[0]]['responses'][idx]

            fen = puzzle['fen']
            correct_move = puzzle['correct_move']
            puzzle_id = puzzle['puzzle_id']
            correct_san = get_move_san(fen, correct_move)

            f.write(rule + "\n")
            f.write(f"Puzzle {idx + 1} (ID: {puzzle_id})\n")
            f.write(rule + "\n")
            f.write(f"FEN: {fen}\n")
            f.write(f"Correct: {correct_move} ({correct_san})\n\n")

            for model_name in model_names:
                model_puzzle = detailed_results[model_name]['responses'][idx]
                response = model_puzzle['llm_response']
                is_correct = model_puzzle['is_correct']

                status = "✓" if is_correct else "✗"
                f.write(f"[{status}] {model_name}: {response}\n")

            f.write("\n\n")
            exported += 1

    # Counted while writing so any iterable works, not only sized ones.
    print(f"Exported {exported} puzzles to {filename}")

# Example: Export all puzzles where context helped
# export_puzzles_to_text(context_better, 'context_helped.txt')