In [2]:
import json

# Load the two JSON inputs used by the rest of this cell.
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# rather than inheriting the platform's locale default, which is not UTF-8 on
# every system and can mis-decode or fail on non-ASCII characters.

# dataset_top50_map: indexed by board hash in get_score() to obtain the
# collection of moves that count as a hit for that board.
with open('../dataset_top50_map.json', 'r', encoding='utf-8') as f:
    dataset_top50_map = json.load(f)

# progressive_results: sequence of per-run records; the reporting loop below
# reads each record's 'model', 'instruction' and 'boards' entries.
with open('progressive_results.json', 'r', encoding='utf-8') as f:
    progressive_results = json.load(f)

def get_score(move, board_hash):
    """Return 1 if *move* is listed for *board_hash* in dataset_top50_map, else 0.

    A board hash that is absent from the map is treated as having no listed
    moves, so it always scores 0.
    """
    listed_moves = dataset_top50_map.get(board_hash, [])
    # Membership yields a bool; int() turns it into the 1/0 score.
    return int(move in listed_moves)

def calculate_stats(data, model_data):
    """Summarise one run: total score plus min/average/max duration.

    Args:
        data: Not used by this function; kept so existing callers keep working.
        model_data: Mapping with a 'boards' entry that maps each board hash to
            a record holding 'move' and 'duration'.

    Returns:
        A dict with keys 'total_score', 'average_duration', 'max_duration'
        and 'min_duration'. All three duration figures are 0 when there are
        no boards.
    """
    hits = 0
    timings = []

    for board_key, record in model_data['boards'].items():
        chosen_move = record['move']
        elapsed = record['duration']

        # Each board contributes whatever get_score() awards for its move.
        hits += get_score(chosen_move, board_key)
        timings.append(elapsed)

    if timings:
        mean_time = sum(timings) / len(timings)
        slowest = max(timings)
        fastest = min(timings)
    else:
        # No boards recorded: report zeros instead of dividing by zero or
        # calling max()/min() on an empty list.
        mean_time = 0
        slowest = 0
        fastest = 0

    return {
        'total_score': hits,
        'average_duration': mean_time,
        'max_duration': slowest,
        'min_duration': fastest
    }

# Print score and duration stats for each run in progressive_results,
# starting from the third entry.
# NOTE(review): the [2:] slice skips progressive_results[0] and [1] -- confirm
# that leaving those two runs out of the report is intentional.
for model_data in progressive_results[2:]:
    model = model_data['model']
    instruction = model_data['instruction']
    stats = calculate_stats(progressive_results, model_data)

    # Assemble the whole report first, then emit it with a single print; the
    # leading "\n" keeps a blank line between consecutive reports.
    report_lines = [
        f"\nStats for '{model}' model with instruction '{instruction}':",
        f"Total Score: {stats['total_score']:.2f}",
        "Duration (seconds):",
        f"  Min: {stats['min_duration']:.2f}",
        f"  Avg: {stats['average_duration']:.2f}",
        f"  Max: {stats['max_duration']:.2f}",
    ]
    print("\n".join(report_lines))



Stats for 'o1-preview' model with instruction 'Let's think step-by-step.':
Total Score: 20.00
Duration (seconds):
  Min: 10.60
  Avg: 37.37
  Max: 141.43

Stats for 'o1-preview' model with instruction '
    A player can play a perfect game of tic-tac-toe (to win or at least draw) if, each time it is their turn to play, they choose the first available move from the following list, as used in Newell and Simon's 1972 tic-tac-toe program.[19]
    1. Win: If the player has two in a row, they can place a third to get three in a row.
    2. Block: If the opponent has two in a row, the player must play the third themselves to block the opponent.
    3. Fork: Cause a scenario where the player has two ways to win (two non-blocked lines of 2).
    4. Block an opponent's fork: If there is only one possible fork for the opponent, the player should block it. Otherwise, the player should block all forks in any way that simultaneously allows them to make two in a row. Otherwise, the player should mak