In [1]:
# Change directory to the root of the project
import os 
os.chdir('..')
os.chdir('..')
os.chdir('..')
print(f"Working directory: {os.getcwd()}")

Working directory: /Users/eohjelle/Documents/2025-dots-and-boxes/dots-and-boxes


In this notebook we will create an exhaustive training data set for tic tac toe using the Minimax agent, in the form of a replay buffer compatible with AlphaZeroTrainer. The idea is to use this dataset to run some sweeps, and to understand which deep learning models will perform best in theory.

# Create training set

In [2]:
from applications.tic_tac_toe.game_state import TicTacToeState
from core.implementations.Minimax import Minimax

# Creat minmax agent and expand the game tree
state = TicTacToeState()
agent = Minimax(state)
agent()

(1, 1)

In [3]:
# This is for testing the state_dict design

def count_nodes(root):
    if root.is_leaf():
        return 1
    else:
        return 1 + sum(count_nodes(child) for child in root.children.values())

print(f"Number of nodes: {count_nodes(agent.root)}")

def count_states(agent):
    return len(agent.state_dict)

print(f"Number of states: {count_states(agent)}")


Number of nodes: 549946
Number of states: 5478


In [4]:
# Get list of unique examples, translated into AlphaZero format for compatibility with models

from core.data_structures import TrainingExample

def example(state, node):
    policy = {action: 0.0 for action in node.children.keys()}
    for action in node.value.best_actions:
        policy[action] = 1/len(node.value.best_actions)
    return TrainingExample(
        state=state,
        target=(policy, node.value.value),
        data={'legal_actions': list(node.children.keys())}
    )

examples = [example(state, node) for state, node in agent.state_dict.items()]

In [5]:
k = 2836
for i, example in enumerate(examples[k:k+3]):
    print(f"Example {i+1}:")
    print(f"State: \n{example.state}")
    print(f"Target: {example.target}")
    print(f"Data: {example.data}")
    print("\n")

print(f"Number of unique examples: {len(examples)}")

Example 1:
State: 
 | X | X
---------
O | O | 
---------
O | X | 
Target: ({(0, 0): 1.0, (1, 2): 0.0, (2, 2): 0.0}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2), (2, 2)]}


Example 2:
State: 
 | X | X
---------
O | O | 
---------
 | X | O
Target: ({(0, 0): 1.0, (1, 2): 0.0, (2, 0): 0.0}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2), (2, 0)]}


Example 3:
State: 
 | X | X
---------
O | O | 
---------
O | X | X
Target: ({(0, 0): 0.5, (1, 2): 0.5}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2)]}


Number of unique examples: 5478


In [6]:
# Test encoding and decoding

from applications.tic_tac_toe.tensor_mapping import TokenizedTensorMapping
import torch

device = torch.device('mps') # Change to 'cuda' or 'cpu' if needed

tensor_mapping = TokenizedTensorMapping
states = tensor_mapping.encode_states([ex.state for ex in examples], device) # type: ignore
targets, data = tensor_mapping.encode_examples(examples, device)

for i in range(k, k+3):
    print(f"\nExample {i+1-k}:")
    print(f"Encoded state: {states[i]}")
    print(f"Encoded policy: {targets['policy'][i]}")
    print(f"Encoded value: {targets['value'][i]}")
    print(f"Encoded legal actions: {data['legal_actions'][i]}")


Example 1:
Encoded state: tensor([0, 1, 1, 2, 2, 0, 2, 1, 0], device='mps:0')
Encoded policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.], device='mps:0')
Encoded value: 1.0
Encoded legal actions: tensor([1., 0., 0., 0., 0., 1., 0., 0., 1.], device='mps:0')

Example 2:
Encoded state: tensor([0, 1, 1, 2, 2, 0, 0, 1, 2], device='mps:0')
Encoded policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.], device='mps:0')
Encoded value: 1.0
Encoded legal actions: tensor([1., 0., 0., 0., 0., 1., 1., 0., 0.], device='mps:0')

Example 3:
Encoded state: tensor([0, 1, 1, 2, 2, 0, 2, 1, 1], device='mps:0')
Encoded policy: tensor([0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000],
       device='mps:0')
Encoded value: 1.0
Encoded legal actions: tensor([1., 0., 0., 0., 0., 1., 0., 0., 0.], device='mps:0')


In [7]:
# Check entropy of target distribution

import torch.nn.functional as F

non_zero_policy = torch.where(targets['policy'] > 0, targets['policy'], torch.ones_like(targets['policy']))
logits = torch.log(non_zero_policy)
entropy = - torch.sum(targets['policy'] * logits, dim=1)

print(f"Some sample entropies: {entropy[:10]}")
print(f"Average entropy: {entropy.mean()}")

# Compare with entropy of random logits
random_logits = torch.randn_like(targets['policy'])
random_entropy = F.cross_entropy(random_logits, targets['policy'])

print(f"Average entropy of random logits: {random_entropy}")

Some sample entropies: tensor([2.1972, -0.0000, 1.3863, -0.0000, 1.3863, 1.3863, 1.3863, -0.0000, 1.3863,
        -0.0000], device='mps:0')
Average entropy: 0.40638184547424316
Average entropy of random logits: 2.154865264892578


In [8]:
from core.data_structures import ReplayBuffer

buffer = ReplayBuffer(max_size=len(examples))
buffer.extend(states, targets, data)
artifact_name = f'tic_tac_toe_{tensor_mapping.__name__}_minimax'
path = f'applications/tic_tac_toe/training_data/{artifact_name}.pkl'
buffer.save(path)
# buffer.save_to_wandb(
#     artifact_name=artifact_name,
#     project='AlphaZero-TicTacToe',
#     description=f'Training data for tic tac toe and {tensor_mapping.__name__} tensor mapping created by Minimax agent'
# )

RuntimeError: Parent directory applications/tic_tac_toe/training_data does not exist.