In [1]:
# Change directory to the root of the project
import os

# Remember the directory the kernel was in the first time this cell ran.
# Relative chdir('..') calls are cumulative, so without this anchor every
# re-run of the cell would climb three more levels up the tree.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

# The project root sits three levels above the starting directory
# (assumes the kernel starts in the notebook's own folder).
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir, os.pardir, os.pardir))
print(f"Working directory: {os.getcwd()}")

Working directory: /Users/eohjelle/Documents/2025-dots-and-boxes/dots-and-boxes


In this notebook we will create an exhaustive training data set for dots and boxes using the Minimax agent, in the form of a replay buffer compatible with AlphaZeroTrainer. The idea is to use this dataset to run some sweeps, and to understand which deep learning models will perform best in theory.

# Create training set

In [2]:
from applications.dots_and_boxes.game_state import DotsAndBoxesGameState
from core.implementations.Minimax import Minimax

# Board size, measured in boxes
rows, cols = 3, 2

# Create the minimax agent on a fresh board and call it once to expand the
# game tree; the value returned by the call is shown as the cell output.
state = DotsAndBoxesGameState(rows, cols)
agent = Minimax(state)
agent()

(1, 0)

In [3]:
# Get list of unique examples, translated into AlphaZero format for compatibility with models

from core.data_structures import TrainingExample

def example(state, node):
    """Build a TrainingExample for one (state, node) pair of the minimax tree.

    The policy target assigns 0.0 to every action in ``node.children`` and
    spreads probability uniformly over ``node.value.best_actions``. The value
    target is ``node.value.value``. The legal actions (the keys of
    ``node.children``) are stored under ``data['legal_actions']``.
    """
    optimal = node.value.best_actions
    policy = dict.fromkeys(node.children.keys(), 0.0)
    # The share is computed inside the comprehension so that an empty
    # `optimal` never triggers a division.
    policy.update({action: 1/len(optimal) for action in optimal})
    legal_actions = list(node.children.keys())
    return TrainingExample(
        state=state,
        target=(policy, node.value.value),
        data={'legal_actions': legal_actions},
    )

# One training example per (state, node) entry recorded in the agent's state_dict.
examples = [
    example(position, tree_node)
    for position, tree_node in agent.state_dict.items()
]


In [4]:
# Arbitrary offset into `examples`; the three entries starting here are printed.
k = 2836

# The loop variable is deliberately not named `example`: that name is the
# builder function used to create `examples`, and rebinding it here would
# leave a TrainingExample under that name after this cell has run.
for i, sample in enumerate(examples[k:k+3]):
    print(f"Example {i+1}:")
    print(f"State: \n{sample.state}")
    print(f"Target: {sample.target}")
    print(f"Data: {sample.data}")
    print("\n")

print(f"Number of unique examples: {len(examples)}")

Example 1:
State: 
+ - + - +
|   | B |
+ . + - +
.   |   .
+ - + . +
|   |   |
+ . + - +

 Player -1 to move.
Target: ({(2, 1): 0.2, (3, 0): 0.2, (3, 4): 0.2, (4, 3): 0.2, (6, 1): 0.2}, -1)
Data: {'legal_actions': [(2, 1), (3, 0), (3, 4), (4, 3), (6, 1)]}


Example 2:
State: 
+ - + - +
|   | B |
+ . + - +
.   |   .
+ - + . +
| A |   |
+ - + - +

 Player -1 to move.
Target: ({(2, 1): 0.25, (3, 0): 0.25, (3, 4): 0.25, (4, 3): 0.25}, 1)
Data: {'legal_actions': [(2, 1), (3, 0), (3, 4), (4, 3)]}


Example 3:
State: 
+ - + - +
|   | B |
+ . + - +
.   |   .
+ - + . +
| B |   .
+ - + - +

 Player 1 to move.
Target: ({(2, 1): 0.2, (3, 0): 0.2, (3, 4): 0.2, (4, 3): 0.2, (5, 4): 0.2}, -1)
Data: {'legal_actions': [(2, 1), (3, 0), (3, 4), (4, 3), (5, 4)]}


Number of unique examples: 131072


Note that the number of unique examples equals $2^{\#\text{edges}}$, as expected: the $3 \times 2$ board has $17$ edges, and $2^{17} = 131072$.

In [5]:
# Test encoding and decoding

from applications.dots_and_boxes.encoder import DABSimpleTensorMapping, DABMultiLayerTensorMapping
import torch


# Encode on the CPU for this sanity check; built once instead of per call.
cpu_device = torch.device('cpu')

# The loop variable is not named `example` so that the module-level builder
# function of that name is not overwritten by a TrainingExample.
for i, sample in enumerate(examples[k:k+3]):
    encoded_state = DABSimpleTensorMapping.encode_state(sample.state, device=cpu_device)
    # encode_example returns a pair: element 0 holds the 'policy' and 'value'
    # targets, element 1 holds the 'legal_actions' mask.
    encoded_example = DABSimpleTensorMapping.encode_example(sample, device=cpu_device)
    print(f"\nExample {i+1}:")
    print(f"Encoded state: {encoded_state}")
    print(f"Encoded policy: {encoded_example[0]['policy']}")
    print(f"Encoded value: {encoded_example[0]['value']}")
    print(f"Encoded legal actions: {encoded_example[1]['legal_actions']}")



Example 1:
Encoded state: tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1])
Encoded policy: tensor([0.0000, 0.0000, 0.0000, 0.2000, 0.0000, 0.2000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.2000, 0.0000, 0.2000, 0.0000, 0.0000, 0.2000, 0.0000])
Encoded value: -1.0
Encoded legal actions: tensor([0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.])

Example 2:
Encoded state: tensor([1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1])
Encoded policy: tensor([0.0000, 0.0000, 0.0000, 0.2500, 0.0000, 0.2500, 0.0000, 0.0000, 0.0000,
        0.0000, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000, 0.2500, 0.0000])
Encoded value: 1.0
Encoded legal actions: tensor([0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.])

Example 3:
Encoded state: tensor([1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1])
Encoded policy: tensor([0.0000, 0.0000, 0.0000, 0.2000, 0.0000, 0.2000, 0.0000, 0.0000, 0.2000,
        0.0000, 0.2000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2000, 0.

In [6]:
from core.data_structures import ReplayBuffer
import torch

# Pick the device to encode on: Apple MPS when present, otherwise CUDA,
# otherwise the CPU, so the cell also runs on machines without MPS.
# Override `device` by hand here if a specific device is required.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Build one replay buffer per tensor mapping and write it to disk.
# Note: `path` and `buffer` keep the values from the last iteration and are
# read by the loading cell that follows.
for tensor_mapping in [DABSimpleTensorMapping]:
    state_encoder = lambda state: tensor_mapping.encode_state(state, device)
    example_encoder = lambda example: tensor_mapping.encode_example(example, device)
    # Size the buffer to hold the whole exhaustive data set.
    buffer = ReplayBuffer(max_size=len(examples))
    buffer.extend(examples, state_encoder, example_encoder)
    artifact_name = f'dots_and_boxes_{rows}x{cols}_{tensor_mapping.__name__}_minimax'
    path = f'applications/dots_and_boxes/training_data/{artifact_name}.pkl'
    # Make sure the target folder exists before saving.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    buffer.save(path)
    # buffer.save_to_wandb(
    #     artifact_name=artifact_name,
    #     project='AlphaZero-DotsAndBoxes',
    #     description=f'Training data for dots and boxes with board size {rows}x{cols} using {tensor_mapping.__name__} model created by Minimax agent'
    # )

In [7]:
# Try loading the training data
# `path` is whatever the save loop last assigned, so this reads back the most
# recently written file and rebinds `buffer` to the loaded copy.
buffer = ReplayBuffer.from_file(path)

  checkpoint = torch.load(path, map_location=device)
