In [1]:
# Change directory to the root of the project (three levels above the notebook).
import os

# Remember the directory the kernel started in the first time this cell runs.
# Chaining relative os.chdir('..') calls would climb three more levels on every
# re-run of the cell; anchoring to the recorded start directory keeps the cell
# idempotent within a kernel session.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.normpath(os.path.join(NOTEBOOK_START_DIR, os.pardir, os.pardir, os.pardir)))
print(f"Working directory: {os.getcwd()}")

Working directory: /Users/eohjelle/Documents/2025-dots-and-boxes/dots-and-boxes


In this notebook we will create an exhaustive training data set for dots and boxes using the Minimax agent, in the form of a replay buffer compatible with AlphaZeroTrainer. The idea is to use this dataset to run some sweeps, and to understand which deep learning models will perform best in theory.

# Create training set

In [2]:
from applications.dots_and_boxes.game_state import DotsAndBoxesGameState
from core.implementations.Minimax import Minimax

# Board dimensions; they are reused later when naming the saved artifact.
rows, cols = 2, 2

# Create the minimax agent on the initial position and expand the game tree.
# Calling the agent is left as the last expression so its return value is displayed.
state = DotsAndBoxesGameState(rows, cols)
agent = Minimax(state)
agent()

(1, 4)

In [3]:
# Check value of starting position: display the value stored on the root node
# of the agent's game tree.
agent.root.value.value

1

A value of +1 means the position is winning, and -1 means it is losing. It seems that the starting position is winning on a 2 x 2 board and losing on a 3 x 2 board, which is pretty interesting.

In [4]:
# Get list of unique examples, translated into AlphaZero format for compatibility with models

from core.data_structures import TrainingExample

def example(state, node):
    """Turn one solved minimax node into an AlphaZero-style TrainingExample.

    The policy target is uniform over ``node.value.best_actions`` and 0.0 for
    every other key of ``node.children``; the value target is
    ``node.value.value``. The keys of ``node.children`` are also stored as the
    legal actions in the example's ``data``.

    Args:
        state: The game state the node belongs to.
        node: The node for that state; must expose ``children`` (a mapping
            keyed by action) and ``value`` (with ``best_actions`` and ``value``).

    Returns:
        A TrainingExample with ``target=(policy, value)``.
    """
    legal_actions = list(node.children.keys())
    optimal_actions = node.value.best_actions

    # Start every legal action at zero, then spread the mass evenly over the
    # best ones. The division sits inside the generator so an empty
    # best-action list never triggers a division by zero.
    policy = dict.fromkeys(legal_actions, 0.0)
    policy.update((action, 1/len(optimal_actions)) for action in optimal_actions)

    return TrainingExample(
        state=state,
        target=(policy, node.value.value),
        data={'legal_actions': legal_actions}
    )

# One training example per (state, node) entry in the agent's state_dict.
examples = [example(state, node) for state, node in agent.state_dict.items()]


In [5]:
# Print three consecutive examples starting at index k as a sanity check.
# k is an arbitrary offset into the list; the encoding check further down
# reuses it to show the same three examples.
k = 2836
# The loop variable is named `ex` rather than `example` so that it does not
# rebind the global `example()` helper defined in the previous cell.
for i, ex in enumerate(examples[k:k+3]):
    print(f"Example {i+1}:")
    print(f"State: \n{ex.state}")
    print(f"Target: {ex.target}")
    print(f"Data: {ex.data}")
    print("\n")

print(f"Number of unique examples: {len(examples)}")

Example 1:
State: 
+ - + . +
.   .   |
+ . + - +
.   | B |
+ . + - +

 Player -1 to move.
Target: ({(0, 3): 0.0, (1, 0): 0.0, (1, 2): 0.0, (2, 1): 1.0, (3, 0): 0.0, (4, 1): 0.0}, 0)
Data: {'legal_actions': [(0, 3), (1, 0), (1, 2), (2, 1), (3, 0), (4, 1)]}


Example 2:
State: 
+ - + . +
.   .   |
+ . + - +
.   | A |
+ - + - +

 Player 1 to move.
Target: ({(0, 3): 0.2, (1, 0): 0.2, (1, 2): 0.2, (2, 1): 0.2, (3, 0): 0.2}, 1)
Data: {'legal_actions': [(0, 3), (1, 0), (1, 2), (2, 1), (3, 0)]}


Example 3:
State: 
+ - + . +
.   .   |
+ - + - +
.   | B |
+ . + - +

 Player 1 to move.
Target: ({(0, 3): 0.0, (1, 0): 0.0, (1, 2): 0.0, (3, 0): 0.5, (4, 1): 0.5}, 0)
Data: {'legal_actions': [(0, 3), (1, 0), (1, 2), (3, 0), (4, 1)]}


Number of unique examples: 5559


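Note that the number of unique examples (5559) is larger than $2^{\#\text{edges}} = 2^{12} = 4096$ for the 2 x 2 board. This is consistent with a state recording more than the set of drawn edges: the printed states above also show the box owners and the player to move, so a single edge configuration can correspond to several distinct states.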

In [6]:
# Test encoding and decoding

from applications.dots_and_boxes.encoder import DABSimpleTensorMapping, DABMultiLayerTensorMapping, DABMiddleGroundTensorMapping
import torch

# Pick the best available accelerator instead of hard-coding 'mps', so the
# notebook also runs on machines without Apple's Metal backend.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Encode all states, then all (policy, value) targets and auxiliary data,
# with the chosen tensor mapping.
tensor_mapping = DABMiddleGroundTensorMapping
states = tensor_mapping.encode_states([ex.state for ex in examples], device) # type: ignore
targets, data = tensor_mapping.encode_examples(examples, device)

# Show the encoded form of the same three examples printed earlier (indices k..k+2).
for i in range(k, k+3):
    print(f"\nExample {i+1-k}:")
    print(f"Encoded state: {states[i]}")
    print(f"Encoded policy: {targets['policy'][i]}")
    print(f"Encoded value: {targets['value'][i]}")
    print(f"Encoded legal actions: {data['legal_actions'][i]}")



Example 1:
Encoded state: tensor([0, 0, 1, 0, 1, 1, 3, 2, 2, 2, 3, 3, 4, 4, 4, 6], device='mps:0')
Encoded policy: tensor([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.], device='mps:0')
Encoded value: 0.0
Encoded legal actions: tensor([1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.], device='mps:0')

Example 2:
Encoded state: tensor([0, 0, 1, 0, 1, 1, 3, 2, 3, 2, 3, 3, 4, 4, 4, 5], device='mps:0')
Encoded policy: tensor([0.2000, 0.2000, 0.0000, 0.2000, 0.0000, 0.0000, 0.0000, 0.2000, 0.0000,
        0.2000, 0.0000, 0.0000], device='mps:0')
Encoded value: 1.0
Encoded legal actions: tensor([1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.], device='mps:0')

Example 3:
Encoded state: tensor([0, 0, 1, 0, 1, 1, 3, 3, 2, 2, 3, 3, 4, 4, 4, 6], device='mps:0')
Encoded policy: tensor([0.0000, 0.0000, 0.0000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5000,
        0.0000, 0.0000, 0.0000], device='mps:0')
Encoded value: 0.0
Encoded legal actions: tensor([1., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0

In [7]:
# Check entropy of target distribution

import torch.nn.functional as F

# Entropy of each target policy: H(p) = -sum_a p(a) * log p(a).
# Zero-probability entries are replaced by 1 before taking the log so that
# they contribute 0 * log(1) = 0 instead of 0 * log(0) = nan.
non_zero_policy = torch.where(targets['policy'] > 0, targets['policy'], torch.ones_like(targets['policy']))
log_probs = torch.log(non_zero_policy)  # log-probabilities, not logits
entropy = - torch.sum(targets['policy'] * log_probs, dim=1)

print(f"Some sample entropies: {entropy[:10]}")
print(f"Average entropy: {entropy.mean()}")

# Compare with entropy of random logits
# (strictly, this is the cross-entropy between the targets and the softmax of
# random logits, i.e. the loss of an untrained random predictor).
# Seed the generator so this baseline is the same on every run.
torch.manual_seed(0)
random_logits = torch.randn_like(targets['policy'])
random_entropy = F.cross_entropy(random_logits, targets['policy'])

print(f"Average entropy of random logits: {random_entropy}")

Some sample entropies: tensor([2.0794, 2.3979, 2.3979, 2.3979, -0.0000, 2.3979, -0.0000, -0.0000, 2.3979,
        -0.0000], device='mps:0')
Average entropy: 0.7462002038955688
Average entropy of random logits: 2.888861894607544


For 3 x 2 board, entropy $\approx 0.93$.
For 2 x 2 board, entropy $\approx 0.7462$.

This is useful to know because the average entropy of the target policies is a lower bound on the cross-entropy loss that any model can achieve on this dataset.

In [8]:
from core.data_structures import ReplayBuffer

# Pack every encoded example into a buffer sized to hold the whole dataset.
n_examples = len(examples)
buffer = ReplayBuffer(max_size=n_examples)
buffer.extend(states, targets, data)

# The artifact is named after the board size and the tensor mapping; the same
# name is reused for the local file.
artifact_name = f'dots_and_boxes_{rows}x{cols}_{tensor_mapping.__name__}_minimax'
path = f'applications/dots_and_boxes/training_data/{artifact_name}.pkl'
artifact_description = f'Training data for dots and boxes with board size {rows}x{cols} and {tensor_mapping.__name__} tensor mapping created by Minimax agent'

# Write the local copy first, then upload the same buffer to Weights & Biases.
buffer.save(path)
wandb_kwargs = dict(
    artifact_name=artifact_name,
    project='AlphaZero-DotsAndBoxes',
    description=artifact_description,
)
buffer.save_to_wandb(**wandb_kwargs)

[34m[1mwandb[0m: Currently logged in as: [33meohjelle[0m ([33meigenway[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [9]:
# Try loading the training data back from the file written by buffer.save(path),
# as a check that the saved file can be read.
# NOTE(review): the warning printed by this cell points at a torch.load call, so
# loading presumably unpickles the file — only load files you trust; confirm in
# ReplayBuffer.from_file.
buffer = ReplayBuffer.from_file(path)

  checkpoint = torch.load(path, map_location=device)
