In [1]:
# Change directory to the root of the project
import os 
os.chdir('../../..')
print(f"Working directory: {os.getcwd()}")

Working directory: /Users/eohjelle/Documents/2025-mcts-playground/mcts-playground


In this notebook we will create an exhaustive training data set for tic tac toe using the Minimax agent, in the form of a replay buffer compatible with AlphaZeroTrainer. The idea is to use this dataset to run some sweeps, and to understand which deep learning models will perform best in theory.

# Create training set

In [2]:
from core.games.open_spiel_state_wrapper import OpenSpielState
from core.algorithms import Minimax
import pyspiel

# Creat minmax agent and expand the game tree
game = pyspiel.load_game("connect_four")
state = OpenSpielState(game.new_initial_state(), num_players=2)
agent = Minimax(state)
agent()

# # Get list of unique examples, translated into AlphaZero format for compatibility with models

# def example(state, node):
#     policy = {action: 0.0 for action in node.children.keys()}
#     for action in node.value.best_actions:
#         policy[action] = 1/len(node.value.best_actions)

KeyboardInterrupt: 

In [3]:
# This is for testing the state_dict design

def count_nodes(root):
    if root.is_leaf():
        return 1
    else:
        return 1 + sum(count_nodes(child) for child in root.children.values())

print(f"Number of nodes: {count_nodes(agent.root)}")

def count_states(agent):
    return len(agent.state_dict)

print(f"Number of states: {count_states(agent)}")


Number of nodes: 8
Number of states: 8


In [4]:
for state, node in agent.state_dict.items():
    print(state)
    print(node.value)
    print(node.children)
    print("-"*100)

.......
.......
.......
.......
.......
.......

MinimaxValue(player=0, value=0.0, best_actions=[0, 1, 2, 3, 4, 5, 6])
{0: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x13457ccd0>, value=None, children={}), 1: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x13457cf50>, value=None, children={}), 2: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x134530fc0>, value=None, children={}), 3: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x134531220>, value=None, children={}), 4: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x1345a27b0>, value=None, children={}), 5: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x1344c3460>, value=None, children={}), 6: Node(state=<core.games.open_spiel_state_wrapper.OpenSpielState object at 0x1344c3680>, value=None, children={})}
--------------------------------------------------------------------

In [4]:
# Get list of unique examples, translated into AlphaZero format for compatibility with models

from core.data_structures import TrainingExample

def example(state, node):
    policy = {action: 0.0 for action in node.children.keys()}
    for action in node.value.best_actions:
        policy[action] = 1/len(node.value.best_actions)
    return TrainingExample(
        state=state,
        target=(policy, node.value.value),
        extra_data={'legal_actions': list(node.children.keys())}
    )

examples = [example(state, node) for state, node in agent.state_dict.items()]

In [5]:
k = 2836
for i, example in enumerate(examples[k:k+3]):
    print(f"Example {i+1}:")
    print(f"State: \n{example.state}")
    print(f"Target: {example.target}")
    print(f"Data: {example.extra_data}")
    print("\n")

print(f"Number of unique examples: {len(examples)}")

Example 1:
State: 
 | X | X
---------
O | O | 
---------
O | X | 
Target: ({(0, 0): 1.0, (1, 2): 0.0, (2, 2): 0.0}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2), (2, 2)]}


Example 2:
State: 
 | X | X
---------
O | O | 
---------
 | X | O
Target: ({(0, 0): 1.0, (1, 2): 0.0, (2, 0): 0.0}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2), (2, 0)]}


Example 3:
State: 
 | X | X
---------
O | O | 
---------
O | X | X
Target: ({(0, 0): 0.5, (1, 2): 0.5}, 1.0)
Data: {'legal_actions': [(0, 0), (1, 2)]}


Number of unique examples: 5478


In [6]:
# Test encoding and decoding

from experiments.experimenting_with_model_architectures_in_tic_tac_toe.tensor_mapping import MLPTensorMapping, TokenizedTensorMapping
import torch

tokenized_states, tokenized_targets, tokenized_data = TokenizedTensorMapping.encode_examples(examples, device=torch.device('cpu'))
mlp_states, mlp_targets, mlp_data = MLPTensorMapping.encode_examples(examples, device=torch.device('cpu'))

for i in range(k, k+3):
    print(f"\nExample {i+1-k}:")
    print(f"Tokenized state: {tokenized_states[i]}")
    print(f"Tokenized policy: {tokenized_targets['policy'][i]}")
    print(f"Tokenized value: {tokenized_targets['value'][i]}")
    print(f"Tokenized legal actions: {tokenized_data['legal_actions'][i]}")
    print(f"MLP state: {mlp_states[i]}")
    print(f"MLP policy: {mlp_targets['policy'][i]}")
    print(f"MLP value: {mlp_targets['value'][i]}")
    print(f"MLP legal actions: {mlp_data['legal_actions'][i]}")


Example 1:
Tokenized state: tensor([0, 1, 1, 2, 2, 0, 2, 1, 0])
Tokenized policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
Tokenized value: 1.0
Tokenized legal actions: tensor([ True, False, False, False, False,  True, False, False,  True])
MLP state: tensor([0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.])
MLP policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
MLP value: 1.0
MLP legal actions: tensor([ True, False, False, False,  True,  True,  True,  True,  True])

Example 2:
Tokenized state: tensor([0, 1, 1, 2, 2, 0, 0, 1, 2])
Tokenized policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
Tokenized value: 1.0
Tokenized legal actions: tensor([ True, False, False, False, False,  True,  True, False, False])
MLP state: tensor([0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1.])
MLP policy: tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.])
MLP value: 1.0
MLP legal actions: tensor([ True, False, False,  True,  True,  True,  True,  True, False]

In [7]:
# Check entropy of target distribution

import torch.nn.functional as F

targets = tokenized_targets

non_zero_policy = torch.where(targets['policy'] > 0, targets['policy'], torch.ones_like(targets['policy']))
logits = torch.log(non_zero_policy)
entropy = - torch.sum(targets['policy'] * logits, dim=1)

print(f"Some sample entropies: {entropy[:10]}")
print(f"Average entropy: {entropy.mean()}")

# Compare with entropy of random logits
random_logits = torch.randn_like(targets['policy'])
random_cross_entropy = F.cross_entropy(random_logits, targets['policy'])

print(f"KL divergence between target and random logits: {random_cross_entropy - entropy.mean()}")

# Compare with entropy of uniform logits
uniform_logits = torch.zeros_like(targets['policy'])
uniform_cross_entropy = F.cross_entropy(uniform_logits, targets['policy'])

print(f"KL divergence between target and uniform logits: {uniform_cross_entropy - entropy.mean()}")

Some sample entropies: tensor([2.1972, -0.0000, 1.3863, -0.0000, 1.3863, 1.3863, 1.3863, -0.0000, 1.3863,
        -0.0000])
Average entropy: 0.40638184547424316
KL divergence between target and random logits: 1.742671251296997
KL divergence between target and uniform logits: 1.406589150428772


In [8]:
from core.data_structures import ReplayBuffer

for tensor_mapping, states, targets, data in zip([MLPTensorMapping, TokenizedTensorMapping], [mlp_states, tokenized_states], [mlp_targets, tokenized_targets], [mlp_data, tokenized_data]):
    buffer = ReplayBuffer(max_size=len(examples))
    buffer.add(states, targets, data)
    artifact_name = f'buffer_minimax_{tensor_mapping.__name__}'
    path = f'experiments/experimenting_with_model_architectures_in_tic_tac_toe/data/{artifact_name}.pt'
    buffer.save_to_file(path)
# buffer.save_to_wandb(
#     artifact_name=artifact_name,
#     project='AlphaZero-TicTacToe',
#     description=f'Training data for tic tac toe and {tensor_mapping.__name__} tensor mapping created by Minimax agent'
# )