In [1]:
# Change directory to the root of the project (only if we are not already there)
import os

# The project root is recognised by its `applications/` folder, which later cells
# import from and write into. Guarding the chdir keeps this cell idempotent:
# re-running it (or launching the kernel from the root) no longer climbs one
# directory too far and breaks the imports and relative paths below.
if not os.path.isdir('applications'):
    os.chdir('..')

In this notebook we create an exhaustive training dataset for tic-tac-toe using the Minimax agent, stored as a replay buffer compatible with AlphaZeroTrainer. The idea is to use this dataset to run some sweeps and to understand which deep learning models should perform best in theory.

# Create training set

In [2]:
from applications.tic_tac_toe.game_state import TicTacToeState
from core.implementations.Minimax import Minimax

# Create the Minimax agent on a default-constructed state (presumably the empty
# board — TODO confirm) and call it once to expand the game tree.
# The expanded tree is read from `agent.root` in the next cell. As the last
# expression of the cell, the call's return value is displayed (it looks like
# the chosen move — confirm against Minimax.__call__).
state = TicTacToeState()
agent = Minimax(state)
agent()

(0, 1)

In [3]:
# Get list of unique examples, translated into AlphaZero format for compatibility with models

from core.implementations.AlphaZero import AlphaZeroTarget
from core.data_structures import TrainingExample
from typing import Dict, Tuple

def get_subexamples(root) -> Dict[TicTacToeState, TrainingExample[Tuple[int, int], AlphaZeroTarget]]:
    """Collect one training example per distinct state in the tree rooted at ``root``.

    Each visited node yields a ``TrainingExample`` whose target is the pair
    ``(policy, value)``: ``policy`` assigns an equal share to every action in
    ``node.value.best_actions`` and 0.0 to the node's remaining actions, and
    ``value`` is ``node.value.value``. The node's actions are also stored under
    ``data['legal_actions']``.

    Args:
        root: Tree node exposing ``state`` (used as a dict key), ``children``
            (mapping from action to child node) and ``value`` (an object with
            ``best_actions`` and ``value`` attributes).

    Returns:
        Dict mapping each state to its example. Keys appear in order of first
        visit of a depth-first, parent-before-children walk; when a state is
        reached more than once, the example built on the last visit is kept.
    """
    state_to_examples = {}

    def visit(node) -> None:
        # Write straight into the shared dict instead of building a dict per
        # node and merging it into the parent, which re-copied every entry
        # once per ancestor level. Visit order is unchanged, so key order and
        # last-write-wins values match the merge-based version.
        children = node.children
        best_actions = node.value.best_actions

        policy = {action: 0.0 for action in children.keys()}
        if len(best_actions) > 0:
            # Loop-invariant: every best action receives the same share.
            share = 1 / len(best_actions)
            for action in best_actions:
                policy[action] = share

        state_to_examples[node.state] = TrainingExample(
            state=node.state,
            target=(policy, node.value.value),
            data={'legal_actions': list(children.keys())}
        )
        for child in children.values():
            visit(child)

    visit(root)
    return state_to_examples

# Keep only the examples (one per distinct state), in the dict's insertion order.
examples = [*get_subexamples(agent.root).values()]

In [4]:
# Show three consecutive examples, starting at an arbitrary offset, as a sanity check.
k = 1053
for position, sample in enumerate(examples[k:k + 3], start=1):
    # One print per example; sep="\n" reproduces the line-by-line layout and the
    # trailing "\n" item leaves the same blank lines between examples.
    print(
        f"Example {position}:",
        f"State: \n{sample.state}",
        f"Target: {sample.target}",
        f"Data: {sample.data}",
        "\n",
        sep="\n",
    )

print(f"Number of unique examples: {len(examples)}")

Example 1:
State: 
X |  | O
---------
 | O | X
---------
X | O | 
Target: ({(0, 1): 0.0, (1, 0): 1.0, (2, 2): 0.0}, 1.0)
Data: {'legal_actions': [(0, 1), (1, 0), (2, 2)]}


Example 2:
State: 
X |  | O
---------
 | O | X
---------
X | O | X
Target: ({(0, 1): 1.0, (1, 0): 0.0}, 1.0)
Data: {'legal_actions': [(0, 1), (1, 0)]}


Example 3:
State: 
X |  | O
---------
 | O | X
---------
X |  | O
Target: ({(0, 1): 0.0, (1, 0): 1.0, (2, 1): 0.0}, 1.0)
Data: {'legal_actions': [(0, 1), (1, 0), (2, 1)]}


Number of unique examples: 5478


In [5]:
from core.data_structures import ReplayBuffer
from applications.tic_tac_toe.mlp_model import TicTacToeModelInterface
from applications.tic_tac_toe.transformer_model import TicTacToeTransformerInterface
import torch

# Pick the encoding device: Apple MPS when available (the original default),
# otherwise CUDA, otherwise CPU, so the notebook runs unchanged on any machine.
# Assign `device` explicitly here if a specific device is required.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Encode the same examples once per model interface and save one replay buffer each.
output_dir = 'applications/tic_tac_toe/training_data'
# Make sure the target folder exists so saving also works on a fresh checkout
# (assumes ReplayBuffer.save does not create missing folders itself — TODO confirm).
os.makedirs(output_dir, exist_ok=True)

for model_name, interface in [
    ('mlp', TicTacToeModelInterface),
    ('transformer', TicTacToeTransformerInterface)
]:
    # Bind `interface` as a default argument: a plain closure looks the loop
    # variable up at call time, so an encoder invoked after the loop advanced
    # would silently use the wrong interface.
    state_encoder = lambda state, _interface=interface: _interface.encode_state(state, device)
    example_encoder = lambda example, _interface=interface: _interface.encode_example(example, device)
    # Capacity equals the number of examples passed in.
    buffer = ReplayBuffer(max_size=len(examples))
    buffer.extend(examples, state_encoder, example_encoder)
    buffer.save(f'{output_dir}/{model_name}.pkl')