In [1]:
import sys
sys.path.append("../../")
import time
import torch
from losses.basic_losses import CategoricalCrossentropyLoss, KLDivergenceLoss
from agents.random import RandomAgent
from agents.muzero import MuZeroAgent
from agent_configs.muzero_config import MuZeroConfig
from game_configs.variable_turn_tictactoe_config import VariableTurnTicTacToeConfig
from agents.tictactoe_expert import TicTacToeBestAgent
from modules.world_models.muzero_world_model import MuzeroWorldModel

# Pinned to CPU so timings stay comparable between runs; this cell does not
# auto-detect a GPU.
device = "cpu"  # change to "cuda" by hand to run on a GPU
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu


  from pkg_resources import resource_stream, resource_exists


In [2]:
# Base MuZero hyperparameters shared by the runs in this notebook. Later cells
# copy this mapping and override individual keys, so treat it as read-only.
# Key order is kept as originally written.
params = dict(
    # --- search / replay settings ---
    num_simulations=25,
    per_alpha=0.0,
    per_beta=0.0,
    per_beta_final=0.0,
    n_step=10,
    root_dirichlet_alpha=0.25,
    # --- network layout: one residual block, conv-only heads (no dense layers) ---
    residual_layers=[(24, 3, 1)],
    reward_dense_layer_widths=[],
    reward_conv_layers=[(16, 1, 1)],
    actor_dense_layer_widths=[],
    actor_conv_layers=[(16, 1, 1)],
    critic_dense_layer_widths=[],
    critic_conv_layers=[(16, 1, 1)],
    to_play_dense_layer_widths=[],
    to_play_conv_layers=[(16, 1, 1)],
    # --- value range / targets ---
    known_bounds=[-1, 1],
    support_range=None,
    # --- training loop ---
    minibatch_size=8,
    replay_buffer_size=100000,
    gumbel=False,
    gumbel_m=16,
    policy_loss_function=CategoricalCrossentropyLoss(),
    training_steps=20000,  # Reduced for benchmark speed
    transfer_interval=1,
    num_workers=4,
    world_model_cls=MuzeroWorldModel,
    # --- search batching (overridden by the batched run) ---
    search_batch_size=0,  # Iterative
    use_virtual_mean=False,
    virtual_loss=3.0,
    # --- runtime options ---
    compile=True,
    use_mixed_precision=True,
)

# Game definition handed to MuZeroConfig in the training cell.
game_config = VariableTurnTicTacToeConfig()


In [None]:
# Train a MuZero agent on variable-turn Tic-Tac-Toe with batched search enabled
# and report how long the training call took.
# Depends on earlier cells for: params, game_config, device, and the imports.
print("--- Running Multi Turn Tic Tac Toe ---")

# One named value for the batched-search width, used for both the config dict
# and the explicit attribute assignment below so the two cannot drift apart.
search_batch_size = 5

# A shallow copy is sufficient here: only top-level keys are reassigned, so the
# shared `params` mapping is left unmodified for other runs.
params_batched = params.copy()
params_batched["search_batch_size"] = search_batch_size
params_batched["use_virtual_mean"] = True

env_batch = VariableTurnTicTacToeConfig().make_env()
config_batch = MuZeroConfig(config_dict=params_batched, game_config=game_config)
config_batch.search_batch_size = search_batch_size  # Explicitly set

agent_batch = MuZeroAgent(
    env=env_batch,
    config=config_batch,
    name="variable_turn_tictactoe_muzero",
    device=device,  # reuse the notebook-level device instead of a second literal
    test_agents=[RandomAgent(), TicTacToeBestAgent()],
)
agent_batch.checkpoint_interval = 100
agent_batch.test_interval = 1000
agent_batch.test_trials = 100

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
agent_batch.train()
end_time = time.perf_counter()
print(f"MuZero Batched Time: {end_time - start_time:.2f}s")

--- Running Multi Turn Tic Tac Toe ---
Using default save_intermediate_weights     : False
Using         training_steps                : 20000
Using default adam_epsilon                  : 1e-08
Using default momentum                      : 0.9
Using default learning_rate                 : 0.001
Using default clipnorm                      : 0
Using default optimizer                     : <class 'torch.optim.adam.Adam'>
Using default weight_decay                  : 0.0
Using default num_minibatches               : 1
Using default training_iterations           : 1
Using default lr_schedule_type              : none
Using default lr_schedule_steps             : []
Using default lr_schedule_steps             : []
Using default lr_schedule_values            : []
Using         use_mixed_precision           : True
Using         compile                       : True
Using default compile_mode                  : default
Using         minibatch_size                : 8
Using         replay_buffer_s

  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists
  from pkg_resources import resource_stream, resource_exists


[Worker 0] Starting self-play...[Worker 3] Starting self-play...

[Worker 2] Starting self-play...[Worker 1] Starting self-play...

Started recording episode 0 to ./videos/variable_turn_tictactoe_muzero/0/episode_000000.mp4
Started recording episode 0 to ./videos/variable_turn_tictactoe_muzero/3/episode_000000.mp4Started recording episode 0 to ./videos/variable_turn_tictactoe_muzero/2/episode_000000.mp4

Started recording episode 0 to ./videos/variable_turn_tictactoe_muzero/1/episode_000000.mp4
Stopped recording episode 0. Recorded 7 frames.
Size: 0
Stopped recording episode 0. Recorded 9 frames.
Size: 7
learning
Stopped recording episode 0. Recorded 10 frames.
Size: 16
Stopped recording episode 0. Recorded 10 frames.
Size: 26
0
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_pl

  warn(


learning
100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 0, 0, 6, 5],
        [4, 0, 3, 2, 5],
        [2, 1, 0, 6, 5],
        [4, 0, 8, 6, 5],
        [2, 6, 3, 5, 7],
        [4, 3, 8, 6, 5],
        [6, 1, 0, 6, 5],
        [6, 1, 5, 0, 7]])
target value tensor([[-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0

  warn(


learning
200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 7, 3, 2, 8],
        [6, 0, 3, 4, 3],
        [6, 5, 1, 0, 2],
        [1, 4, 8, 7, 3],
        [7, 2, 4, 0, 3],
        [4, 1, 0, 0, 3],
        [5, 0, 0, 4, 3],
        [0, 1, 0, 4, 3]])
target value tensor([[ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0

  warn(


learning
300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 0, 8, 8, 0],
        [1, 6, 0, 3, 0],
        [5, 4, 1, 6, 0],
        [1, 5, 6, 7, 0],
        [5, 0, 1, 2, 4],
        [8, 3, 6, 5, 0],
        [5, 7, 6, 0, 0],
        [6, 8, 0, 0, 0]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9801,  0.9900,  1.0000,  0

  warn(


learning
400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 0, 2, 8, 1],
        [8, 5, 0, 4, 3],
        [3, 2, 0, 8, 1],
        [8, 4, 7, 2, 0],
        [7, 5, 0, 3, 6],
        [8, 0, 2, 8, 1],
        [6, 5, 0, 8, 1],
        [0, 0, 2, 8, 1]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0

  warn(


learning
500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 6, 7, 5, 4],
        [3, 0, 3, 2, 5],
        [7, 0, 5, 1, 8],
        [1, 0, 4, 8, 6],
        [8, 5, 6, 4, 7],
        [7, 6, 5, 2, 1],
        [6, 0, 0, 2, 5],
        [6, 8, 3, 5, 7]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0

  warn(


Started recording episode 300 to ./videos/variable_turn_tictactoe_muzero/3/episode_000300.mp4
learning
600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 4, 3, 0, 2],
        [0, 1, 7, 8, 4],
        [0, 0, 6, 0, 8],
        [7, 8, 0, 0, 8],
        [2, 4, 5, 0, 8],
        [7, 1, 6, 2, 0],
        [4, 0, 1, 2, 3],
        [1, 3, 4, 7, 2]])
target value tensor([[ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        

  warn(


learning
700
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 1, 5, 0, 2],
        [7, 0, 3, 0, 2],
        [8, 6, 2, 0, 4],
        [4, 3, 0, 0, 2],
        [7, 0, 0, 0, 2],
        [3, 1, 5, 0, 8],
        [3, 5, 2, 6, 4],
        [8, 7, 3, 4, 2]])
target value tensor([[-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9510,  0.9606,  0.9703, -0

  warn(


learning
800
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 8, 7, 3, 1],
        [1, 5, 6, 7, 0],
        [6, 3, 8, 5, 1],
        [2, 3, 7, 4, 8],
        [6, 7, 0, 2, 1],
        [5, 1, 4, 8, 3],
        [1, 3, 0, 4, 5],
        [4, 3, 8, 0, 6]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9900,  1.0000,  0.0000,  0

  warn(


learning
900
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 3, 4, 7, 1],
        [7, 2, 8, 0, 0],
        [4, 1, 0, 7, 0],
        [6, 0, 0, 7, 0],
        [1, 2, 0, 7, 0],
        [4, 6, 7, 2, 3],
        [1, 5, 2, 0, 0],
        [8, 5, 2, 3, 4]])
target value tensor([[-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9801, -0.9900,  1.0000,  0

  warn(


Started recording episode 500 to ./videos/variable_turn_tictactoe_muzero/1/episode_000500.mp4
learning
1000
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 3, 2, 7, 8],
        [5, 2, 6, 8, 4],
        [1, 3, 6, 4, 8],
        [7, 2, 0, 1, 8],
        [3, 6, 1, 2, 5],
        [0, 7, 1, 8, 5],
        [2, 8, 0, 6, 7],
        [4, 0, 3, 5, 8]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
       

  from pkg_resources import resource_stream, resource_exists


learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
Hidden state shape: (8, 24, 3, 3)
Hidden state shape: (8, 24, 3, 3)
encoder input shape (8, 18, 3, 3)
Testing Player 0 vs Agent random
learning
Player 0 prediction: (tensor([0.0800, 0.0400, 0.1200, 0.0400, 0.2000, 0.0800, 0.2000, 0.0800, 0.1600]), tensor([0.0800, 0.0400, 0.1200, 0.0400, 0.2000, 0.0800, 0.2000, 0.0800, 0.1600]), 0.29303108614408047, tensor(4), {'network_policy': tensor([0.0547, 0.0664, 0.1289, 0.0913, 0.1934, 0.0884, 0.1953, 0.0918, 0.0898],
       dtype=torch.bfloat16), 'network_value': 0.419921875, 'search_policy': tensor([0.0800, 0.0400, 0.1200, 0.0400, 0.2000, 0.0800, 0.2000, 0.0800, 0.1600]), 'search_value': 0.29303108614408047, 'root_children_values': tensor([0.3164, 0.3672, 0.3429, 0.2910, 0.3885, 0.3496, 0.2969, 0.3273, 0.3147])})
action: 4
Player 0 prediction: (tensor([0.0800, 0.0800, 0.1200, 0.2400, 0.0000, 0.1200, 0.2000, 0.0400, 0.1200]), tensor([0.0800, 0.0800, 0.1200,

  warn(


learning
1100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 5, 4, 1, 6],
        [0, 2, 5, 0, 5],
        [6, 1, 5, 2, 7],
        [7, 0, 1, 5, 4],
        [6, 2, 0, 5, 5],
        [4, 0, 4, 5, 5],
        [8, 4, 3, 7, 0],
        [1, 3, 2, 4, 0]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  

  warn(


learning
1200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 6, 2, 3, 4],
        [0, 8, 1, 6, 7],
        [4, 1, 0, 3, 0],
        [4, 3, 1, 5, 0],
        [8, 4, 6, 2, 0],
        [6, 8, 3, 4, 2],
        [5, 4, 6, 3, 7],
        [5, 1, 4, 3, 6]])
target value tensor([[ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.0000,  0.0000,  0.0000,  

  warn(


learning
1300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 0, 1, 6, 1],
        [7, 8, 4, 5, 3],
        [6, 1, 0, 7, 2],
        [6, 0, 1, 6, 1],
        [5, 7, 6, 0, 2],
        [8, 2, 7, 0, 4],
        [8, 1, 4, 2, 0],
        [1, 4, 6, 0, 1]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9606,  0.9703, -0.9801, -

  warn(


learning
1400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 4, 7, 5, 2],
        [1, 4, 5, 0, 0],
        [7, 0, 1, 3, 5],
        [1, 0, 3, 8, 7],
        [7, 0, 0, 8, 7],
        [3, 7, 0, 8, 7],
        [6, 8, 1, 2, 0],
        [5, 4, 6, 2, 8]])
target value tensor([[ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  

  warn(


learning
1500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[8, 0, 7, 8, 2],
        [6, 3, 2, 4, 0],
        [2, 5, 0, 0, 2],
        [3, 7, 4, 6, 0],
        [6, 2, 5, 0, 2],
        [5, 3, 1, 0, 2],
        [5, 7, 6, 0, 2],
        [5, 6, 4, 8, 7]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
1600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[2, 0, 5, 6, 4],
        [4, 3, 1, 8, 5],
        [2, 0, 1, 8, 0],
        [7, 6, 0, 6, 4],
        [4, 2, 5, 8, 3],
        [1, 6, 0, 6, 4],
        [2, 6, 1, 8, 0],
        [2, 0, 5, 6, 4]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  

  warn(


learning
1700
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[2, 4, 6, 3, 8],
        [2, 6, 8, 0, 3],
        [7, 3, 8, 1, 0],
        [5, 0, 8, 2, 7],
        [4, 0, 3, 7, 6],
        [6, 5, 0, 8, 2],
        [6, 8, 0, 7, 6],
        [8, 6, 3, 0, 6]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  

  warn(


learning
1800
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[5, 0, 0, 3, 2],
        [2, 0, 0, 3, 2],
        [5, 2, 3, 6, 4],
        [3, 0, 0, 3, 2],
        [0, 2, 8, 1, 0],
        [8, 0, 0, 3, 2],
        [8, 4, 2, 0, 2],
        [7, 8, 1, 4, 0]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
1900
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[2, 0, 8, 4, 8],
        [8, 7, 6, 5, 4],
        [4, 1, 5, 0, 3],
        [2, 5, 1, 6, 0],
        [6, 5, 0, 4, 8],
        [5, 3, 0, 6, 2],
        [6, 3, 5, 7, 4],
        [1, 2, 5, 0, 8]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


Stopped recording episode 1000. Recorded 10 frames.
learning
2000
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 3, 0, 2, 7],
        [4, 0, 7, 2, 7],
        [1, 0, 3, 8, 5],
        [3, 0, 7, 2, 7],
        [6, 0, 7, 2, 7],
        [8, 6, 7, 5, 0],
        [8, 0, 7, 2, 7],
        [5, 3, 2, 1, 6]])
target value tensor([[ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9

  from pkg_resources import resource_stream, resource_exists


learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
Hidden state shape: (8, 24, 3, 3)
Hidden state shape: (8, 24, 3, 3)
encoder input shape (8, 18, 3, 3)
Testing Player 0 vs Agent random
Player 0 prediction: (tensor([0.0400, 0.0400, 0.0800, 0.0800, 0.2000, 0.0800, 0.2800, 0.1200, 0.0800]), tensor([0.0400, 0.0400, 0.0800, 0.0800, 0.2000, 0.0800, 0.2800, 0.1200, 0.0800]), 0.29088922119140614, tensor(6), {'network_policy': tensor([0.0586, 0.0586, 0.0957, 0.0884, 0.2197, 0.0928, 0.2002, 0.0801, 0.1050],
       dtype=torch.bfloat16), 'network_value': 0.3359375, 'search_policy': tensor([0.0400, 0.0400, 0.0800, 0.0800, 0.2000, 0.0800, 0.2800, 0.1200, 0.0800]), 'search_value': 0.29088922119140614, 'root_children_values': tensor([0.3164, 0.2930, 0.3789, 0.3486, 0.3885, 0.3730, 0.2643, 0.3023, 0.3945])})
action: 6
Player 0 prediction: (tensor([0.0400, 0.0400, 0.2000, 0.0800, 0.3200, 0.0800, 0.0000, 0.0800, 0.1600]), tensor([0.0400, 0.0400, 0.2000, 0

  warn(


learning
2100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 1, 0, 8, 2],
        [1, 6, 5, 2, 0],
        [7, 5, 6, 8, 0],
        [6, 8, 7, 4, 5],
        [4, 6, 8, 0, 2],
        [8, 6, 0, 7, 2],
        [8, 6, 7, 2, 0],
        [5, 0, 1, 7, 2]])
target value tensor([[-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -

  warn(


Started recording episode 1100 to ./videos/variable_turn_tictactoe_muzero/1/episode_001100.mp4
learning
2200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[8, 6, 1, 7, 5],
        [7, 5, 0, 4, 1],
        [0, 6, 3, 7, 2],
        [8, 0, 1, 4, 1],
        [8, 6, 1, 3, 7],
        [1, 4, 0, 4, 1],
        [4, 2, 0, 4, 1],
        [8, 5, 0, 2, 3]])
target value tensor([[ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
      

  warn(


learning
2300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 4, 0, 3, 6],
        [3, 7, 4, 8, 1],
        [7, 3, 0, 6, 0],
        [1, 8, 4, 2, 0],
        [2, 8, 7, 6, 0],
        [6, 1, 5, 3, 0],
        [7, 4, 6, 8, 2],
        [7, 0, 6, 6, 0]])
target value tensor([[ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  

  warn(


Started recording episode 1200 to ./videos/variable_turn_tictactoe_muzero/0/episode_001200.mp4
Started recording episode 1200 to ./videos/variable_turn_tictactoe_muzero/1/episode_001200.mp4
Stopped recording episode 1200. Recorded 7 frames.
Stopped recording episode 1200. Recorded 9 frames.
Started recording episode 1200 to ./videos/variable_turn_tictactoe_muzero/3/episode_001200.mp4
learning
2400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[5, 0, 0, 6, 0],
        [0, 0, 7, 6, 0],
        [1, 3, 4, 0, 0],
        [2, 0, 7, 6, 0],
        [1, 2, 4, 3, 0],
        [1, 3, 5, 2, 0],
        [0, 6, 4, 2, 5],
        [2, 5, 7, 4, 0]])
target value tensor([[-0.9900,  1.0000,  0.0000

  warn(


learning
2500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 3, 4, 0, 5],
        [4, 5, 8, 3, 6],
        [6, 2, 8, 4, 3],
        [5, 2, 0, 6, 5],
        [3, 8, 5, 7, 0],
        [2, 0, 0, 6, 5],
        [2, 5, 3, 4, 8],
        [6, 0, 8, 3, 7]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606,  0.9703, -0.9801, -

  warn(


learning
2600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[0, 1, 3, 5, 6],
        [6, 0, 0, 7, 0],
        [7, 6, 2, 4, 0],
        [5, 2, 6, 8, 4],
        [8, 2, 6, 5, 1],
        [7, 0, 6, 7, 0],
        [8, 3, 4, 0, 6],
        [0, 0, 6, 7, 0]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


learning
2700
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 3, 4, 0, 2],
        [1, 0, 2, 8, 7],
        [7, 4, 0, 8, 7],
        [6, 0, 2, 8, 7],
        [4, 1, 8, 0, 6],
        [1, 0, 2, 8, 7],
        [4, 0, 2, 8, 7],
        [5, 3, 0, 4, 6]])
target value tensor([[-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  

  warn(


Started recording episode 1400 to ./videos/variable_turn_tictactoe_muzero/2/episode_001400.mp4
learning
2800
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 3, 5, 4, 7],
        [7, 0, 8, 6, 5],
        [1, 6, 3, 2, 0],
        [8, 0, 8, 6, 5],
        [8, 0, 4, 6, 3],
        [2, 0, 8, 3, 1],
        [5, 0, 7, 6, 4],
        [2, 6, 0, 0, 5]])
target value tensor([[ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
      

  warn(


learning
2900
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[0, 7, 8, 1, 6],
        [4, 0, 7, 1, 2],
        [2, 7, 4, 8, 3],
        [4, 7, 3, 5, 0],
        [0, 7, 1, 6, 2],
        [5, 3, 8, 0, 6],
        [3, 8, 2, 4, 0],
        [1, 5, 3, 0, 2]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  

  warn(


learning
3000
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 8, 5, 7, 2],
        [0, 8, 5, 2, 3],
        [5, 1, 8, 7, 0],
        [7, 8, 4, 3, 1],
        [6, 1, 2, 7, 0],
        [4, 2, 0, 6, 0],
        [8, 6, 0, 6, 0],
        [6, 4, 0, 5, 8]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  

  from pkg_resources import resource_stream, resource_exists


learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
Hidden state shape: (8, 24, 3, 3)
Hidden state shape: (8, 24, 3, 3)
encoder input shape (8, 18, 3, 3)
learning
Testing Player 0 vs Agent random
Player 0 prediction: (tensor([0.0400, 0.0400, 0.2800, 0.0400, 0.2000, 0.0400, 0.1200, 0.0400, 0.2000]), tensor([0.0400, 0.0400, 0.2800, 0.0400, 0.2000, 0.0400, 0.1200, 0.0400, 0.2000]), 0.3667195898789626, tensor(2), {'network_policy': tensor([0.0747, 0.0708, 0.1118, 0.0830, 0.2227, 0.0933, 0.1543, 0.0791, 0.1099],
       dtype=torch.bfloat16), 'network_value': 0.3359375, 'search_policy': tensor([0.0400, 0.0400, 0.2800, 0.0400, 0.2000, 0.0400, 0.1200, 0.0400, 0.2000]), 'search_value': 0.3667195898789626, 'root_children_values': tensor([0.2754, 0.2520, 0.4788, 0.1357, 0.4679, 0.2910, 0.3641, 0.2832, 0.4083])})
action: 2
Player 0 prediction: (tensor([0.0800, 0.0400, 0.0000, 0.0800, 0.2800, 0.0800, 0.0800, 0.0400, 0.3200]), tensor([0.0800, 0.0400, 0.

  warn(


learning
3100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 3, 5, 2, 7],
        [3, 6, 1, 0, 3],
        [6, 0, 1, 8, 3],
        [8, 4, 1, 5, 2],
        [4, 0, 1, 8, 3],
        [6, 4, 8, 3, 5],
        [2, 0, 1, 8, 3],
        [0, 3, 4, 6, 2]])
target value tensor([[-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [ 1.0000,  0.0000,  0.0000,  

  warn(


learning
3200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 8, 7, 0, 5],
        [1, 5, 7, 8, 6],
        [6, 2, 0, 7, 5],
        [5, 4, 3, 8, 6],
        [3, 2, 0, 0, 5],
        [4, 6, 0, 1, 5],
        [2, 0, 6, 1, 5],
        [4, 8, 6, 0, 2]])
target value tensor([[-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  

  warn(


Stopped recording episode 1700. Recorded 6 frames.
learning
3300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 2, 7, 5, 0],
        [8, 2, 0, 6, 8],
        [2, 3, 5, 1, 7],
        [3, 0, 5, 6, 8],
        [1, 8, 2, 7, 6],
        [1, 8, 0, 6, 0],
        [6, 7, 0, 8, 0],
        [2, 1, 3, 5, 0]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.00

  warn(


learning
3400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[8, 3, 4, 6, 0],
        [7, 0, 4, 0, 7],
        [1, 0, 8, 0, 7],
        [3, 0, 6, 7, 8],
        [4, 6, 5, 8, 1],
        [4, 1, 8, 0, 2],
        [4, 7, 1, 5, 0],
        [3, 4, 6, 2, 1]])
target value tensor([[-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9703, -0.9801, -0.9900,  

  warn(


learning
3500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 5, 2, 0, 0],
        [2, 0, 8, 5, 0],
        [0, 5, 7, 2, 0],
        [4, 1, 5, 3, 6],
        [1, 4, 5, 2, 8],
        [6, 8, 3, 7, 0],
        [4, 0, 8, 2, 0],
        [2, 4, 0, 6, 3]])
target value tensor([[-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  

  warn(


learning
3600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 8, 5, 4, 1],
        [4, 3, 8, 6, 1],
        [6, 7, 3, 0, 5],
        [0, 4, 2, 6, 1],
        [4, 3, 8, 7, 6],
        [1, 2, 0, 6, 6],
        [5, 2, 1, 7, 0],
        [0, 1, 2, 0, 6]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  

  warn(


learning
3700
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[5, 1, 6, 0, 4],
        [7, 0, 2, 1, 4],
        [4, 1, 3, 7, 6],
        [0, 0, 4, 3, 4],
        [4, 6, 2, 8, 0],
        [4, 0, 8, 5, 0],
        [7, 2, 6, 8, 0],
        [3, 0, 8, 6, 2]])
target value tensor([[-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


Stopped recording episode 2000. Recorded 7 frames.
learning
3800
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 6, 7, 0, 4],
        [8, 3, 2, 1, 0],
        [2, 8, 5, 0, 6],
        [5, 8, 6, 7, 1],
        [1, 2, 8, 3, 0],
        [8, 0, 1, 7, 6],
        [2, 0, 1, 3, 5],
        [0, 3, 0, 7, 6]])
target value tensor([[ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.00

  warn(


learning
3900
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 5, 0, 4, 2],
        [2, 0, 6, 0, 8],
        [6, 4, 3, 0, 8],
        [2, 8, 0, 7, 8],
        [6, 8, 7, 4, 0],
        [5, 4, 0, 2, 8],
        [5, 6, 0, 0, 8],
        [3, 8, 2, 6, 0]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
4000
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[5, 1, 8, 3, 6],
        [0, 0, 3, 1, 2],
        [6, 0, 0, 1, 2],
        [2, 4, 8, 7, 1],
        [4, 3, 7, 6, 2],
        [8, 0, 4, 2, 0],
        [3, 0, 3, 1, 2],
        [3, 0, 5, 1, 4]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  

  from pkg_resources import resource_stream, resource_exists


learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
Hidden state shape: (8, 24, 3, 3)
Hidden state shape: (8, 24, 3, 3)
encoder input shape (8, 18, 3, 3)
learning
Testing Player 0 vs Agent random
Player 0 prediction: (tensor([0.0800, 0.0400, 0.2400, 0.0800, 0.2000, 0.0800, 0.1200, 0.0400, 0.1200]), tensor([0.0800, 0.0400, 0.2400, 0.0800, 0.2000, 0.0800, 0.1200, 0.0400, 0.1200]), 0.3905782451641376, tensor(2), {'network_policy': tensor([0.0752, 0.0752, 0.1133, 0.0938, 0.2070, 0.0840, 0.1602, 0.0801, 0.1108],
       dtype=torch.bfloat16), 'network_value': 0.43359375, 'search_policy': tensor([0.0800, 0.0400, 0.2400, 0.0800, 0.2000, 0.0800, 0.1200, 0.0400, 0.1200]), 'search_value': 0.3905782451641376, 'root_children_values': tensor([0.4570, 0.2734, 0.4935, 0.4258, 0.5247, 0.4219, 0.4566, 0.3008, 0.4395])})
action: 2
Player 0 prediction: (tensor([0.0800, 0.0400, 0.0000, 0.0400, 0.3600, 0.0400, 0.0800, 0.1600, 0.2000]), tensor([0.0800, 0.0400, 0.0000, 0.

  warn(


learning
4100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 8, 0, 4, 1],
        [0, 6, 0, 8, 0],
        [7, 3, 2, 4, 5],
        [4, 8, 2, 3, 0],
        [2, 0, 5, 8, 0],
        [1, 0, 0, 8, 0],
        [4, 8, 2, 7, 1],
        [6, 7, 0, 8, 3]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


learning
4200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 6, 5, 8, 0],
        [8, 2, 1, 5, 3],
        [4, 3, 7, 8, 6],
        [0, 6, 3, 0, 8],
        [1, 3, 0, 6, 8],
        [0, 0, 0, 6, 8],
        [4, 0, 2, 0, 8],
        [6, 2, 8, 0, 7]])
target value tensor([[ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
4300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 0, 2, 5, 0],
        [5, 6, 7, 0, 1],
        [3, 0, 5, 2, 3],
        [2, 6, 5, 0, 4],
        [2, 4, 0, 2, 3],
        [0, 4, 0, 2, 3],
        [4, 0, 5, 2, 3],
        [7, 1, 6, 2, 0]])
target value tensor([[-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  

  warn(


learning
4400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 3, 0, 1, 6],
        [2, 0, 5, 1, 0],
        [4, 6, 1, 0, 3],
        [0, 4, 2, 6, 8],
        [2, 4, 0, 7, 0],
        [2, 6, 3, 5, 0],
        [0, 0, 3, 1, 6],
        [2, 0, 3, 1, 6]])
target value tensor([[-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  

  warn(


learning
4500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 2, 3, 7, 5],
        [0, 5, 3, 2, 8],
        [2, 3, 8, 0, 1],
        [3, 4, 6, 2, 8],
        [8, 5, 0, 7, 1],
        [1, 6, 4, 8, 0],
        [2, 4, 0, 6, 8],
        [2, 3, 6, 8, 0]])
target value tensor([[ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -

  warn(


learning
4600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 4, 7, 0, 5],
        [2, 4, 0, 2, 5],
        [0, 1, 8, 2, 3],
        [5, 7, 0, 2, 5],
        [2, 4, 7, 5, 1],
        [2, 5, 6, 8, 1],
        [0, 1, 6, 0, 5],
        [5, 6, 7, 8, 2]])
target value tensor([[-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
4700
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 0, 7, 2, 7],
        [2, 7, 4, 1, 6],
        [7, 0, 7, 2, 7],
        [1, 0, 7, 2, 7],
        [4, 0, 7, 2, 7],
        [7, 5, 0, 2, 6],
        [8, 3, 5, 6, 7],
        [6, 1, 4, 3, 0]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


learning
4800
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 4, 2, 1, 5],
        [7, 5, 2, 0, 1],
        [6, 2, 8, 7, 0],
        [3, 0, 6, 2, 0],
        [0, 1, 0, 2, 1],
        [7, 3, 6, 4, 8],
        [2, 0, 8, 0, 1],
        [6, 2, 0, 2, 1]])
target value tensor([[ 0.9606,  0.9703, -0.9801, -0.9900,  1.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415, -0.9510,  0.9606,  0.9703, -0.9801, -0.9900],
        [-0.9801,  0.9900,  1.0000,  

  warn(


learning
4900
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[6, 0, 2, 5, 3],
        [3, 4, 7, 8, 2],
        [8, 0, 7, 0, 7],
        [1, 0, 8, 0, 7],
        [4, 8, 5, 3, 1],
        [5, 3, 0, 0, 7],
        [1, 2, 8, 7, 0],
        [4, 1, 0, 6, 8]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9415,  0.9510,  0.9606, -0.9703, -0.9801,  0.9900],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -0.9801, -0.9900,  1.0000],
        [ 0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  

  warn(


Started recording episode 2800 to ./videos/variable_turn_tictactoe_muzero/0/episode_002800.mp4
learning
5000
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 0, 0, 1, 5],
        [3, 6, 0, 1, 2],
        [3, 1, 0, 1, 5],
        [3, 6, 8, 0, 0],
        [8, 0, 6, 2, 0],
        [1, 0, 1, 1, 5],
        [4, 0, 6, 0, 5],
        [6, 8, 2, 5, 3]])
target value tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
      

  from pkg_resources import resource_stream, resource_exists


learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
learning
Hidden state shape: (8, 24, 3, 3)
Hidden state shape: (8, 24, 3, 3)
encoder input shape (8, 18, 3, 3)
Testing Player 0 vs Agent random
Player 0 prediction: (tensor([0.1600, 0.0400, 0.2000, 0.0400, 0.3200, 0.0400, 0.1200, 0.0400, 0.0400]), tensor([0.1600, 0.0400, 0.2000, 0.0400, 0.3200, 0.0400, 0.1200, 0.0400, 0.0400]), 0.5434104383427546, tensor(4), {'network_policy': tensor([0.0625, 0.0615, 0.1128, 0.0742, 0.2715, 0.0850, 0.1494, 0.0786, 0.1045],
       dtype=torch.bfloat16), 'network_value': 0.369140625, 'search_policy': tensor([0.1600, 0.0400, 0.2000, 0.0400, 0.3200, 0.0400, 0.1200, 0.0400, 0.0400]), 'search_value': 0.5434104383427546, 'root_children_values': tensor([0.6145, 0.2021, 0.6006, 0.2354, 0.6696, 0.2969, 0.5265, 0.0967, 0.3398])})
action: 4
learning
Player 0 prediction: (tensor([0.1200, 0.0400, 0.2000, 0.0800, 0.0000, 0.0800, 0.1600, 0.0800, 0.2400]), tensor

  warn(


learning
5100
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[1, 2, 3, 5, 0],
        [8, 1, 3, 4, 7],
        [6, 7, 2, 4, 5],
        [8, 6, 7, 0, 5],
        [2, 0, 1, 0, 0],
        [1, 0, 2, 1, 0],
        [4, 2, 3, 6, 5],
        [1, 0, 2, 1, 0]])
target value tensor([[-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [ 0.9227,  0.9321, -0.9415, -0.9510,  0.9606,  0.9703],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -

  warn(


learning
5200
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[4, 3, 0, 6, 1],
        [7, 8, 0, 6, 8],
        [5, 4, 0, 7, 0],
        [6, 1, 7, 2, 4],
        [0, 0, 7, 6, 8],
        [6, 8, 4, 1, 3],
        [3, 0, 2, 6, 4],
        [7, 0, 1, 4, 6]])
target value tensor([[-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9321, -0.9415,  0.9510,  0.9606, -0.9703, -0.9801],
        [ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [ 0.9606, -0.9703, -0.9801,  

  warn(


average score: 0.98
Test score {'score': 0.98, 'max_score': 1, 'min_score': -1}
Started recording episode 3000 to ./videos/variable_turn_tictactoe_muzero/3/episode_003000.mp4
learning
Stopped recording episode 3000. Recorded 7 frames.
5300
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[2, 0, 5, 7, 6],
        [0, 8, 6, 0, 6],
        [6, 5, 2, 8, 0],
        [5, 7, 8, 4, 1],
        [8, 7, 6, 0, 5],
        [6, 7, 3, 5, 0],
        [0, 8, 0, 7, 6],
        [5, 7, 1, 2, 0]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
   

  warn(


learning
5400
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 0, 4, 0, 0],
        [5, 8, 4, 0, 0],
        [4, 2, 7, 0, 3],
        [6, 1, 3, 0, 0],
        [3, 5, 4, 6, 0],
        [1, 0, 2, 0, 0],
        [6, 4, 7, 2, 8],
        [4, 0, 1, 7, 0]])
target value tensor([[ 1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [-0.9703, -0.9801,  0.9900,  1.0000,  0.0000,  0.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -

  warn(


learning
5500
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[7, 6, 3, 8, 0],
        [4, 8, 3, 1, 7],
        [8, 1, 5, 0, 3],
        [7, 6, 3, 0, 4],
        [5, 0, 4, 0, 3],
        [4, 7, 8, 3, 6],
        [5, 4, 8, 3, 6],
        [4, 2, 8, 1, 7]])
target value tensor([[ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9801, -0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9321, -0.9415, -0.9510,  0.9606,  0.9703, -0.9801],
        [-0.9801,  0.9900,  1.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [-0.9510,  0.9606,  0.9703, -

  warn(


learning
5600
actions shape torch.Size([8, 5])
target value shape torch.Size([8, 6])
predicted values shape torch.Size([8, 6, 1])
target rewards shape torch.Size([8, 6])
predicted rewards shape torch.Size([8, 6, 1])
target to plays shape torch.Size([8, 6, 2])
predicted to_plays shape torch.Size([8, 6, 2])
masks shape torch.Size([8, 6]) torch.Size([8, 6])
actions tensor([[3, 2, 6, 4, 0],
        [2, 3, 7, 5, 1],
        [6, 8, 3, 0, 1],
        [4, 0, 3, 5, 1],
        [5, 4, 0, 0, 1],
        [4, 0, 7, 6, 2],
        [8, 6, 0, 7, 2],
        [3, 0, 2, 0, 1]])
target value tensor([[ 0.9703, -0.9801, -0.9900,  1.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -0.9801,  0.9900,  1.0000],
        [-0.9900,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.9606, -0.9703, -0.9801,  0.9900,  1.0000,  0.0000],
        [ 0.9510,  0.9606, -0.9703, -