In [1]:
# Change directory to the root of the project
import os 
os.chdir('..')

import wandb
import torch
from applications.tic_tac_toe.train import train

In [2]:
sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'loss',
        'goal': 'minimize'
    },
    'parameters': {
        # Optimizer parameters
        'learning_rate': {
            'distribution': 'log_uniform_values',
            'min': 0.00001,
            'max': 0.1
        },
        'weight_decay': {
            'distribution': 'log_uniform_values',
            'min': 0.00001,
            'max': 0.1
        },

        # Model parameters
        'attention_layers': {
            'values': [1, 2, 3, 4]
        },
        'transformer_size': {
            'values': ['tiny', 'small', 'medium', 'large', 'xlarge']
        },
        'dropout': {
            'values': [0.0, 0.1, 0.01, 0.001, 0.0001]
        },
        'norm_first': {
            'values': [True, False]
        },
        'activation': {
            'values': ['relu', 'gelu']
        },

        # Trainer parameters
        'replay_buffer_max_size': {
            'value': 10000
        },
        'value_softness': {
            'distribution': 'uniform',
            'min': 0.0,
            'max': 1.0
        },
        'mask_illegal_moves': {
            'values': [True, False]
        },
        'mask_value': {
            'values': [-20.0, -15.0, -10.0, -5.0]
        },

        # Training parameters
        'num_iterations': {
            'value': 100
        },
        'games_per_iteration': {
            'value': 10
        },
        'batch_size': {
            'values': [128, 256, 512, 1024]
        },
        'steps_per_iteration': {
            'value': 100
        },
        'num_simulations': {
            'values': [100]
        },
        'checkpoint_frequency': {
            'value': 20
        }
    }
}

# Transformer size mapping
transformer_size_mapping = {
    'tiny': { 'embed_dim': 4, 'num_heads': 1, 'feedforward_dim': 16 },
    'small': { 'embed_dim': 8, 'num_heads': 2, 'feedforward_dim': 32 },
    'medium': { 'embed_dim': 16, 'num_heads': 4, 'feedforward_dim': 64 },
    'large': { 'embed_dim': 32, 'num_heads': 8, 'feedforward_dim': 128 },
    'xlarge': { 'embed_dim': 64, 'num_heads': 16, 'feedforward_dim': 256 }
}


In [3]:
# Some default parameters

from core.implementations.AlphaZero import AlphaZeroConfig

# AlphaZero parameters
alphazero_config = AlphaZeroConfig(
    exploration_constant=1.0,
    dirichlet_alpha=0.3,
    dirichlet_epsilon=0.25,
    temperature=1.0
)

# AlphaZero evaluation parameters
alphazero_eval_config = AlphaZeroConfig(
    exploration_constant=1.0,
    dirichlet_alpha=0.0,
    dirichlet_epsilon=0.0,
    temperature=0.0
)

In [4]:
from applications.tic_tac_toe.transformer_model import TicTacToeTransformerInterface

def sweep_agent():
    with wandb.init(project='AlphaZero-TicTacToe') as run:
        config = {
            'model_type': 'transformer',
            'model_params': {
                'attention_layers': run.config.attention_layers,
                **transformer_size_mapping[run.config.transformer_size]
            },
            'device': 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu',
            'tree_search_params': alphazero_config,
            'tree_search_eval_params': alphazero_eval_config,
            'trainer_params': {
                'replay_buffer_max_size': run.config.replay_buffer_max_size,
                'value_softness': run.config.value_softness,
                'mask_illegal_moves': run.config.mask_illegal_moves,
                'mask_value': run.config.mask_value
            },
            'optimizer_params': {
                'lr': run.config.learning_rate,
                'betas': (0.9, 0.999),
                'eps': 1e-8,
                'weight_decay': run.config.weight_decay,
                'amsgrad': False
            },
            'training_params': {
                'num_iterations': run.config.num_iterations,
                'games_per_iteration': run.config.games_per_iteration,
                'batch_size': run.config.batch_size,
                'steps_per_iteration': run.config.steps_per_iteration,
                'num_simulations': run.config.num_simulations,
                'checkpoint_frequency': run.config.checkpoint_frequency
            }
        }

        model = TicTacToeTransformerInterface(
            device=config['device'],
            **config['model_params']
        )

        # Use training script
        train(
            config=config,
            model=model,
            use_wandb=True,
            wandb_watch_params={
                'watch': True,
                'log': 'all',
                'log_freq': 100,
                'log_graph': True
            },
            wandb_run=run
        )

In [5]:
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project='AlphaZero-TicTacToe',
    entity='eigenway',
)


wandb.agent(
    sweep_id,
    function=sweep_agent,
    count=50
)

Create sweep with ID: z91b8lti
Sweep URL: https://wandb.ai/eigenway/AlphaZero-TicTacToe/sweeps/z91b8lti


[34m[1mwandb[0m: Agent Starting Run: 1tvknncm with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0007647649282292424
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.27162888872040936
[34m[1mwandb[0m: 	weight_decay: 0.005288860670490674
[34m[1mwandb[0m: Currently logged in as: [33meohjelle[0m ([33meigenway[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 98 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.6434
Average policy_loss: 1.8133
Average value_loss: 0.8301
Replay buffer size: 98
Time taken: 6.7s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.8988
Average policy_loss: 1.1526
Average value_loss: 0.7461
Replay buffer size: 188
Time taken: 9.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4862
Average policy_loss: 0.8457
Average value_loss: 0.6405
Replay buffer size: 282
Time taken: 8.4s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.4541
Average policy_loss: 0.9220
Average value_loss: 0.5321
Replay buffer size: 376
Time taken: 12.2

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
iteration_time,▁▁▂▆▂▆▂▂▂▂▂▂▂▇▂▂▇▂▂▃▇▂▃▃▂▂▃█▃▃▃█▃▃▃▃▃▃▃▃
loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▇▆█▅▄▄▄▂▃▂▁▄▃▅▃▂▃▄▂▄▅▄▂▄▆▄▅▆▆▅▅█▄▄▅▄▅▄▄
policy_loss,█▃▂▁▁▁▁▂▂▂▂▂▃▃▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
value_loss,█▅▃▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.3
buffer_size,8519.0
iteration_time,50.30384
loss,1.18897
num_games,10.0
num_positions,88.0
policy_loss,0.92552
total_time_hours,0.59856
value_loss,0.26345


[34m[1mwandb[0m: Agent Starting Run: h47yk2qs with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.015988642514553713
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.3581637718293932
[34m[1mwandb[0m: 	weight_decay: 0.00015937504990452334


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 77 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.0407
Average policy_loss: 2.5109
Average value_loss: 0.5299
Replay buffer size: 77
Time taken: 5.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 71 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.4659
Average policy_loss: 0.9214
Average value_loss: 0.5445
Replay buffer size: 148
Time taken: 5.9s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 77 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.3646
Average policy_loss: 0.8745
Average value_loss: 0.4901
Replay buffer size: 225
Time taken: 5.3s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.2981
Average policy_loss: 0.9000
Average value_loss: 0.3981
Replay buffer size: 300
Time taken: 7.5s

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇██
iteration_time,▁▁▁▁▁▁▁▁▆▂▂▂▇▂▂▂▇▂▂▇▂▂▂▂▂▂▂▂▂▇▃▂▂▂▇▂▂███
loss,█▇▅▅▆▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▁▅▄▃▇▅▃▇▅▆▁▇▇▆▇▇▆▆▆██▅█▆▆▇▆█▇▅▇▆▇▇▇▆▅▅▅
policy_loss,█▄▄▅▅▅▅▄▅▄▃▃▃▃▃▃▃▃▂▂▃▂▂▂▂▂▁▁▁▁▂▁▁▁▁▁▁▂▁▁
value_loss,█▅▇▇▅▄▃▃▃▃▃▃▃▃▃▂▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,9189.0
iteration_time,44.42413
loss,0.83218
num_games,10.0
num_positions,92.0
policy_loss,0.67355
total_time_hours,0.49617
value_loss,0.15863


[34m[1mwandb[0m: Agent Starting Run: gmj6nbnq with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.000696525560025685
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.3835268511490555
[34m[1mwandb[0m: 	weight_decay: 0.0001228261684577894


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 74 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.0069
Average policy_loss: 3.0167
Average value_loss: 0.9902
Replay buffer size: 74
Time taken: 13.2s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 93 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.5127
Average policy_loss: 1.8395
Average value_loss: 0.6732
Replay buffer size: 167
Time taken: 14.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.0815
Average policy_loss: 1.5837
Average value_loss: 0.4978
Replay buffer size: 253
Time taken: 14.2s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.8694
Average policy_loss: 1.4378
Average value_loss: 0.4316
Replay buffer size: 337
Time taken: 1

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇██
iteration_time,▁▁▁▆▁▁▁▁▇▁▇▂▇▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▇▂▂▁█▁█▁▂▁██
loss,█▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▅▆▂▃▅▂▅▅▇▃▅▂▆▄▆▂▃▄▆█▆▂▂▃▄▆▃▅▄▆▁▅▁▃▄▅█▆▆
policy_loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,██▇▆▆▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,8890.0
iteration_time,76.78697
loss,1.09626
num_games,10.0
num_positions,92.0
policy_loss,0.91862
total_time_hours,0.89752
value_loss,0.17764


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zno139it with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.02312932352647822
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.12951020836647031
[34m[1mwandb[0m: 	weight_decay: 0.0005114952601260686


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.4415
Average policy_loss: 1.2819
Average value_loss: 0.1596
Replay buffer size: 80
Time taken: 6.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.4980
Average policy_loss: 0.4389
Average value_loss: 0.0590
Replay buffer size: 160
Time taken: 7.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.3726
Average policy_loss: 0.3358
Average value_loss: 0.0368
Replay buffer size: 238
Time taken: 7.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.4152
Average policy_loss: 0.3569
Average value_loss: 0.0583
Replay buffer size: 322
Time taken: 8.8s

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
iteration_time,▁▁▁▁▅▁▆▂▂▁▂▂▂▂▂▇▂▂▂▂▂▂▂▂▂▂▇▂▂█▂█▇▂█▂▃▂▃▃
loss,▂▁▁▄▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▆▆▆▇▇▇▇▇█▇▇▇▇███
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▅▂██▅▁▂▃▃▂▃▃▂▂▇▆▇▄▆▄▄▄▃▄▃▅▃▄▅▂▅▅▇▄▅▃▃▄▃
policy_loss,█▁▁▁▂▂▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
value_loss,▂▁▅▇▅▆▆▅▅▅▆▅▆▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█▇▅▅▅▆▆▆▆▆

0,1
best_win_rate,1.25
buffer_size,8061.0
iteration_time,53.33186
loss,0.96067
num_games,10.0
num_positions,73.0
policy_loss,0.75696
total_time_hours,0.49783
value_loss,0.20372


[34m[1mwandb[0m: Agent Starting Run: axgf7yh4 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0030123667783113514
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.6408811770146178
[34m[1mwandb[0m: 	weight_decay: 0.00022273427496412733


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 89 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.1487
Average policy_loss: 1.5257
Average value_loss: 0.6230
Replay buffer size: 89
Time taken: 10.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.6744
Average policy_loss: 1.1825
Average value_loss: 0.4919
Replay buffer size: 168
Time taken: 13.2s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4395
Average policy_loss: 1.1220
Average value_loss: 0.3175
Replay buffer size: 259
Time taken: 15.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.3731
Average policy_loss: 1.1266
Average value_loss: 0.2465
Replay buffer size: 347
Time taken: 1

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
iteration_time,▂▂█▂▇▁▁▆▁▁▁▁▁▁▁▂▂▂▂▂▂▇▂▇▂▂▂▂▂█▂▂█▂▂█▇▂▂█
loss,█▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▇▆▄▅▂▄▁▃▃▃▃▃▃▂▃▂▂▃▁▄▇▃▅▄▄▆▅▆▆▅▄▇▆█▄█▃▆▆▄
policy_loss,█▇█▇▆▆▆▅▅▅▅▄▄▄▃▂▂▂▂▂▂▁▁▁▁▁▁▂▂▁▂▁▁▁▂▁▁▁▁▁
value_loss,█▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂

0,1
best_win_rate,1.25
buffer_size,8279.0
iteration_time,55.43388
loss,1.17218
num_games,10.0
num_positions,82.0
policy_loss,0.97116
total_time_hours,0.61343
value_loss,0.20102


[34m[1mwandb[0m: Agent Starting Run: kh2a625t with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 1.6908936633823125e-05
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.664563195460991
[34m[1mwandb[0m: 	weight_decay: 4.528343525113816e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 93 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.4561
Average policy_loss: 3.3945
Average value_loss: 1.0615
Replay buffer size: 93
Time taken: 4.8s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 98 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.8106
Average policy_loss: 1.7705
Average value_loss: 1.0401
Replay buffer size: 191
Time taken: 6.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 100 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.6104
Average policy_loss: 1.5903
Average value_loss: 1.0201
Replay buffer size: 291
Time taken: 5.1s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.3972
Average policy_loss: 1.3655
Average value_loss: 1.0317
Replay buffer size: 387
Time taken: 5.0

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
iteration_time,▁▁▅▁▁▅▁▂▁▆▂▂▂█▂▂█▂▂▇▂▂▂█▂█▁▂▂▂▂█▂▂▂█▂█▂▁
loss,█▆▅▄▄▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅█▆█▇▇▆▇▇▅▇█▇▅███▇▇▄▆▅▆▅▅▇▇▇▆▄▂▇▇▃▁▃▇▅▄▄
policy_loss,█▄▂▂▁▁▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█████▇▆▆▆▆▅▅▅▅▅▅▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁

0,1
best_win_rate,0.85
buffer_size,9525.0
iteration_time,24.78189
loss,1.23062
num_games,10.0
num_positions,96.0
policy_loss,0.82865
total_time_hours,0.30604
value_loss,0.40197


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 27j6qc22 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.00022019063459171505
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.8407042315628009
[34m[1mwandb[0m: 	weight_decay: 0.002038351667423441


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 69 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.3786
Average policy_loss: 0.7106
Average value_loss: 0.6680
Replay buffer size: 69
Time taken: 5.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 74 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.0586
Average policy_loss: 1.0200
Average value_loss: 1.0386
Replay buffer size: 143
Time taken: 5.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.7712
Average policy_loss: 0.8054
Average value_loss: 0.9658
Replay buffer size: 219
Time taken: 5.1s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.1766
Average policy_loss: 0.7661
Average value_loss: 0.4104
Replay buffer size: 294
Time taken: 4.3s

0,1
buffer_size,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇█
iteration_time,▂▂▂▁▂▂▁▁▁▁▁▇▁▁▁▇▁▁▁▁▁▂▂▇▁▁▁▂▇▁▂▇▁█▁▁▁▂█▁
loss,█▇▄▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,██▄▄▆▃▂▃▃▃▄▂▂▃▃▅▄▄▂▃▃▃▅▁▆▃▄▄▆▄▂▂▄▃▆▅▅▃▅▁
policy_loss,█▆▅▅▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,6558.0
iteration_time,18.03187
loss,0.43243
num_games,10.0
num_positions,64.0
policy_loss,0.39159
total_time_hours,0.20416
value_loss,0.04084


[34m[1mwandb[0m: Agent Starting Run: jpt57a2u with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.026750570538997133
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: small
[34m[1mwandb[0m: 	value_softness: 0.8864439875622776
[34m[1mwandb[0m: 	weight_decay: 0.02283766580110489


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 1 summary:
Average loss: 0.8244
Average policy_loss: 0.6561
Average value_loss: 0.1684
Replay buffer size: 80
Time taken: 5.8s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.4510
Average policy_loss: 0.4410
Average value_loss: 0.0099
Replay buffer size: 158
Time taken: 8.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.6549
Average policy_loss: 0.5899
Average value_loss: 0.0651
Replay buffer size: 249
Time taken: 18.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 81 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.7230
Average policy_loss: 0.6243
Average value_loss: 0.0986
Replay buffer size: 330
Time taken: 11.

0,1
buffer_size,▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█
iteration_time,▅▁▁▅▁▅▂▂▂▂▂▂▇▂▂▂▂▂▇▂▂█▂▂▂▂▂▂▂▂▂▇█▂▂▂▃█▂▂
loss,▂▁▄▄▆▆▆▇█▇██████▇▇▇▇▇▇▇▇▆▇▇▇▇▇▇█▇▇▇▇▇▇▇▇
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▁▆▆▄▇▂▆▆▆▅▂▄▇▅█▅▆▁▇▆▅▆▇▆▆▂▄▄▂▆▄▆▅█▄▆▄█▆
policy_loss,▄▁▃▄▅▆▆▆▇▇██████████████████████████████
value_loss,▁▅█▇▇▆█▆▇██▇▇▇▇▇▆▆▆▇▅▆▆▅▆▆▅▅▅▆▆▅▆▅▅▅▅▅▅▅

0,1
best_win_rate,1.15
buffer_size,8851.0
iteration_time,68.52719
loss,0.96798
num_games,10.0
num_positions,93.0
policy_loss,0.8556
total_time_hours,0.79989
value_loss,0.11238


[34m[1mwandb[0m: Agent Starting Run: 7aae0iuu with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0040883839026580986
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.3471896281740898
[34m[1mwandb[0m: 	weight_decay: 0.007121786630147011


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.1128
Average policy_loss: 1.7193
Average value_loss: 0.3935
Replay buffer size: 85
Time taken: 7.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.1234
Average policy_loss: 0.7409
Average value_loss: 0.3824
Replay buffer size: 175
Time taken: 9.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.0090
Average policy_loss: 0.6561
Average value_loss: 0.3529
Replay buffer size: 261
Time taken: 8.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.9114
Average policy_loss: 0.6532
Average value_loss: 0.2582
Replay buffer size: 357
Time taken: 10.1

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇███
iteration_time,▁▁▁▆▁▅▂▁▂▂▂▂▂▂▆▂▂▂▇▂▂▆▂▂▇▂▂▇▂▂▂▇▂▂▂█▂▂█▂
loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▃▆▆▆▄▆▅▃▅▆▅▆▇▅█▃▅▅▅▂▂▅▅▃▆▆▄▃▄▆▃▇▆▆▆█▁▄▅
policy_loss,█▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
value_loss,█▃▂▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▂▁▂▁▁▁▂▂▂▂▁▁▂▁▂▁▁▁▁▂

0,1
best_win_rate,1.15
buffer_size,9179.0
iteration_time,57.31453
loss,0.8346
num_games,10.0
num_positions,94.0
policy_loss,0.71464
total_time_hours,0.59254
value_loss,0.11996


[34m[1mwandb[0m: Agent Starting Run: y4nwsoge with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.00012424623867201575
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.7367816761136133
[34m[1mwandb[0m: 	weight_decay: 0.0046441418616425265


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.3955
Average policy_loss: 1.5893
Average value_loss: 0.8062
Replay buffer size: 82
Time taken: 7.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 95 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.0477
Average policy_loss: 1.3468
Average value_loss: 0.7009
Replay buffer size: 177
Time taken: 9.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.0626
Average policy_loss: 1.4424
Average value_loss: 0.6202
Replay buffer size: 259
Time taken: 9.1s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.8423
Average policy_loss: 1.2884
Average value_loss: 0.5539
Replay buffer size: 356
Time taken: 9.4s

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
iteration_time,▁▁▆▁▆▂▂▂▂▇▂▂▇▂▂█▂▂▂▂█▂▂▂█▂█▂▂▂▂▂▃▃▂▂▇█▂▇
loss,█▆▅▄▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▇▄▅▅▅▃▆▃▅▆▄▅▇█▆▇▆▇▄▅▃▄▅▄▄▄▅▆▄▄▃▃▃▅▄▁▄▁▂▂
policy_loss,█▆▅▅▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂
value_loss,█▇▆▅▅▅▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,8361.0
iteration_time,42.31344
loss,1.07133
num_games,10.0
num_positions,78.0
policy_loss,0.89892
total_time_hours,0.57367
value_loss,0.17241


[34m[1mwandb[0m: Agent Starting Run: kketn5ba with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.010272375296051264
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.34495132894598934
[34m[1mwandb[0m: 	weight_decay: 0.08248752377861358


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.9463
Average policy_loss: 3.3279
Average value_loss: 0.6184
Replay buffer size: 92
Time taken: 9.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.2981
Average policy_loss: 1.0141
Average value_loss: 0.2840
Replay buffer size: 178
Time taken: 13.5s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.1375
Average policy_loss: 0.9540
Average value_loss: 0.1835
Replay buffer size: 260
Time taken: 14.6s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0854
Average policy_loss: 0.9266
Average value_loss: 0.1588
Replay buffer size: 352
Time taken: 17

0,1
buffer_size,▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
iteration_time,▁▂▂▂▇▁▆▁▁▁▇▂▁▇▇▂▂▂▇▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂█▂▂█▂
loss,▁▄▅▆▆▅▇▇▇▇█▇██▇▇▇▆▆▇▇▆▆▆▆▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▃▅▂▂▂▃▅▂▂▁▄▆▄▆▁▃▃▃▂▆▅▆▅▅▅▄█▇▃▇▅▇▅▄▆▇█▆▇
policy_loss,█▄▃▄▅▅▅▄▅▅▆▆▆▆▇▆▆▆▅▆▅▅▅▄▄▅▄▅▄▄▃▃▃▂▂▂▂▁▂▁
value_loss,▁▁▅▆▇▇████▇████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▆▅▅▅

0,1
best_win_rate,1.3
buffer_size,8677.0
iteration_time,55.44838
loss,1.11185
num_games,10.0
num_positions,94.0
policy_loss,0.89864
total_time_hours,0.66025
value_loss,0.21321


[34m[1mwandb[0m: Agent Starting Run: vb24b986 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.005962047799779216
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.6851834526319077
[34m[1mwandb[0m: 	weight_decay: 0.031782394509169996


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.4657
Average policy_loss: 2.0447
Average value_loss: 0.4210
Replay buffer size: 80
Time taken: 6.3s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.8174
Average policy_loss: 1.4596
Average value_loss: 0.3579
Replay buffer size: 160
Time taken: 9.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.7249
Average policy_loss: 1.4595
Average value_loss: 0.2654
Replay buffer size: 244
Time taken: 11.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.4822
Average policy_loss: 1.2656
Average value_loss: 0.2166
Replay buffer size: 332
Time taken: 12.

0,1
buffer_size,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
iteration_time,▁▂▂▂▇▂▂▂▇▁▁▁▁▁▂▁▇▂▂██▂▂█▂▂█▂▂▂▂▂█▂▁▂▂▂▂█
loss,█▇▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▃▆▆▅▇▆▅▅▄▇▆▅▆▁▆▁▄▆▂▅▃▃▅▆▄▅▆▆█▅▅▂▄▄▃▄▅█▄
policy_loss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▃▄▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▂▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,8781.0
iteration_time,39.16082
loss,1.05874
num_games,10.0
num_positions,89.0
policy_loss,0.93158
total_time_hours,0.47161
value_loss,0.12715


[34m[1mwandb[0m: Agent Starting Run: 644rmirp with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0945757029389616
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.4647788559355033
[34m[1mwandb[0m: 	weight_decay: 0.00010081781918285132


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.1848
Average policy_loss: 1.9661
Average value_loss: 0.2186
Replay buffer size: 96
Time taken: 9.2s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.0595
Average policy_loss: 0.8774
Average value_loss: 0.1822
Replay buffer size: 181
Time taken: 16.9s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.9821
Average policy_loss: 0.8330
Average value_loss: 0.1491
Replay buffer size: 267
Time taken: 18.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.9407
Average policy_loss: 0.8163
Average value_loss: 0.1244
Replay buffer size: 364
Time taken: 17

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█████
iteration_time,▁▂▂▂▂▁▁▂▁█▂█▁▇▂▇▁▁▇▂█▁▂▇▂▁▂▂▂▂▂▂▂▂▂▁▂█▂▂
loss,▂▁▂▃▄▆▅▅▅▅▅▅▅▅▅▅▆▅▅▅▆▅▅▅▇▆▅▆▅▅▆▇▆▆▆█▆▆▆▆
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,█▇▆▂█▅▁▅▆▇▆▁█▁▇▆▄▄▄██▄▂▃▅▅▅▁▂▆▃▆▇▆▃▇▃▁▆▄
policy_loss,▁▁▁▃▃▃▄▄▄▄▃▃▃▃▃▃▃▃▂▂▃█▄▃▃▃▄▃▄▃▄▄▄▅▄▅▅▅▄▅
value_loss,▂▁▁▃▃▅▅▅▅▆▆█▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▆▇▇▇▆▇▆▇▆▆▇

0,1
best_win_rate,1.2
buffer_size,8822.0
iteration_time,65.5283
loss,1.14178
num_games,10.0
num_positions,81.0
policy_loss,0.95197
total_time_hours,0.73217
value_loss,0.18982


[34m[1mwandb[0m: Agent Starting Run: 3oqcgwoz with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.02761871300937066
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: small
[34m[1mwandb[0m: 	value_softness: 0.7139143001081747
[34m[1mwandb[0m: 	weight_decay: 4.160840845407718e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 68 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.5231
Average policy_loss: 0.5507
Average value_loss: 0.9723
Replay buffer size: 68
Time taken: 6.7s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.6749
Average policy_loss: 0.5917
Average value_loss: 0.0832
Replay buffer size: 144
Time taken: 9.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.6203
Average policy_loss: 0.5457
Average value_loss: 0.0746
Replay buffer size: 235
Time taken: 10.3s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 100 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.5995
Average policy_loss: 0.5229
Average value_loss: 0.0766
Replay buffer size: 335
Time taken: 10

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
iteration_time,▁▅▁▆▁▁▁▁▁▆▁▁▁▇▁▁▁▇▁▁▇▁▁▁▁▁▁▁▁▁▇▂▂▂██▂▂█▂
loss,█▇▆▅▅▄▄▄▃▂▂▁▂▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▇▆▆█▅██▇▇▇█▇█▇▇█▆██▇█▆▇██▆▇▆▆▆█▇▇▇█▇▇▆▇
policy_loss,█▇▅▃▃▂▂▁▁▁▁▁▁▁▁▂▂▂▃▃▃▃▄▃▃▃▃▃▃▃▃▄▃▄▄▄▄▄▄▄
value_loss,█▆▆▅▄▃▂▂▁▁▂▂▂▁▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,9617.0
iteration_time,47.07154
loss,0.51835
num_games,10.0
num_positions,93.0
policy_loss,0.47194
total_time_hours,0.49814
value_loss,0.04641


[34m[1mwandb[0m: Agent Starting Run: e0jzbm9x with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.05018288995345499
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.7387659090520059
[34m[1mwandb[0m: 	weight_decay: 0.0005560097255888672


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.4770
Average policy_loss: 1.1895
Average value_loss: 0.2875
Replay buffer size: 92
Time taken: 12.8s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.3052
Average policy_loss: 1.0978
Average value_loss: 0.2074
Replay buffer size: 174
Time taken: 16.6s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.2803
Average policy_loss: 1.0931
Average value_loss: 0.1872
Replay buffer size: 259
Time taken: 15.2s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.1975
Average policy_loss: 1.0279
Average value_loss: 0.1697
Replay buffer size: 353
Time taken: 1

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
iteration_time,▁▁█▁▂▂▂▁█▂▂▂▁█▁█▁▁▁█▁▁▁▁▁▁▁█▁▁█▁▂▁▁▁▁▁▇█
loss,█▇▆▄▅▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▃▆▇▅▄▇▅▅▇▆█▇▆▃▇▇▇▅▆▅▅▅▆▃▃▆▅▇▆▇▇▇▄▆▅▅▁▆█
policy_loss,█▆▅▅▅▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁
value_loss,█▅▄▄▃▃▂▂▂▂▁▂▂▁▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,9063.0
iteration_time,49.8879
loss,0.90608
num_games,10.0
num_positions,100.0
policy_loss,0.79984
total_time_hours,0.62492
value_loss,0.10624


[34m[1mwandb[0m: Agent Starting Run: ks9zitvr with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0019883099148537905
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.3160272671629232
[34m[1mwandb[0m: 	weight_decay: 0.007373828602539275


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.2494
Average policy_loss: 3.5353
Average value_loss: 0.7141
Replay buffer size: 88
Time taken: 10.7s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.6440
Average policy_loss: 2.3606
Average value_loss: 0.2833
Replay buffer size: 180
Time taken: 16.7s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.2151
Average policy_loss: 1.9039
Average value_loss: 0.3112
Replay buffer size: 264
Time taken: 17.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.1401
Average policy_loss: 1.8578
Average value_loss: 0.2822
Replay buffer size: 355
Time taken: 1

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇██
iteration_time,▁▁▁▇▁▁▁▁▁▁▁▁▁▁▁▁▇▁▁▇▁█▁▁▁▁▁▁▁█▁▁▁█▁▁▁█▁█
loss,█▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅▆▄▁▃▂▁▃▅▁▄▃▅▃▅▄▅▄▄▁▆▂▆▆▅▇▆▆█▃▄▆▃▄▄▅▅▂▅█
policy_loss,█▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▂▂▃▃▃▃▃▃▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,8619.0
iteration_time,57.6447
loss,1.4102
num_games,10.0
num_positions,89.0
policy_loss,1.17593
total_time_hours,0.68847
value_loss,0.23427


[34m[1mwandb[0m: Agent Starting Run: 0nat9frd with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 2.1410160841635885e-05
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.8072946001422179
[34m[1mwandb[0m: 	weight_decay: 0.0002148860229820879


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 83 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.7771
Average policy_loss: 3.5850
Average value_loss: 1.1921
Replay buffer size: 83
Time taken: 8.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 2 summary:
Average loss: 3.7461
Average policy_loss: 2.5813
Average value_loss: 1.1648
Replay buffer size: 175
Time taken: 11.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 3.5585
Average policy_loss: 2.3939
Average value_loss: 1.1646
Replay buffer size: 266
Time taken: 10.4s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 83 new positions
Training phase...

Iteration 4 summary:
Average loss: 3.2735
Average policy_loss: 2.1197
Average value_loss: 1.1538
Replay buffer size: 349
Time taken: 9.

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
iteration_time,▁▂▁▅▇▁▂▁▆▁▁▂▁▁▁▂▂▂▂▂▂▂▂▇▂█▂█▂▂▂▂█▂▂▂▂█▂▂
loss,█▇▆▆▆▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▁▄▇▄▆▁▂▃▃▄▆▅▆▆▁▆▆▆▆▅▅▆▅▄▁▆█▇▅▃▄▇█▆▄▅▃▄▆
policy_loss,█▅▅▄▄▄▄▄▄▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,████▇▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.0
buffer_size,9121.0
iteration_time,45.02117
loss,1.40749
num_games,10.0
num_positions,93.0
policy_loss,1.11731
total_time_hours,0.50462
value_loss,0.29018


[34m[1mwandb[0m: Agent Starting Run: xykl357a with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0005121470987175406
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.1726190103548938
[34m[1mwandb[0m: 	weight_decay: 0.007464331859139258


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.3217
Average policy_loss: 1.7917
Average value_loss: 0.5300
Replay buffer size: 97
Time taken: 12.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.5107
Average policy_loss: 1.0388
Average value_loss: 0.4719
Replay buffer size: 181
Time taken: 9.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4252
Average policy_loss: 1.0344
Average value_loss: 0.3908
Replay buffer size: 263
Time taken: 11.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.3806
Average policy_loss: 1.0198
Average value_loss: 0.3608
Replay buffer size: 350
Time taken: 12

0,1
buffer_size,▁▁▁▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
iteration_time,▁▁▂▂▂▇▂█▂█▂▂▂█▂▂▂▂▂▇▂▂▇▂▂▂▂▂▂▇▂▂▂▇▂▂▂▂▂▂
loss,█▇▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▃▅▁▄▁▃▄▆▂▅▃▄▂▄▆▆▆▅▅█▇▅█▆▃▄▅▄▆█▃▅▅▄▂█▇▅▇
policy_loss,████▇▇▇▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁
value_loss,█▅▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.3
buffer_size,8880.0
iteration_time,55.36618
loss,0.95304
num_games,10.0
num_positions,95.0
policy_loss,0.72379
total_time_hours,0.70074
value_loss,0.22926


[34m[1mwandb[0m: Agent Starting Run: yeiehj82 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.05479021497554934
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.12650695874330942
[34m[1mwandb[0m: 	weight_decay: 0.004076918710720771


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.1943
Average policy_loss: 0.9016
Average value_loss: 0.2926
Replay buffer size: 82
Time taken: 12.7s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 74 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9105
Average policy_loss: 0.8376
Average value_loss: 0.0730
Replay buffer size: 156
Time taken: 13.9s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.9099
Average policy_loss: 0.8510
Average value_loss: 0.0589
Replay buffer size: 232
Time taken: 14.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.9117
Average policy_loss: 0.8557
Average value_loss: 0.0560
Replay buffer size: 308
Time taken: 1

0,1
buffer_size,▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇████
iteration_time,▁▆▂▁▁▁▁▁▆▁▂▂▂▂▂▂▂▂▂▇▂▂▂▂▃▂▂▂▂█▂▂▂▂▂▂▂▇▂▂
loss,▆▁▄▄▄▆▆▆▅▆▇▇▇██████▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▂▁▃▃▃▁▂▄▅▆▆▆▅▇▆▅█▆▅▆█▅▆▆▆▆█▃▇▅▄▃▅▄▅▆▇▇▆
policy_loss,▃▁▃▃▄▆▅▆▇▇███▇▇▇▇▇▇▆▆▆▅▆▆▄▅▄▄▄▃▃▄▃▃▂▂▃▃▂
value_loss,▁▁▅▅▄▄▄▆▆▆▇▇▇███▇█▇█▇▇▇▇▇▇▇▇▇▇▇█▇▇███▇▇█

0,1
best_win_rate,1.2
buffer_size,8632.0
iteration_time,56.81119
loss,1.15819
num_games,10.0
num_positions,78.0
policy_loss,0.87782
total_time_hours,0.72566
value_loss,0.28037


[34m[1mwandb[0m: Agent Starting Run: oru4y97b with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 8.561095321660657e-05
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.08079199391530123
[34m[1mwandb[0m: 	weight_decay: 0.031745478452373875


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.4586
Average policy_loss: 3.2049
Average value_loss: 1.2537
Replay buffer size: 86
Time taken: 10.9s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.9337
Average policy_loss: 1.9046
Average value_loss: 1.0291
Replay buffer size: 178
Time taken: 14.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.6929
Average policy_loss: 1.6710
Average value_loss: 1.0218
Replay buffer size: 264
Time taken: 15.2s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.4128
Average policy_loss: 1.5061
Average value_loss: 0.9067
Replay buffer size: 342
Time taken: 1

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇███
iteration_time,▁▁▁▁▁▁▁▁▁▁▆▂▂▂▆▇▇▂▇▂▂▂▂▇▂▂▂▂▇▂▃▂█▂▂▃▂█▃▃
loss,█▅▄▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▂▅▅▇▅▅▇█▅▇▇▆▆▄▆▇▂▅▅█▅▆▅▄▅▅▄▅▄▆▆▅▆▃▃▅▃▁▁
policy_loss,█▄▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▅▅▅▅▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,9016.0
iteration_time,69.21698
loss,1.34059
num_games,10.0
num_positions,91.0
policy_loss,0.99875
total_time_hours,0.73921
value_loss,0.34184


[34m[1mwandb[0m: Agent Starting Run: 2bm423gu with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0014886639470558932
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.6612738967432795
[34m[1mwandb[0m: 	weight_decay: 0.026467512654139595


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.4477
Average policy_loss: 2.6571
Average value_loss: 0.7906
Replay buffer size: 96
Time taken: 8.5s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.6079
Average policy_loss: 1.1647
Average value_loss: 0.4432
Replay buffer size: 171
Time taken: 12.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4513
Average policy_loss: 1.0917
Average value_loss: 0.3597
Replay buffer size: 255
Time taken: 12.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.3455
Average policy_loss: 1.0253
Average value_loss: 0.3202
Replay buffer size: 349
Time taken: 11

0,1
buffer_size,▁▁▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
iteration_time,▁▂▂▇▂▇▂▂▂█▂▂▂▂▂▂▂██▂█▃▂▂▂▂▂█▂▂▂▂▂▂▂▂▂▂▂▂
loss,█▅▆▅▆▆▅▄▄▄▄▄▄▃▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▇▂▂▃▁▃▅▆▅▅▆▄▅▅▅▆▇▇▅▅▇█▅▇▆▇▇▆▅▆▆▅█▇█▆▆▅▇
policy_loss,▆▆▆▇▇████▇▆▆▅▅▅▅▄▅▄▄▃▂▃▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁
value_loss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.35
buffer_size,8816.0
iteration_time,54.08804
loss,1.0748
num_games,10.0
num_positions,88.0
policy_loss,0.92887
total_time_hours,0.64344
value_loss,0.14592


[34m[1mwandb[0m: Agent Starting Run: yz2wd6tn with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.01229218191847406
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.5681225822304464
[34m[1mwandb[0m: 	weight_decay: 0.04645816210044651


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.0360
Average policy_loss: 0.9106
Average value_loss: 0.1254
Replay buffer size: 76
Time taken: 6.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.4916
Average policy_loss: 0.4552
Average value_loss: 0.0364
Replay buffer size: 154
Time taken: 7.2s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.4778
Average policy_loss: 0.4616
Average value_loss: 0.0163
Replay buffer size: 236
Time taken: 8.3s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.5162
Average policy_loss: 0.5035
Average value_loss: 0.0127
Replay buffer size: 312
Time taken: 10.6

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
iteration_time,▁▂▂▂▇▃▂▂▂▂▂▇▂▂█▃▂▂▇▂▂▇▂▇▂▃▂▃█▂█▂█▃▂█▂█▂▂
loss,▁▁▂▅▆█████▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▁▃▆▃▅▅▇▆▃▇▇█▇▇▆▇▅▇▆▄▅▅▆▇█▆▆▅▆▄▅▅▇▆▅▆█▆▆
policy_loss,▁▂▃▆▇█████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▅▅
value_loss,▁▂▄▆▇█████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇

0,1
best_win_rate,1.1
buffer_size,9135.0
iteration_time,47.24488
loss,0.77742
num_games,10.0
num_positions,90.0
policy_loss,0.65264
total_time_hours,0.57339
value_loss,0.12478


[34m[1mwandb[0m: Agent Starting Run: vq97zk6d with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.00013596003118112012
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.8880805620409284
[34m[1mwandb[0m: 	weight_decay: 0.0001514571740597795


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.2991
Average policy_loss: 0.4488
Average value_loss: 1.8504
Replay buffer size: 80
Time taken: 9.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 73 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.5716
Average policy_loss: 0.7104
Average value_loss: 1.8612
Replay buffer size: 153
Time taken: 9.7s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 70 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.5229
Average policy_loss: 0.7118
Average value_loss: 1.8111
Replay buffer size: 223
Time taken: 9.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.3448
Average policy_loss: 0.6821
Average value_loss: 1.6626
Replay buffer size: 309
Time taken: 9.3s

0,1
buffer_size,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
iteration_time,▁▁▁▇▂▂▆▂▂▂▂▂▇▂▂▆▂▂▂▂▂▂▂█▂▂▂▂▇▂▂██▂▂▂▂█▂▃
loss,██▇▆▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▃▆█▆▁▁▁▁▃▅▅▅▆▇▅▄▁▂▅▃▄▄▃▃▃▅▆▄▃▃▄▅▅▅▃▅▅▅▆
policy_loss,▄▂▂▁▆███████████████████████████████████
value_loss,██▇▆▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.3
buffer_size,7984.0
iteration_time,55.2597
loss,1.01005
num_games,10.0
num_positions,92.0
policy_loss,0.84481
total_time_hours,0.58627
value_loss,0.16523


[34m[1mwandb[0m: Agent Starting Run: 73dqea8j with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0002385870068881881
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.26828062571952627
[34m[1mwandb[0m: 	weight_decay: 0.0003903107761078536


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.1539
Average policy_loss: 2.2329
Average value_loss: 1.9209
Replay buffer size: 75
Time taken: 7.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 2 summary:
Average loss: 3.1055
Average policy_loss: 1.1805
Average value_loss: 1.9249
Replay buffer size: 155
Time taken: 6.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.9063
Average policy_loss: 0.8979
Average value_loss: 2.0084
Replay buffer size: 243
Time taken: 6.4s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.8751
Average policy_loss: 0.8541
Average value_loss: 2.0210
Replay buffer size: 319
Time taken: 6.8s

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
iteration_time,▁▁▄▁▁▁▁▁▁▁▂▆▂▂▂▂▂▂▇▂▇▂▂▂█▂▂▂▂▂▇▂▂▇▂▂▂▂██
loss,█▆▅▅▅▅▅▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅▃▁▂▅▄▇▅▅▄▂▅▄▅▃▃▁▁▄▂▇▂▃▄▆▅▂▃▃▂▇▇▄▅▇▅▇▇▇█
policy_loss,█▃▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
value_loss,██████████▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,7983.0
iteration_time,45.17246
loss,1.12444
num_games,10.0
num_positions,92.0
policy_loss,0.85419
total_time_hours,0.45503
value_loss,0.27025


[34m[1mwandb[0m: Agent Starting Run: 26e7scze with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.029235095700522375
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.9913142554881128
[34m[1mwandb[0m: 	weight_decay: 0.003936729881848898


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 1 summary:
Average loss: 0.6400
Average policy_loss: 0.3219
Average value_loss: 0.3181
Replay buffer size: 96
Time taken: 8.9s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.3296
Average policy_loss: 0.2741
Average value_loss: 0.0555
Replay buffer size: 193
Time taken: 8.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 89 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.5481
Average policy_loss: 0.4491
Average value_loss: 0.0990
Replay buffer size: 282
Time taken: 10.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.6115
Average policy_loss: 0.5261
Average value_loss: 0.0854
Replay buffer size: 376
Time taken: 14.

0,1
buffer_size,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇█
iteration_time,▁▁▇▂▂▂▂█▃█▂█▂▂▂▂▃█▇▂█▂▃▂▂█▂▂▂██▃▂█▃▃▃▂▃▂
loss,▁▃▃▄▄▇▇▇▇▇▇▇▇▇████████████▇▇▇▇▇▇▇▇▇▇▇▇▇▇
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅▅▄▆▂▄▅▆▂▇▁▁▂▇▅▅▇▄▆▅▇▇▄▅▅▅▅▅▅▅▅▄▆▅▆▅▄▇█▆
policy_loss,▁▁▆▆▇▇▇▇█████████████████████████████▇▇▇
value_loss,█▁▂▂▃▃▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃

0,1
best_win_rate,1.3
buffer_size,8694.0
iteration_time,68.04538
loss,1.13098
num_games,10.0
num_positions,90.0
policy_loss,1.00409
total_time_hours,0.8042
value_loss,0.12689


[34m[1mwandb[0m: Agent Starting Run: ayk0zy83 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.047491153348280475
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.9863737655778184
[34m[1mwandb[0m: 	weight_decay: 0.0002199847613275889


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.8021
Average policy_loss: 1.3621
Average value_loss: 0.4400
Replay buffer size: 76
Time taken: 8.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9213
Average policy_loss: 0.8083
Average value_loss: 0.1130
Replay buffer size: 156
Time taken: 13.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 93 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.7974
Average policy_loss: 0.6849
Average value_loss: 0.1125
Replay buffer size: 249
Time taken: 11.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.8301
Average policy_loss: 0.7173
Average value_loss: 0.1128
Replay buffer size: 336
Time taken: 12

0,1
buffer_size,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
iteration_time,▁▂▂▂▂▇▂▂▇▂▂▇▂▂▂▂▂▂▇▂▂▂▂▂▇▂▂▂▂▂██▂▂▂█▂▂▂█
loss,▆█▆▄▂▁▁▁▁▂▁▂▁▂▂▁▂▃▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▅▅▇▅▆▅▅█▄▆▃▄▂▄▄▅▅▆▅▄▅▆▅▅▅▅▂▆▆▇▅▇▅▇▆▇▆▆▆
policy_loss,▆▄▂▂▂▂▂▁▂▁▂▂▃▂▃▄▄▄▄▄▄▄▄▄▄▅▅▅▆▇▆▆▆▆▆▇▇▇▇█
value_loss,▇█▆▅▄▃▄▃▂▃▁▂▂▂▂▂▃▁▁▂▃▂▂▂▃▃▃▃▃▄▃▄▃▃▃▃▃▄▄▄

0,1
best_win_rate,1.05
buffer_size,9158.0
iteration_time,46.8017
loss,0.8001
num_games,10.0
num_positions,91.0
policy_loss,0.72872
total_time_hours,0.56217
value_loss,0.07138


[34m[1mwandb[0m: Agent Starting Run: 8qwv1rqb with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0020321793649479615
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.4303273080956088
[34m[1mwandb[0m: 	weight_decay: 3.63682262421725e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.6525
Average policy_loss: 1.9260
Average value_loss: 0.7265
Replay buffer size: 79
Time taken: 10.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.8049
Average policy_loss: 1.3595
Average value_loss: 0.4455
Replay buffer size: 169
Time taken: 10.7s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.5119
Average policy_loss: 1.1763
Average value_loss: 0.3357
Replay buffer size: 247
Time taken: 8.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 74 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.3536
Average policy_loss: 1.0821
Average value_loss: 0.2715
Replay buffer size: 321
Time taken: 8.

0,1
buffer_size,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
iteration_time,▂▃▂▂▂█▁▁▁▁▁▁▁▁▁▇▁▁▁▁▆▁▁▆▁▁▁▁▅▁▁▁▁▁▁▁█▂▂▂
loss,█▆▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▃▆▃▅▂▁▃▁▄▃▂▁▂▃▂▃▂▁▂▃▃▁▁▃▂▂▃▁▂▁▃▆█▇▆▆▇█▇
policy_loss,█▄▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂
value_loss,█▅▄▃▃▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁

0,1
best_win_rate,1.1
buffer_size,7585.0
iteration_time,30.93466
loss,0.86507
num_games,10.0
num_positions,92.0
policy_loss,0.73515
total_time_hours,0.30319
value_loss,0.12993


[34m[1mwandb[0m: Agent Starting Run: uxn9jdci with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.02483559543936108
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.2423451955306365
[34m[1mwandb[0m: 	weight_decay: 0.0016108255576352908


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 67 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.7442
Average policy_loss: 2.9705
Average value_loss: 1.7736
Replay buffer size: 67
Time taken: 6.9s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.6594
Average policy_loss: 1.1409
Average value_loss: 1.5185
Replay buffer size: 155
Time taken: 12.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.7759
Average policy_loss: 0.9291
Average value_loss: 0.8468
Replay buffer size: 234
Time taken: 14.4s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.2040
Average policy_loss: 0.9248
Average value_loss: 0.2791
Replay buffer size: 320
Time taken: 13

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
iteration_time,▆▁▆▂▂▂▂█▂▂▂▂▂▂▂▁▁▁▁▇▇▁▁▁▁▁▁▁▁█▁▁▇▁▁▂▂▁▁▇
loss,█▄▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▁▁▄▂▅▄█▆▇▆▆▅▅▆▆▅▅▅▆▆█▄▇▃▇▆▆▅▇▆█▇▇▆▇█▄▃▅
policy_loss,▇▇██▇▆▆▆▅▄▄▄▄▃▃▃▃▂▃▂▂▂▂▂▂▂▂▁▁▂▂▂▂▂▁▂▁▁▁▁
value_loss,█▄▂▁▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,9175.0
iteration_time,45.60673
loss,0.8843
num_games,10.0
num_positions,92.0
policy_loss,0.70691
total_time_hours,0.56158
value_loss,0.17739


[34m[1mwandb[0m: Agent Starting Run: ftta8ltu with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 6.109139092490841e-05
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: small
[34m[1mwandb[0m: 	value_softness: 0.926134885910707
[34m[1mwandb[0m: 	weight_decay: 1.7826581663215916e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 77 new positions
Training phase...

Iteration 1 summary:
Average loss: 8.0444
Average policy_loss: 6.4361
Average value_loss: 1.6083
Replay buffer size: 77
Time taken: 7.3s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 2 summary:
Average loss: 6.7415
Average policy_loss: 5.4957
Average value_loss: 1.2459
Replay buffer size: 164
Time taken: 6.6s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 5.4759
Average policy_loss: 4.4173
Average value_loss: 1.0586
Replay buffer size: 250
Time taken: 6.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 4 summary:
Average loss: 4.8456
Average policy_loss: 3.9345
Average value_loss: 0.9111
Replay buffer size: 342
Time taken: 6.8s

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇██
iteration_time,▁▁▁▅▁▅▁▁▁▂▂▂▂▇▂▂▂▂▂▂▂▂▂▇▂▂▇▂▂▂▂█▂▂▂▂▂▂▂▂
loss,█▅▄▄▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▅▄▄▄▇▅▂▇▇▇▃▇▆▂▅▄▅▁▇▇▇▇▅▇▃▄▄▇▆▄▆▆█▂▆▅█▄█
policy_loss,█▅▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▇▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,8786.0
iteration_time,41.12503
loss,1.48959
num_games,10.0
num_positions,99.0
policy_loss,1.28315
total_time_hours,0.46678
value_loss,0.20644


[34m[1mwandb[0m: Agent Starting Run: 10qsugl9 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0010217147467280667
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -20
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.6649995855674485
[34m[1mwandb[0m: 	weight_decay: 0.010690439823965388


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 73 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.6845
Average policy_loss: 3.0000
Average value_loss: 0.6845
Replay buffer size: 73
Time taken: 8.9s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 78 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.5685
Average policy_loss: 2.1418
Average value_loss: 0.4267
Replay buffer size: 151
Time taken: 11.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 83 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.3544
Average policy_loss: 2.0077
Average value_loss: 0.3467
Replay buffer size: 234
Time taken: 12.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.2420
Average policy_loss: 1.9863
Average value_loss: 0.2557
Replay buffer size: 331
Time taken: 11

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇███
iteration_time,▁▁▇▁▁▁▁▂█▁▁▇▁▂▇▁▁▁▁▁▁▁▁█▁▇▁▇▁▇▁▁▇▁▇▇▁▁▁▇
loss,█▅▄▄▄▄▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▃█▄▄▄▅▃▂▃▁▅▇▃▆▆▆▅▇▇▇▆▄▃▅▅▆▆▆▆▄▇▃▄▆▆▇██▄
policy_loss,███▇▇▅▄▄▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▅▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,8748.0
iteration_time,41.92131
loss,1.03647
num_games,10.0
num_positions,81.0
policy_loss,0.90368
total_time_hours,0.52395
value_loss,0.13279


[34m[1mwandb[0m: Agent Starting Run: rhoncaaa with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.004629519546244667
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.32978566188767966
[34m[1mwandb[0m: 	weight_decay: 0.0004577065567619913


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.5976
Average policy_loss: 0.9821
Average value_loss: 0.6156
Replay buffer size: 88
Time taken: 8.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 89 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.3040
Average policy_loss: 1.0403
Average value_loss: 0.2638
Replay buffer size: 177
Time taken: 12.5s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.3654
Average policy_loss: 1.0353
Average value_loss: 0.3301
Replay buffer size: 253
Time taken: 11.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 69 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.3271
Average policy_loss: 1.0106
Average value_loss: 0.3165
Replay buffer size: 322
Time taken: 10

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
iteration_time,█▂▆▂▂▂▆▁▁▅▂▆▂▂▂▆▁▁▅▁▅▁▁▁▁▁▁▅▁▁▁▁▅▁▅▁▁▁▁▁
loss,██▇▆▅▄▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,█▄▃▃▃▂▅▄▂▄▅▁▃▄▃▄▂▂▂▂▂▂▅▂▄▂▄▄▂▃▂▂▄▂▅▂▄▂▄▂
policy_loss,▇██▇▇▆▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
value_loss,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.3
buffer_size,7263.0
iteration_time,20.13669
loss,0.70583
num_games,10.0
num_positions,68.0
policy_loss,0.63569
total_time_hours,0.25518
value_loss,0.07014


[34m[1mwandb[0m: Agent Starting Run: 1f3f9wiy with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0008878857214719207
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.9520662692903116
[34m[1mwandb[0m: 	weight_decay: 0.0390967120950752


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.4744
Average policy_loss: 0.8146
Average value_loss: 0.6599
Replay buffer size: 87
Time taken: 8.7s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 98 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9705
Average policy_loss: 0.4430
Average value_loss: 0.5274
Replay buffer size: 185
Time taken: 6.8s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 97 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.7537
Average policy_loss: 0.3918
Average value_loss: 0.3619
Replay buffer size: 282
Time taken: 7.6s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 100 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.6205
Average policy_loss: 0.3468
Average value_loss: 0.2737
Replay buffer size: 382
Time taken: 6.7

0,1
buffer_size,▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
iteration_time,▂▁▁▁▆▁▂▂▂█▂▇▂▂█▂██▂▁▂█▁▁██▁█▂▂▂▂▂▁▂▂▂▁▇█
loss,█▄▂▂▁▁▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅██▇▅▄▂▃▃▃▃▄▃▂▄▂▃▄▃▃▂▂▂▃▄▂▂▄▃▂▁▃▂▃▂▃▂▃▅▃
policy_loss,█▂▁▁▃▃▄▄▄▅▅▅▅▅▅▅▅▅▅▅▅▆▅▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆▆
value_loss,█▆▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,7598.0
iteration_time,30.82473
loss,0.7419
num_games,10.0
num_positions,72.0
policy_loss,0.65811
total_time_hours,0.36438
value_loss,0.08379


[34m[1mwandb[0m: Agent Starting Run: ffl4ob5w with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0015000292643164077
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.3952874363751031
[34m[1mwandb[0m: 	weight_decay: 0.012888957366713936


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.9375
Average policy_loss: 2.1361
Average value_loss: 0.8014
Replay buffer size: 88
Time taken: 8.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.7517
Average policy_loss: 1.2343
Average value_loss: 0.5174
Replay buffer size: 172
Time taken: 8.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4744
Average policy_loss: 1.0674
Average value_loss: 0.4070
Replay buffer size: 258
Time taken: 6.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 83 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.4237
Average policy_loss: 1.0310
Average value_loss: 0.3927
Replay buffer size: 341
Time taken: 6.9s

0,1
buffer_size,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇█
iteration_time,▁▆▂▁▂▂▇▂▂▂▂█▂█▂▂▂▂▂▂▂▇▂▂▇▂▂█▂▂▂▂█▂▂▂█▂▂▂
loss,█▇▆▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▇▄▃▄▄▇▆▆▅▇▆▇▃▅▅▆▅█▅▅▅▅▁▅▅▆▇▅▇▇▅▆▃▅▆▃▇▇▅
policy_loss,█▅▃▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▅▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,8999.0
iteration_time,45.84922
loss,0.94875
num_games,10.0
num_positions,92.0
policy_loss,0.79222
total_time_hours,0.53194
value_loss,0.15653


[34m[1mwandb[0m: Agent Starting Run: kjddt8wi with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.029538367363252747
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.42492449765990126
[34m[1mwandb[0m: 	weight_decay: 2.0992703840682736e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 70 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.8967
Average policy_loss: 2.0433
Average value_loss: 1.8534
Replay buffer size: 70
Time taken: 8.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 70 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.5259
Average policy_loss: 0.5460
Average value_loss: 1.9799
Replay buffer size: 140
Time taken: 7.2s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 70 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.2712
Average policy_loss: 0.5238
Average value_loss: 0.7474
Replay buffer size: 210
Time taken: 7.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0647
Average policy_loss: 0.7428
Average value_loss: 0.3219
Replay buffer size: 285
Time taken: 9.8s

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇████
iteration_time,▁▂▂▁▁▆▁▁▆▂▂█▁▂▂▁▁▁▁▁▁▁█▁▁▁▁▁▇▁▁▁█▁▁▇▁▁▁▁
loss,█▅▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▁▂▁▁▂▃▃▂▇▆▇▆▇▇██▇███▆▇██▇▇▆█▅▆█▇█▇▇█▇██▆
policy_loss,█▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,8991.0
iteration_time,37.39099
loss,0.74295
num_games,10.0
num_positions,94.0
policy_loss,0.58928
total_time_hours,0.41761
value_loss,0.15366


[34m[1mwandb[0m: Agent Starting Run: gndt3i61 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0004107382254549766
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.44418547824738286
[34m[1mwandb[0m: 	weight_decay: 0.00026423209834333


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.5694
Average policy_loss: 3.2988
Average value_loss: 1.2707
Replay buffer size: 85
Time taken: 10.3s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 2 summary:
Average loss: 3.4527
Average policy_loss: 2.4851
Average value_loss: 0.9676
Replay buffer size: 181
Time taken: 12.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.8954
Average policy_loss: 2.1035
Average value_loss: 0.7919
Replay buffer size: 266
Time taken: 15.1s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.6452
Average policy_loss: 1.9694
Average value_loss: 0.6757
Replay buffer size: 358
Time taken: 1

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇████
iteration_time,▁▇▂▂▂▂█▂▂▂█▂▂▂▂▂▂█▂▂█▂▂▇▂▂█▂▂▂▂▂▂▂▂▂█▂▂▂
loss,█▅▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▃▄▁▁▅▁▄▂▅▄▅▄▃▃▃▆▄▇▅▆▇▅▆▄█▇▃▂█▅▆▃▇▇█▃▂▆▂
policy_loss,█▆▆▆▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▆▅▄▄▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.25
buffer_size,9074.0
iteration_time,72.54292
loss,0.98272
num_games,10.0
num_positions,94.0
policy_loss,0.83311
total_time_hours,0.88975
value_loss,0.14961


[34m[1mwandb[0m: Agent Starting Run: igd0zblk with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 2.008490693503024e-05
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -15
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.6688208057630084
[34m[1mwandb[0m: 	weight_decay: 5.213378425619524e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 99 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.6733
Average policy_loss: 4.1587
Average value_loss: 0.5146
Replay buffer size: 99
Time taken: 7.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 2 summary:
Average loss: 4.4883
Average policy_loss: 4.0028
Average value_loss: 0.4854
Replay buffer size: 187
Time taken: 9.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 3 summary:
Average loss: 4.6555
Average policy_loss: 4.0677
Average value_loss: 0.5878
Replay buffer size: 277
Time taken: 10.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 4.6074
Average policy_loss: 3.9904
Average value_loss: 0.6170
Replay buffer size: 371
Time taken: 9.4

0,1
buffer_size,▁▂▂▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇███
iteration_time,▁▁▅▁▁▂▁▂▇▂▂▂▂▂▂▂▇▂▂█▂▂▂▂▂▂▂▂▂█▂▂▂▂▂▂▂▂▂▂
loss,███▇▇▅▅▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▇▄▇▁▅█▅▄▃█▇▅▇█▇▅▃▆▆▅▄▄▂▄▅▆▅▂▅▄▂▄▂▄▄▃▂▃▄
policy_loss,██▇▇▆▅▅▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
value_loss,▆▅██▇▆▆▅▆▆▅▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,8687.0
iteration_time,52.04467
loss,2.16834
num_games,10.0
num_positions,83.0
policy_loss,1.86544
total_time_hours,0.5836
value_loss,0.30289


[34m[1mwandb[0m: Agent Starting Run: pa53kjlz with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.013969647630191706
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.5996844467971668
[34m[1mwandb[0m: 	weight_decay: 0.0001260649060622499


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 95 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.5448
Average policy_loss: 2.0556
Average value_loss: 0.4891
Replay buffer size: 95
Time taken: 14.8s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9567
Average policy_loss: 0.7160
Average value_loss: 0.2407
Replay buffer size: 181
Time taken: 14.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.8428
Average policy_loss: 0.6795
Average value_loss: 0.1633
Replay buffer size: 266
Time taken: 13.2s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.7957
Average policy_loss: 0.6627
Average value_loss: 0.1330
Replay buffer size: 350
Time taken: 1

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
iteration_time,▁▁▅▁▁▇▁▁▂▁▁▂▂▂▂█▂▂▇▂▂▂█▁▂▂█▁▁▂▂▂▂▂▂▁█▂▂█
loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▂▁▆▆▅▅▇▆█▅▆▄▆▇▅█▅▆▅▅▆▇█▇▄█▆▇▇█▄▂▆▇▂▃█▄█
policy_loss,█▆▅▄▄▄▄▄▃▃▃▂▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▅▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,0.95
buffer_size,9401.0
iteration_time,54.6204
loss,0.64714
num_games,10.0
num_positions,100.0
policy_loss,0.57785
total_time_hours,0.64032
value_loss,0.06929


[34m[1mwandb[0m: Agent Starting Run: xrs2pwv6 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.005894732331429047
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.8159122015632893
[34m[1mwandb[0m: 	weight_decay: 0.007810698451177676


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.8942
Average policy_loss: 1.2127
Average value_loss: 0.6815
Replay buffer size: 88
Time taken: 7.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.0352
Average policy_loss: 0.6070
Average value_loss: 0.4282
Replay buffer size: 180
Time taken: 10.5s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 99 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.9223
Average policy_loss: 0.5281
Average value_loss: 0.3942
Replay buffer size: 279
Time taken: 11.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 89 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.8782
Average policy_loss: 0.5577
Average value_loss: 0.3205
Replay buffer size: 368
Time taken: 13

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
iteration_time,▁▁▂▂▇▂▇▂▂█▂▇▂▂▂▇▂▂▂▂▂▂▂▇▂▇▂▂▂▂▂▂▂▂▇▂▇▂▂▇
loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▅▅▃█▇▄▅▇▇▄█▃▅▇▂▆▃▄▃▅▃▅▄▃▄▆▁▅▆▇▂▇▆▇▄▃▃▇▅▆
policy_loss,▁▄▆▆▇▇▇▇▇▇▇▇█▇██████████████████████████
value_loss,█▇▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,8996.0
iteration_time,63.63387
loss,0.88
num_games,10.0
num_positions,92.0
policy_loss,0.78876
total_time_hours,0.79082
value_loss,0.09125


[34m[1mwandb[0m: Agent Starting Run: tr6ss13k with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.03862567744897157
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.6781602793764515
[34m[1mwandb[0m: 	weight_decay: 0.006082793782026864


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.6506
Average policy_loss: 2.2301
Average value_loss: 1.4206
Replay buffer size: 85
Time taken: 11.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 82 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.1396
Average policy_loss: 0.5553
Average value_loss: 1.5843
Replay buffer size: 167
Time taken: 10.4s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 95 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.2129
Average policy_loss: 0.5163
Average value_loss: 0.6966
Replay buffer size: 262
Time taken: 10.4s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.6859
Average policy_loss: 0.5199
Average value_loss: 0.1661
Replay buffer size: 358
Time taken: 1

0,1
buffer_size,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
iteration_time,▁▁▁▁▂▂▂▂▂▆▂▂▂▂▆▂▂█▂▇▂▂▂▇▂▂▂▇▂▂▇▂▂█▂▃▃▃▃▂
loss,█▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▃▇▇▆▇▆█▇▇▇▆▇█▆▇▇█▇▆▅▅▅▇▆▇▇▆▇▆▆▅▅█▁▇▆▆▃▁
policy_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▂▂▂▂▃
value_loss,▇█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,9292.0
iteration_time,72.05141
loss,0.92811
num_games,10.0
num_positions,93.0
policy_loss,0.83012
total_time_hours,0.7313
value_loss,0.098


[34m[1mwandb[0m: Agent Starting Run: qjqzvbk9 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 2
[34m[1mwandb[0m: 	batch_size: 256
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.015946964266376425
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.7340837764314487
[34m[1mwandb[0m: 	weight_decay: 0.036254775684257376


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 75 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.2184
Average policy_loss: 1.6723
Average value_loss: 0.5461
Replay buffer size: 75
Time taken: 7.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.1974
Average policy_loss: 1.0291
Average value_loss: 0.1683
Replay buffer size: 165
Time taken: 13.5s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.0501
Average policy_loss: 0.8833
Average value_loss: 0.1668
Replay buffer size: 256
Time taken: 16.2s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 100 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.9373
Average policy_loss: 0.8149
Average value_loss: 0.1224
Replay buffer size: 356
Time taken: 1

0,1
buffer_size,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇████
iteration_time,▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂█▂▂█▂▃▃▂▂▂▂▂▂▂▂▂█▂▂▃█
loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,█▁▅▃▅▆▄▅▃▅▇█▅▅▆▅▇▅▆▄▅▆█▂▃▅▃▅▅▅▆▆▅▄▆▅▄█▄▅
policy_loss,▇▁▆▆▇▇█▆▇▇▇▆▇▅▆▆▅▆▆▆▆▅▅▆▆▆▆▇▆▆▆▆▆▇▇▆▆▆▆▅
value_loss,█▂▂▁▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,9112.0
iteration_time,55.79177
loss,0.96737
num_games,10.0
num_positions,89.0
policy_loss,0.86445
total_time_hours,0.67035
value_loss,0.10291


[34m[1mwandb[0m: Agent Starting Run: b29skn5d with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.03951229107871487
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: small
[34m[1mwandb[0m: 	value_softness: 0.4830237912634562
[34m[1mwandb[0m: 	weight_decay: 0.0002582797354448806


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 76 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.5095
Average policy_loss: 0.7264
Average value_loss: 0.7831
Replay buffer size: 76
Time taken: 10.0s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9776
Average policy_loss: 0.6944
Average value_loss: 0.2831
Replay buffer size: 163
Time taken: 11.5s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 89 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.8532
Average policy_loss: 0.6554
Average value_loss: 0.1978
Replay buffer size: 252
Time taken: 12.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.8493
Average policy_loss: 0.6835
Average value_loss: 0.1657
Replay buffer size: 344
Time taken: 1

0,1
buffer_size,▁▁▁▁▁▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
iteration_time,▁▁▂▂▂▂▇▂▂▂▂▂▂▂▂▇▂▇▂▂▂█▂▂▃▇▂▂█▂▂█▂▂█▂▂▂▂█
loss,█▃▄▄▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▃▅▂▆▄▁▅▅▅▅▆▅▇█▄▅▄▆▅█▄▄▆▆▄▇▅█▅▄▅▆▅▆▄▄▇▄▇▄
policy_loss,▁▄▆▇█▆▅▄▄▄▃▄▄▃▄▄▄▄▄▅▅▄▄▅▅▆▆▅▆▆▆▆▇▆▆▇▇███
value_loss,█▄▃▂▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,9175.0
iteration_time,68.52223
loss,0.83115
num_games,10.0
num_positions,81.0
policy_loss,0.71548
total_time_hours,0.7959
value_loss,0.11567


[34m[1mwandb[0m: Agent Starting Run: 1qexohwq with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.004272176241629109
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.6510314825776856
[34m[1mwandb[0m: 	weight_decay: 0.0001031966998491176


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.9031
Average policy_loss: 1.5660
Average value_loss: 0.3372
Replay buffer size: 86
Time taken: 8.2s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 98 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.1175
Average policy_loss: 0.5808
Average value_loss: 0.5367
Replay buffer size: 184
Time taken: 8.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.0386
Average policy_loss: 0.5364
Average value_loss: 0.5023
Replay buffer size: 276
Time taken: 9.6s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.9011
Average policy_loss: 0.5363
Average value_loss: 0.3648
Replay buffer size: 367
Time taken: 10.7

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
iteration_time,▁▁█▂▂█▁▁▁▁▁▁▁▇▁▁▇▁▁▁▁▁▁▁▆▁▁▁▇▁▁▁▁▁▁▁▁▁▇▁
loss,█▇▄▁▂▂▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▇█▆▃▂▃▃▄▃▄▂▃▃▃▃▄▄▂▃▃▂▃▃▄▃▃▃▄▂▄▃▃▃▃▂▁▃▃▂▃
policy_loss,█▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
value_loss,█▄▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,7463.0
iteration_time,32.70994
loss,0.70581
num_games,10.0
num_positions,74.0
policy_loss,0.65882
total_time_hours,0.4209
value_loss,0.04699


[34m[1mwandb[0m: Agent Starting Run: ym5evfg9 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.01715578684502239
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.5180851764381642
[34m[1mwandb[0m: 	weight_decay: 0.00026745893171497984


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.0491
Average policy_loss: 1.1416
Average value_loss: 0.9075
Replay buffer size: 90
Time taken: 11.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 85 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.1437
Average policy_loss: 0.9110
Average value_loss: 0.2327
Replay buffer size: 175
Time taken: 15.1s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.0400
Average policy_loss: 0.8409
Average value_loss: 0.1992
Replay buffer size: 262
Time taken: 17.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 95 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0119
Average policy_loss: 0.8343
Average value_loss: 0.1776
Replay buffer size: 357
Time taken: 1

0,1
buffer_size,▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
iteration_time,▁▂▆▁▁▆▁▂▂▂█▂▂█▂▂▂█▂▂▂▂▂▂▂█▂█▂▂▂▂▂█▂▂▂▂▂▂
loss,█▄▂▁▁▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▇▆▅▅▆▃▄▅▄▇▁▅▅▆█▇█▂▇▅▁▇▇▂▇▆▂▃▅▇▆█▇█▇▇▆▇▂▇
policy_loss,█▄▂▂▂▁▁▁▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
value_loss,█▆▄▂▁▁▁▃▄▃▄▄▄▄▃▄▄▄▄▄▃▃▃▃▃▂▂▂▂▃▂▂▂▂▂▁▁▁▁▁

0,1
best_win_rate,1.1
buffer_size,9041.0
iteration_time,70.46886
loss,0.97942
num_games,10.0
num_positions,95.0
policy_loss,0.85291
total_time_hours,0.86627
value_loss,0.12651


[34m[1mwandb[0m: Agent Starting Run: r8dzlq0u with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.01
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.002869612612950579
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.7666264946368808
[34m[1mwandb[0m: 	weight_decay: 1.194626106470751e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.8040
Average policy_loss: 1.2633
Average value_loss: 0.5407
Replay buffer size: 91
Time taken: 12.9s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.0612
Average policy_loss: 0.7909
Average value_loss: 0.2702
Replay buffer size: 170
Time taken: 11.6s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.9182
Average policy_loss: 0.7209
Average value_loss: 0.1973
Replay buffer size: 254
Time taken: 11.3s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 96 new positions
Training phase...

Iteration 4 summary:
Average loss: 0.8609
Average policy_loss: 0.6980
Average value_loss: 0.1629
Replay buffer size: 350
Time taken: 1

0,1
buffer_size,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇███
iteration_time,▆▁▁▆▁▆▁▇▁▇▁▂▇▁▂▇▁▁▇▁▂▂▇▂▂█▂▂▇▂▂▂█▂▂▂▂▂▂█
loss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▁▇▆▂█▅▇▇▅█▅▇▆▄▅▄▇▄▄▆▃▄▆▅▅▆▃█▄▅▅▆▇▂▅▃▅█▄
policy_loss,▆█▆▆▅▄▄▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂
value_loss,█▄▃▃▂▂▂▂▂▂▂▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.05
buffer_size,9299.0
iteration_time,60.2578
loss,0.70152
num_games,10.0
num_positions,89.0
policy_loss,0.62503
total_time_hours,0.67711
value_loss,0.07649


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 42xwipnd with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 1
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.009840250466546557
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.965336122464968
[34m[1mwandb[0m: 	weight_decay: 0.000397018136409093


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 73 new positions
Training phase...

Iteration 1 summary:
Average loss: 2.9848
Average policy_loss: 1.4515
Average value_loss: 1.5333
Replay buffer size: 73
Time taken: 4.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.9776
Average policy_loss: 0.9427
Average value_loss: 1.0348
Replay buffer size: 161
Time taken: 8.4s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.2787
Average policy_loss: 0.8796
Average value_loss: 0.3991
Replay buffer size: 252
Time taken: 8.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0962
Average policy_loss: 0.8737
Average value_loss: 0.2225
Replay buffer size: 339
Time taken: 9.8s

0,1
buffer_size,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
iteration_time,▁▂▂▂▂▇▂▂▂▇▇▂▁▂▂▂▂▂▂▇▃▃▃▃█▃█▃▃▂▂▂▂▂▇▂▂▂▂▂
loss,█▃▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▆▅█▆▄▄▄▅▁▁▃▂▃▃▆▆▇▅▇▆▆▆▅▆▅▃▆▃▃▂▄▄▄▂▂▄▃▃▃
policy_loss,█▂▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃
value_loss,█▆▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.25
buffer_size,8041.0
iteration_time,33.57955
loss,1.14047
num_games,10.0
num_positions,78.0
policy_loss,0.97709
total_time_hours,0.40341
value_loss,0.16338


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: e3omdks4 with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.006317382532920036
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.5749564637708762
[34m[1mwandb[0m: 	weight_decay: 1.13971962739925e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 1 summary:
Average loss: 4.2492
Average policy_loss: 3.5491
Average value_loss: 0.7001
Replay buffer size: 79
Time taken: 16.1s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 79 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.5985
Average policy_loss: 1.1368
Average value_loss: 0.4617
Replay buffer size: 158
Time taken: 11.9s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 74 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.2648
Average policy_loss: 0.9123
Average value_loss: 0.3526
Replay buffer size: 232
Time taken: 10.8s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.2313
Average policy_loss: 0.8940
Average value_loss: 0.3373
Replay buffer size: 320
Time taken: 1

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇███
iteration_time,▂▂▅▁▁▁▁▁▁▅▁▄▁▁▁▁▁▄▁▁▁▁▁▅▁▂▂▂▂▂█▃▃▃▃▃▃▃▃▃
loss,█▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▁▅▅▅▇▆▇▇▇▅█▇█████▇██▇█▆▇▇█▄▅▇███▆██▇▇▇▆
policy_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▆▆▅▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,0.85
buffer_size,9588.0
iteration_time,63.28807
loss,0.59675
num_games,10.0
num_positions,95.0
policy_loss,0.52918
total_time_hours,0.55514
value_loss,0.06757


[34m[1mwandb[0m: Agent Starting Run: mwgt93eq with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.02989979919223195
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: large
[34m[1mwandb[0m: 	value_softness: 0.18152099402570257
[34m[1mwandb[0m: 	weight_decay: 1.5932900722595426e-05


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.9625
Average policy_loss: 1.4489
Average value_loss: 0.5135
Replay buffer size: 87
Time taken: 11.3s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.2880
Average policy_loss: 0.9782
Average value_loss: 0.3098
Replay buffer size: 174
Time taken: 27.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 86 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.4261
Average policy_loss: 1.0575
Average value_loss: 0.3686
Replay buffer size: 260
Time taken: 27.0s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.4307
Average policy_loss: 1.0778
Average value_loss: 0.3530
Replay buffer size: 352
Time taken: 2

0,1
buffer_size,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇███
iteration_time,▁▁█▁▁▁█▁▁▁▁█▁▁█▁▁▁▁▁▁▁▁▁▇▁█▁▁▁▁▁█▁▁▁▁▁▁▁
loss,▃▆▇██▇▆▅▅▄▄▄▄▄▄▄▃▃▃▃▃▄▃▂▂▂▂▂▂▂▁▂▂▁▁▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▄▄▅▄▃▃▂▁▄▆▃▅▆▅▄▇▅▅▅▆▇▄▆▃▇▅▃▁█▃▆▇▆▆▇▆▆█▄▆
policy_loss,█▁▃▃▃▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
value_loss,█▃▃▃▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.2
buffer_size,8867.0
iteration_time,80.50791
loss,1.21535
num_games,10.0
num_positions,96.0
policy_loss,0.9628
total_time_hours,1.01701
value_loss,0.25254


[34m[1mwandb[0m: Agent Starting Run: 9y8nbmkn with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 3
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.00010840278403668248
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: xlarge
[34m[1mwandb[0m: 	value_softness: 0.873471271655084
[34m[1mwandb[0m: 	weight_decay: 0.002175201218305422


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 93 new positions
Training phase...

Iteration 1 summary:
Average loss: 3.6069
Average policy_loss: 2.5141
Average value_loss: 1.0928
Replay buffer size: 93
Time taken: 9.8s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 100 new positions
Training phase...

Iteration 2 summary:
Average loss: 2.1548
Average policy_loss: 1.1104
Average value_loss: 1.0444
Replay buffer size: 193
Time taken: 9.3s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 2.1444
Average policy_loss: 1.0955
Average value_loss: 1.0489
Replay buffer size: 284
Time taken: 9.9s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 94 new positions
Training phase...

Iteration 4 summary:
Average loss: 2.1003
Average policy_loss: 1.0505
Average value_loss: 1.0498
Replay buffer size: 378
Time taken: 11.

0,1
buffer_size,▁▁▁▁▂▃▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇███
iteration_time,▁▁▂▂▆▂▇▇▂▂▂▂▂▂█▂▂▂▂▂█▃▂█▃▂█▃▂▂▂█▂█▂▂█▃▃▃
loss,██▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▆▇▇▅▇█▇▄▆█▇▅▃▂▄██▇▁▅▆▆▂▄▁▃▄▁▆▆▄▃▄▄▃▅▃▇▄
policy_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂
value_loss,█▇███████████████████████████████▁▁▁▁▁▁▁

0,1
best_win_rate,0.95
buffer_size,8942.0
iteration_time,69.91262
loss,1.23671
num_games,10.0
num_positions,88.0
policy_loss,1.09025
total_time_hours,0.81609
value_loss,0.14646


[34m[1mwandb[0m: Agent Starting Run: 6yd6ohjb with config:
[34m[1mwandb[0m: 	activation: relu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 1024
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.006240627863831675
[34m[1mwandb[0m: 	mask_illegal_moves: False
[34m[1mwandb[0m: 	mask_value: -10
[34m[1mwandb[0m: 	norm_first: False
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: medium
[34m[1mwandb[0m: 	value_softness: 0.8537371052272262
[34m[1mwandb[0m: 	weight_decay: 0.0007780525146509451


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 90 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.8282
Average policy_loss: 1.3740
Average value_loss: 0.4541
Replay buffer size: 90
Time taken: 9.4s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 92 new positions
Training phase...

Iteration 2 summary:
Average loss: 0.9463
Average policy_loss: 0.7601
Average value_loss: 0.1862
Replay buffer size: 182
Time taken: 13.7s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 81 new positions
Training phase...

Iteration 3 summary:
Average loss: 0.9888
Average policy_loss: 0.8126
Average value_loss: 0.1762
Replay buffer size: 263
Time taken: 17.5s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 88 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0515
Average policy_loss: 0.8967
Average value_loss: 0.1548
Replay buffer size: 351
Time taken: 20

0,1
buffer_size,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
iteration_time,▁▂▇▂▂▆▂▂▂▂▂▇▂▂▂▂▂▂▂▂▇▂▂▂▂▇▂▂█▃▃▃▃▂█▃▂█▃▃
loss,█▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▆▇▃▅▇▂▄▂▃▁▂▃▄▄▄▅▅█▄▅▅▅▇▅▇██▇▇▂▇▅▇▅▇▅▅▆▅▆
policy_loss,▁▆▅▆██▇▇▇▇▆▆▆▆▆▅▆▆▅▅▅▅▅▅▅▅▅▅▅▆▅▆▅▅▅▅▅▅▅▅
value_loss,█▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.25
buffer_size,8658.0
iteration_time,68.67727
loss,1.04205
num_games,10.0
num_positions,91.0
policy_loss,0.89651
total_time_hours,0.77658
value_loss,0.14554


[34m[1mwandb[0m: Agent Starting Run: 7c3e98y3 with config:
[34m[1mwandb[0m: 	activation: gelu
[34m[1mwandb[0m: 	attention_layers: 4
[34m[1mwandb[0m: 	batch_size: 512
[34m[1mwandb[0m: 	checkpoint_frequency: 20
[34m[1mwandb[0m: 	dropout: 0.0001
[34m[1mwandb[0m: 	games_per_iteration: 10
[34m[1mwandb[0m: 	learning_rate: 0.0443884942945996
[34m[1mwandb[0m: 	mask_illegal_moves: True
[34m[1mwandb[0m: 	mask_value: -5
[34m[1mwandb[0m: 	norm_first: True
[34m[1mwandb[0m: 	num_iterations: 100
[34m[1mwandb[0m: 	num_simulations: 100
[34m[1mwandb[0m: 	replay_buffer_max_size: 10000
[34m[1mwandb[0m: 	steps_per_iteration: 100
[34m[1mwandb[0m: 	transformer_size: tiny
[34m[1mwandb[0m: 	value_softness: 0.8855538374778479
[34m[1mwandb[0m: 	weight_decay: 0.00011723740862830225


Training model: transformer
Using device: mps

Iteration 1/100
Self-play phase...
Playing game 10/10
Generated 80 new positions
Training phase...

Iteration 1 summary:
Average loss: 1.2178
Average policy_loss: 1.0263
Average value_loss: 0.1915
Replay buffer size: 80
Time taken: 14.6s

Iteration 2/100
Self-play phase...
Playing game 10/10
Generated 87 new positions
Training phase...

Iteration 2 summary:
Average loss: 1.1077
Average policy_loss: 0.9700
Average value_loss: 0.1377
Replay buffer size: 167
Time taken: 15.0s

Iteration 3/100
Self-play phase...
Playing game 10/10
Generated 91 new positions
Training phase...

Iteration 3 summary:
Average loss: 1.0684
Average policy_loss: 0.9676
Average value_loss: 0.1008
Replay buffer size: 258
Time taken: 20.1s

Iteration 4/100
Self-play phase...
Playing game 10/10
Generated 84 new positions
Training phase...

Iteration 4 summary:
Average loss: 1.0728
Average policy_loss: 0.9732
Average value_loss: 0.0996
Replay buffer size: 342
Time taken: 2

0,1
buffer_size,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
iteration_time,▁▂▂▇▂▂▇▂▂▇▁▁▂▁▇▂▂▁▂▂▂▂▂▂█▂█▂▂█▂▂▂▂▂▂█▂▂█
loss,▇▆▆████▇▇▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▂▂▁▁▁▁▁
num_games,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
num_positions,▂▅▃▂▂▄▅▅▅▆▆▅▁▅▅▆▇██▅▃▅█▁▃▆▃▇▅▇▅▅▇▇█▄▇▇▇▃
policy_loss,████▇▇▇▆▅▅▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁
value_loss,█▃▇▆▆▆▅▅▄▄▄▃▃▃▃▃▂▂▂▃▂▂▁▁▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
best_win_rate,1.15
buffer_size,9052.0
iteration_time,72.10831
loss,0.88509
num_games,10.0
num_positions,96.0
policy_loss,0.79922
total_time_hours,0.83259
value_loss,0.08587
