# Zoning Game AlphaZero Demo

## Individually construct bits and pieces

In [1]:
# Runtime configuration is applied before the remaining project imports.
# NOTE(review): presumably these must run before numpy/torch-backed modules are
# loaded for the settings to take effect — confirm against `utils`.
from nsai_experiments.general_az_1p.utils import (
    disable_numpy_multithreading,
    use_deterministic_cuda,
)
disable_numpy_multithreading()
use_deterministic_cuda()

# Generic interfaces; the isinstance checks later in the notebook are made against these.
from nsai_experiments.general_az_1p.game import Game
from nsai_experiments.general_az_1p.policy_value_net import PolicyValueNet
from nsai_experiments.general_az_1p.agent import Agent

# Zoning-game implementations of the game and the network.
from nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl import (
    ZoningGameGame,
    ZoningGamePolicyValueNet,
)

### The `Game`

In [2]:
# Build the zoning-game environment and check that it is an instance of the
# generic `Game` class imported above.
mygame = ZoningGameGame()
assert isinstance(mygame, Game)
# Reset with a fixed seed (presumably so the starting board printed below is
# reproducible across runs — confirm against `reset_wrapper`).
mygame.reset_wrapper(seed=47)
# `render()` returns an object exposing `.read()`; its text is the board summary
# shown in the cell output. The ignore suggests the declared return type is a
# union (possibly Optional) — TODO confirm.
print(mygame.render().read())  # type: ignore[union-attr]

Tile grid:
[[0 0 0 5 1 0]
 [0 4 0 0 0 0]
 [0 3 0 3 2 4]
 [0 0 0 0 0 0]
 [2 0 0 0 0 0]
 [0 0 0 0 3 1]]
Tile queue (leftmost next): [1 4 2 1 5 2 3 3 2 3 1 1 1 4 2 2 1 5 5 2 1 5 3 2 5 1 0 0 0 0 0 0 0 0 0 0]
where 0 = EMPTY, 1 = RESIDENTIAL, 2 = COMMERCIAL, 3 = INDUSTRIAL, 4 = DOWNTOWN, 5 = PARK.
After 0 moves, current grid score is 3; terminated = False, truncated = False.



### The `PolicyValueNet`

In [3]:
import torch
from nsai_experiments.zoning_game.notebook_utils import get_zg_data
from nsai_experiments.zoning_game.zg_policy import create_policy_indiv_greedy

# Seed torch's global RNG at the top of the cell so that the permutation and the
# random split below are redrawn identically whenever this cell is re-run.
torch.manual_seed(47)
n_games = 20_000
savedir = "../../zoning_game/zg_data"
valid_frac = 0.15
test_frac = 0.15

# Fetch (states, values, moves) tensors for games played by the individual-greedy
# policy; in the recorded run they were loaded from a cache under `savedir`.
states_tensor, values_tensor, moves_tensor = get_zg_data(create_policy_indiv_greedy, n_games = n_games, savedir = savedir)

# The permutation below is sized from `values_tensor` alone. If the other tensors
# were longer, indexing would silently truncate them to the same length and
# TensorDataset's own size check would pass, so verify alignment explicitly.
assert len(states_tensor) == len(moves_tensor) == len(values_tensor), (
    "get_zg_data returned tensors of different lengths: "
    f"states={len(states_tensor)}, moves={len(moves_tensor)}, values={len(values_tensor)}"
)

# Apply one shared permutation so each state stays paired with its move and value.
indices = torch.randperm(len(values_tensor))
# Note the column order: the dataset yields (state, move, value), which differs
# from the (states, values, moves) order returned by `get_zg_data`.
full_dataset_3 = torch.utils.data.TensorDataset(states_tensor[indices], moves_tensor[indices], values_tensor[indices])

# Validation and test sizes are floored; the training split takes the remainder
# so the three sizes always sum to the full dataset length.
valid_size_3 = int(valid_frac * len(full_dataset_3))
test_size_3 = int(test_frac * len(full_dataset_3))
train_size_3 = len(full_dataset_3) - valid_size_3 - test_size_3
train_dataset_3, valid_dataset_3, test_dataset_3 = torch.utils.data.random_split(full_dataset_3, [train_size_3, valid_size_3, test_size_3])
print("Done loading, shuffling, splitting data")

Loading data from disk: ../../zoning_game/zg_data/create_policy_indiv_greedy__20000
Done loading, shuffling, splitting data


In [4]:
# Instantiate the zoning-game network with a fixed seed and check that it is an
# instance of the generic `PolicyValueNet` class imported above.
mynet = ZoningGamePolicyValueNet(random_seed=47)
assert isinstance(mynet, PolicyValueNet)
# Smoke-test a prediction on the current game observation. Left as the cell's
# last expression so the notebook displays the result: in the recorded run, a
# pair of a 36-element float32 array and a scalar float32 array
# (presumably the policy over actions and the value estimate — confirm).
mynet.predict(mygame.obs)

Neural network training will occur on device 'mps'


(array([0.02805594, 0.02738343, 0.02839885, 0.02788622, 0.0291001 ,
        0.02763609, 0.02681519, 0.02864029, 0.02788756, 0.02838092,
        0.02778699, 0.02877449, 0.02703443, 0.02906117, 0.02899825,
        0.02776492, 0.02859806, 0.02702451, 0.02705516, 0.02807434,
        0.02658405, 0.02678845, 0.02949526, 0.02644843, 0.02754996,
        0.02703502, 0.02698193, 0.02768094, 0.02770268, 0.02843505,
        0.02523457, 0.02692246, 0.02690465, 0.0284914 , 0.02925407,
        0.02813419], dtype=float32),
 array(0.02830549, dtype=float32))

### The `Agent` and `MCTS`

In [5]:
import logging

# Raise the root logger threshold to WARNING so lower-level messages are hidden.
# `logging.WARNING` is the documented level constant; `logging.WARN` is only an
# undocumented alias for the same numeric value.
logging.getLogger().setLevel(logging.WARNING)  # TODO
# Wire the game and the network into an agent, giving each listed component
# ("mcts", "train", "eval") its own explicit seed.
myagent = Agent(mygame, mynet, random_seeds={"mcts": 48, "train": 49, "eval": 50})
# Play one game and report how many training examples it produced.
train_examples = myagent.play_single_game()
print(len(train_examples))

RNG seeds are fully specified
26


In [6]:
# Run one play-and-train iteration. Per the recorded output this plays games,
# trains on the collected examples, evaluates the old network against the new
# one, and then reports whether the new network is kept or reverted.
myagent.play_and_train()

..games done in 4.81 seconds
Training on 2146 examples
Training with 3 batches of size 1024
Epoch 1/10, Train Loss: 14.3882
Epoch 10/10, Train Loss: 12.5672
..training done in 1.87 seconds
..evaluation done in 2.77 seconds
Old network+MCTS average reward: 0.31, min: -0.05, max: 0.77, stdev: 0.23
New network+MCTS average reward: 0.33, min: -0.01, max: 0.73, stdev: 0.20
Old bare network average reward: 0.33, min: 0.00, max: 0.76, stdev: 0.22
New bare network average reward: 0.31, min: 0.03, max: 0.75, stdev: 0.20
New network won 9 and tied 3 out of 20 games (52.50% wins where ties are half wins)
Reverting to the old network


Does the agent perform better if the network is first given some supervised pretraining on the greedy-policy dataset loaded above?

In [7]:
# Train the network directly on the training split built earlier in the notebook.
# With `needs_reshape=False` the recorded output reports that the reshape of
# `examples` is skipped. The trailing `;` suppresses display of the return value.
mynet.train(train_dataset_3, needs_reshape=False);  # takes a little while

Skipping reshape of `examples`.
Training with 296 batches of size 1024
Epoch 1/10, Train Loss: 204.2436
Epoch 10/10, Train Loss: 15.3870


In [8]:
# Repeat the play-and-train iteration after the supervised training in the
# previous cell, so its evaluation summary can be compared with the first run.
# NOTE(review): assumes `myagent` still references the `mynet` object that was
# just trained — confirm against `Agent`.
myagent.play_and_train()

..games done in 4.62 seconds
Training on 4328 examples
Training with 5 batches of size 1024
Epoch 1/10, Train Loss: 14.3741
Epoch 10/10, Train Loss: 11.5443
..training done in 1.21 seconds
..evaluation done in 2.76 seconds
Old network+MCTS average reward: 0.26, min: -0.19, max: 0.85, stdev: 0.23
New network+MCTS average reward: 0.30, min: -0.14, max: 0.88, stdev: 0.22
Old bare network average reward: 0.29, min: -0.24, max: 0.84, stdev: 0.24
New bare network average reward: 0.27, min: -0.27, max: 0.84, stdev: 0.26
New network won 14 and tied 1 out of 20 games (72.50% wins where ties are half wins)
Keeping the new network
