# Zoning Game AlphaZero Demo

## Individually construct bits and pieces

In [1]:
from nsai_experiments.general_az_1p.game import Game
from nsai_experiments.general_az_1p.policy_value_net import PolicyValueNet
from nsai_experiments.general_az_1p.agent import Agent

from nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl import ZoningGameGame
from nsai_experiments.general_az_1p.zoning_game.zoning_game_az_impl import ZoningGamePolicyValueNet

### The `Game`

In [2]:
# Build the zoning game, check that it satisfies the generic `Game` interface,
# and reset it with a fixed seed (presumably so the printed board is repeatable).
mygame = ZoningGameGame()
assert isinstance(mygame, Game)
mygame.reset_wrapper(seed=47)

# `render()` returns an object we can `.read()` text from; the type-ignore is
# kept because the checker sees a union in which not every member has `.read()`.
rendered_board = mygame.render()
print(rendered_board.read())  # type: ignore[union-attr]

Tile grid:
[[0 0 0 5 1 0]
 [0 4 0 0 0 0]
 [0 3 0 3 2 4]
 [0 0 0 0 0 0]
 [2 0 0 0 0 0]
 [0 0 0 0 3 1]]
Tile queue (leftmost next): [1 4 2 1 5 2 3 3 2 3 1 1 1 4 2 2 1 5 5 2 1 5 3 2 5 1 0 0 0 0 0 0 0 0 0 0]
where 0 = EMPTY, 1 = RESIDENTIAL, 2 = COMMERCIAL, 3 = INDUSTRIAL, 4 = DOWNTOWN, 5 = PARK.
After 0 moves, current grid score is 3; terminated = False, truncated = False.



### The `PolicyValueNet`

In [3]:
import torch
from nsai_experiments.zoning_game.notebook_utils import get_zg_data
from nsai_experiments.zoning_game.zg_policy import create_policy_indiv_greedy

# Seed PyTorch's global RNG first: both the permutation and the random split
# below draw from it, so the order of these statements matters.
torch.manual_seed(47)

# Settings for the dataset built in this cell.
n_games = 20_000
savedir = "../../zoning_game/zg_data"
valid_frac = 0.15
test_frac = 0.15

# Obtain the (states, values, moves) tensors for the `create_policy_indiv_greedy`
# policy; in the recorded run they were loaded from disk under `savedir`.
states_tensor, values_tensor, moves_tensor = get_zg_data(
    create_policy_indiv_greedy,
    n_games = n_games,
    savedir = savedir,
)

# Apply one shared permutation to all three tensors so rows stay aligned.
# Note that the dataset stores them as (states, moves, values), which is a
# different order from the one `get_zg_data` returns them in.
indices = torch.randperm(len(values_tensor))
full_dataset_3 = torch.utils.data.TensorDataset(
    states_tensor[indices],
    moves_tensor[indices],
    values_tensor[indices],
)

# Validation and test sizes are truncated to whole numbers; the training split
# takes whatever is left so the three sizes always add up to the full length.
full_size_3 = len(full_dataset_3)
valid_size_3 = int(valid_frac * full_size_3)
test_size_3 = int(test_frac * full_size_3)
train_size_3 = full_size_3 - valid_size_3 - test_size_3
split_sizes_3 = [train_size_3, valid_size_3, test_size_3]

train_dataset_3, valid_dataset_3, test_dataset_3 = torch.utils.data.random_split(full_dataset_3, split_sizes_3)
print("Done loading, shuffling, splitting data")

Loading data from disk: ../../zoning_game/zg_data/create_policy_indiv_greedy__20000
Done loading, shuffling, splitting data


In [4]:
# Construct a fresh policy/value network and check that it satisfies the
# generic `PolicyValueNet` interface.
mynet = ZoningGamePolicyValueNet()
assert isinstance(mynet, PolicyValueNet)
# Supervised pre-training on the dataset built above is left disabled here;
# uncomment the next line to run it.
# mynet.train(train_dataset_3, needs_reshape=False)  # takes a little while
# Query the network on the current game observation. The recorded output is a
# (policy, value) pair: a 36-entry float32 array and a scalar. Because this is
# the last expression in the cell, the notebook displays it.
mynet.predict(mygame.obs)

(array([0.02878359, 0.02750564, 0.02725394, 0.02644118, 0.02703556,
        0.02739549, 0.02875161, 0.02773597, 0.02912001, 0.02731342,
        0.02837674, 0.02837252, 0.02701781, 0.02933569, 0.02749422,
        0.02898956, 0.02915286, 0.025719  , 0.02784868, 0.02766767,
        0.02893063, 0.02690739, 0.02791698, 0.0266863 , 0.02847183,
        0.0276081 , 0.02711248, 0.02722818, 0.02664321, 0.02836256,
        0.02602571, 0.02749578, 0.02793976, 0.02626782, 0.02930332,
        0.02978883], dtype=float32),
 0.02528020739555359)

### The `Agent` and `MCTS`

In [5]:
import logging

# Raise the root logger's threshold so that only warnings and above are shown
# (presumably to keep the agent's output short -- the original TODO does not
# say what remains to be done). `logging.WARNING` is the documented name of
# this level; `logging.WARN` is only a legacy alias with the same value.
logging.getLogger().setLevel(logging.WARNING)  # TODO

# Pair the game with the network in an agent, play one game, and report how
# many training examples that single game produced.
myagent = Agent(mygame, mynet)
train_examples = myagent.play_single_game()
print(len(train_examples))

26


In [6]:
# Run one play-and-train round. Going by the recorded output, this trains on
# the gathered examples for 10 epochs, compares the old and new networks over
# 10 games, and then either keeps the new network or reverts to the old one.
myagent.play_and_train()

Training on 2190 examples
Epoch 1/10, Train Loss: 1361.8778
Epoch 10/10, Train Loss: 498.0550
Old network average reward: 27.0
New network average reward: 25.4
New network won 4 out of 10 games (40.00%)
Reverting to the old network


In [7]:
# Repeat the play/train/evaluate round for 10 iterations. Per the recorded log,
# each iteration plays 100 games, trains, and evaluates on 10 games.
myagent.play_train_multiple(10)

Training iteration 1 of 10: will play 100 games, train, and evaluate on 10 games
Training on 4399 examples
Epoch 1/10, Train Loss: 1160.3925
Epoch 10/10, Train Loss: 380.0371
Old network average reward: 18.7
New network average reward: 26.4
New network won 6 out of 10 games (60.00%)
Keeping the new network
Training iteration 2 of 10: will play 100 games, train, and evaluate on 10 games
Training on 6572 examples
Epoch 1/10, Train Loss: 311.0075
Epoch 10/10, Train Loss: 25.6942
Old network average reward: 32.1
New network average reward: 30.9
New network won 3 out of 10 games (30.00%)
Reverting to the old network
Training iteration 3 of 10: will play 100 games, train, and evaluate on 10 games
Training on 8751 examples
Epoch 1/10, Train Loss: 281.8561
Epoch 10/10, Train Loss: 21.5699
Old network average reward: 34.3
New network average reward: 34.9
New network won 7 out of 10 games (70.00%)
Keeping the new network
Training iteration 4 of 10: will play 100 games, train, and evaluate on 10 