# Imports

In [33]:
# ignore deprecation warnings ('safe' as long as we don't update packages)
from warnings import filterwarnings
filterwarnings("ignore")

from math import sqrt, log, inf
import copy

import gym
from gym_go.gogame import turn, random_weighted_action, random_action

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
import numpy as np

# Constants

In [34]:
UCB_C = 30

# Board will be a BOARD_SIZE * BOARD_SIZE board (BOARD_SIZE**2)
BOARD_SIZE = 5
ACTIONSPACE_LENGHT = BOARD_SIZE ** 2 + 1
'''
The state object that is returned by the reset and step functions of the environment is a
6 x BOARD_SIZE x BOARD_SIZE numpy array. All values in the array are either 0 or 1.
'''

# 0 - Black pieces
# 1 - White pieces
# 2 - Turn (0 - black, 1 - white)
# 3 - Invalid moves (including ko-protection)
# 4 - Previous move was a pass
# 5 - Game over
BLACK, WHITE, INVALID = 0, 1, 3

# Monte Carlo Tree Search

1. Selection
    - Taverse the tree to find greatest UCB-score
2. Expansion
    - If the selected leaf node has been visited before expand by adding weighted game action
3. Rollout
    - Simulate the game until end-condition from the expanded leaf
4. Back-propagation
    - Updating the value of each ancestor node of the expanded leaf


In [35]:
class Node():
    def __init__(self, env, parent, action):
        self.env : gym.Env = env # This env will be altered by the other player
        self.value : int = 0 # Value estimate
        self.trials : int = 0 # Number of trials for this node
        self.parent : Node = parent # Parent node of this node
        self.children : list[Node] = [] # List of children of this node
        self.action : int = action # The step action made by this node
    
    # calculate a Upper Confidence Bound
    def ucb(self, total_trials):
        return self.value + ( UCB_C * sqrt(log(total_trials) / self.trials) )
    
    # Add a new node to a leaf node
    def expansion(self, move_weights):
        if self.env.done:
            return
        board_shape = self.env.state().shape[1:]
        pass_id = np.prod(board_shape)
        for action in range(0, pass_id + 1):
            action2d = action // board_shape[0], action % board_shape[1]
            if action == pass_id or self.env.state()[3, action2d[0], action2d[1]] == 0:
                child_env = copy.deepcopy(self.env)
                child_env.step(action)
                child = Node(child_env, self, action)
                child.value = move_weights[action]
                self.children.append(child)

    # Simulate game from current move until end-condition returning the score
    def rollout(self, move_selection_method):
        if self.env.done:
            return self.env.reward()
        
        rollout_env = copy.deepcopy(self.env)
        rollout_result = 0
        done = False
        while not done:
            random_action = move_selection_method(rollout_env)
            _, reward, done, _ = rollout_env.step(random_action)
            rollout_result += reward
        return rollout_result

In [55]:
class Monte_Carlo_Tree_Search():
    def __init__(self, size, ml_model):
        self.env : gym.Env = gym.make('gym_go:go-v0', size=size, reward_method='heuristic')
        self.env.reset()
        self.number_of_trials : int = 0
        self.root = Node(self.env, None, None)
        self.ml_model = ml_model
    
    # Gets the weights of all moves from the Machine Learning model
    def get_move_weights(self, state):
        # Do something to get this from ml_model
        # ml_model.move_weights(state)
        move_weights = np.ones(ACTIONSPACE_LENGHT)
        
        board_shape = state.shape[1:]
        for i in range(len(move_weights) - 1):
            action2d = i // board_shape[0], i % board_shape[1], i
            if state[INVALID, action2d[0], action2d[1]] == 1:
                move_weights[i] = 0.0

        return move_weights

    # Gets a weighted move for the given env
    def get_weighted_move(self, env : gym.Env):
        move_weights = self.get_move_weights(env.state())
        return random_weighted_action(move_weights)

    # Update scores of all parent nodes after rollout
    def back_propagation(self, rollout_node: Node, rollout_result):
        current_node = rollout_node
        while current_node != None:
            current_node.trials += 1
            if turn(self.env.state()) == BLACK:
                current_node.value -= rollout_result
            if turn(self.env.state()) == WHITE:                
                current_node.value += rollout_result
            current_node = current_node.parent
        self.number_of_trials += 1
    
    # Find and return the leaf node with the highest UCB-score 
    def selection(self):
        selected_child = self.root
        current_node = self.root
        while len(current_node.children) > 0:
            current_best_ucb = -inf
            for child in current_node.children:
                if child.trials == 0:
                    return child

                child_ucb = child.ucb(self.number_of_trials)

                if child.env.done and child_ucb > current_best_ucb:
                    # ups the number of trials so ucb is lowered since this game is fully explored
                    child.trials += 1

                if child_ucb > current_best_ucb:
                    selected_child = child
                    current_best_ucb = child_ucb

            current_node = selected_child

        return selected_child
    
    # Explores the tree for the given number of iterations
    def run(self, iterations):
        selected_node = self.root
        selected_node.expansion(self.get_move_weights(selected_node.env.state()))
        selected_node = self.root.children[0]

        run = 0
        while run < iterations:
            print("Run ", run)
            selected_node = self.selection()

            if selected_node.env.done:
                self.back_propagation(selected_node, 0)
                run += 1
                continue

            if selected_node.trials > 0:
                selected_node.expansion(self.get_move_weights(selected_node.env.state()))
                selected_node = selected_node.children[0]

            rollout_result = selected_node.rollout(self.get_weighted_move)
            
            self.back_propagation(selected_node, rollout_result)
            run += 1
    
    # searches the tree for a spesific state
    def find_node_from_state(self, state, node: Node = None):
        if node is None:
            node = self.root
        if np.array_equal(node.env.state(), state):
            return node

        for child in node.children:
            if np.array_equal(child.env.state(), state):
                return child
            
            res = self.find_node_from_state(state, child)
            if res != None and np.array_equal(res.env.state(), state):
                return res

    # Attempts to find the best move from the tree by searching for the state and finding the best child for that state
    def get_move_from_env(self, env, node: Node = None):
        if node is None:
            node = self.root
        node = self.find_node_from_state(env.state(), node)
        
        if node != None:
            # print("Found node for state:")
            node.env.render()
            # print("Finding best move from:", len(node.children), "available explored moves.")
            best_child = None
            current_best_value = -inf
            for child in node.children:
                if child.value > current_best_value:
                    best_child = child
                    current_best_value = child.value

            if best_child != None:
                return best_child
        
        return None

    # Makes a list of all (state, move_weights) for all expanded nodes in the tree
    def get_training_data_from_tree(self):
        training_data = []
        self.get_training_data_from_node(training_data, self.root)
        return training_data
    
    # recurrsive tree traversal method for get_training_data_from_tree
    def get_training_data_from_node(self, training_data : list, current_node : Node):
        if len(current_node.children) > 0:
            node_tuple = [current_node.env.state(), [0] * (BOARD_SIZE ** 2 + 1)]
            for child in current_node.children:
                node_tuple[1][child.action] = child.value
                self.get_training_data_from_node(training_data, child)
            training_data.append(node_tuple)
            self.get_training_data_from_node(training_data, child)

    def get_tree_data(self):
        x, y = [], []
        self.__get_node_data(self.root, x, y)
        return torch.tensor(x), torch.tensor(y)

    def __get_node_data(self, node, x, y):
        x.append(node.env.state())
        y.append([0] * (BOARD_SIZE**2 + 1))
        for child in node.children:
            y[-1][child.action] = child.value
            if len(child.children) > 0:
                self.__get_node_data(child, x, y)

def export_tree(mcts : Monte_Carlo_Tree_Search):
    pass

def import_tree(mcts : Monte_Carlo_Tree_Search):
    pass

In [37]:
def play_game_no_render(model : Monte_Carlo_Tree_Search, go_env: gym.Env):
    go_env.reset()
    done = go_env.done
    turn_nr = 0
    while not done:
        action = random_action(go_env.state())
        _, _, done, _ = go_env.step(action)

        if done:
            continue

        node = model.get_move_from_env(go_env)
        action = model.get_weighted_move(go_env)
        if node != None:
            action = node.action  
        _, _, done, _ = go_env.step(action)
        turn_nr += 1
        if turn_nr > 300:
            break

    if node != None:
        model.back_propagation(node, go_env.reward())
    
    return go_env

In [38]:
model = Monte_Carlo_Tree_Search(BOARD_SIZE, None)
model.run(100)

In [39]:
for i in range(1,10):
    env = play_game_no_render(model, copy.deepcopy(model.env))
    if env.done:
        print("Game finished within 300 turns:")
    else:
        print("Game stopped after 300 turns:")
        
    if env.reward() < 0:
        print("White won!")
    if env.reward() > 0:
        print("Black won!")
    if env.reward() == 0:
        print("It's a draw!")

	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─○
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Game finished within 300 turns:
White won!
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─○─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Game finished within 300 turns:
White won!
	0 1 2 3 4 
0	○═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Game finished within 300 turns:
Black won!
	0 1 2 3 4 
0	╔═╤═○═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Game finished within 300 turns:
Black won!
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═○
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Game finished within 300 turns:
B

# Convolutional Neural Network

In [40]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using {device}")

Using cuda:0


In [45]:
class CNN(nn.Module):

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(6, 32, 5, padding=2)
        self.pool = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(32, 64, 5, padding=2)
        self.fc1 = nn.Linear(64, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 26)


    def forward(self, x):
        x = x.float()
        x = x.to(device)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x,0, -1)
        x = x.t()
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def accuracy(self, x, y):
        return torch.mean(torch.eq(self.forward(x).argmax(1), y.argmax(1)).float())

In [42]:
# utility methods for importing and exporting models, defined outside class
def export_model(cnn, name="cnn"):
    torch.save(cnn.state_dict(), "cnns/" + name + ".pth")

def import_model(cnn, name="cnn"):
    cnn.load_state_dict(torch.load("cnns/" + name + ".pth"))

In [46]:
def train_model(model, x, y, lr, momentum):
    print(f"Training on {len(x)} datapoints")

    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    running_loss = .0

    for i in range(len(x)):
        inputs, labels = torch.tensor(x[i], device=device), torch.tensor(y[i], device=device)
        # inputs = torch.permute(inputs, (2, 1, 0))
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if i % 200 == 199:
            print(f'[{i + 1:8d}] loss: {running_loss / 200:.3f}')
            running_loss = 0.0

# MCTS and CNN combined

1. Create an MCTS
2. Generate a lot of data from the MCTS
3. Use the data to train a CNN
4. Use the CNN in a new MCTS to hopefully make the MCTS make better choices
5. Generate new data with the new MCTS
6. Train a new CNN... repeat ad infinitum

In [56]:
cnn = CNN()
mcts = Monte_Carlo_Tree_Search(BOARD_SIZE, cnn)
mcts.run(10_000)
x, y = mcts.get_tree_data()
# print(f"Model structure: {cnn}\n\n")

Run  0
Run  1
Run  2
Run  3
Run  4
Run  5
Run  6
Run  7
Run  8
Run  9
Run  10
Run  11
Run  12
Run  13
Run  14
Run  15
Run  16
Run  17
Run  18
Run  19
Run  20
Run  21
Run  22
Run  23
Run  24
Run  25
Run  26
Run  27
Run  28
Run  29
Run  30
Run  31
Run  32
Run  33
Run  34
Run  35
Run  36
Run  37
Run  38
Run  39
Run  40
Run  41
Run  42
Run  43
Run  44
Run  45
Run  46
Run  47
Run  48
Run  49
Run  50
Run  51
Run  52
Run  53
Run  54
Run  55
Run  56
Run  57
Run  58
Run  59
Run  60
Run  61
Run  62
Run  63
Run  64
Run  65
Run  66
Run  67
Run  68
Run  69
Run  70
Run  71
Run  72
Run  73
Run  74
Run  75
Run  76
Run  77
Run  78
Run  79
Run  80
Run  81
Run  82
Run  83
Run  84
Run  85
Run  86
Run  87
Run  88
Run  89
Run  90
Run  91
Run  92
Run  93
Run  94
Run  95
Run  96
Run  97
Run  98
Run  99
Run  100
Run  101
Run  102
Run  103
Run  104
Run  105
Run  106
Run  107
Run  108
Run  109
Run  110
Run  111
Run  112
Run  113
Run  114
Run  115
Run  116
Run  117
Run  118
Run  119
Run  120
Run  121
Run  122
Run

In [58]:
cnn = CNN()
train_model(cnn, x, y, .001, .9)
print(cnn.accuracy(x, y))
# go_env = gym.make()

Training on 167 datapoints


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x10688 and 64x120)