In [308]:
import copy, random
from collections import namedtuple, deque
from math import sqrt, log, exp, inf
from itertools import count

# ignore deprecation warnings ('safe' as long as we don't update packages)
from warnings import filterwarnings
filterwarnings("ignore")

# misc
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

# go env
import gym
from gym_go.gogame import turn, valid_moves, random_weighted_action
# from gym_go.envs.go_env import ,

# torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [322]:
# Global constants
UCB_C = 30
BOARD_SIZE = 5

'''
The state object that is returned by the reset and step functions of the environment is a
6 x BOARD_SIZE x BOARD_SIZE numpy array. All values in the array are either 0 or 1.
'''
BLACK = 0
WHITE = 1
TURN = 2
INVALID = 3
PASS = 4
DONE = 5

# Deep Q-Learning

In [310]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"CUDA is available, using device '{device}'" if torch.cuda.is_available() else f"CUDA is NOT available, using device '{device}'")

# aner ikke hva dette er, ignorer for nå
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

go_env = gym.make('gym_go:go-v0', size=7, komi=0, reward_method='heuristic')

CUDA is NOT available, using device 'cpu'


In [311]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [312]:
class DQN(nn.Module):

    def __init__(self, h, w, outputs):
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(6, 16, kernel_size=1)
        # self.conv1 = nn.Conv2d(6, 16, kernel_size=1, stride=2)
        # self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=1)
        # self.conv2 = nn.Conv2d(16, 32, kernel_size=1, stride=2)
        # self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=1)
        # self.conv3 = nn.Conv2d(32, 32, kernel_size=1, stride=1)
        # self.bn3 = nn.BatchNorm2d(32)

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        def conv2d_size_out(size, kernel_size = 1, stride = 1):
            return (size - (kernel_size - 1) - 1) // stride  + 1
        
        convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        
        linear_input_size = convw * convh * 32
        self.head = nn.Linear(linear_input_size, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = torch.from_numpy(x)
        x = x.to(device)
        x = x.float()
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        # x = F.relu(self.bn1(self.conv1(x)))
        # x = F.relu(self.bn2(self.conv2(x)))
        # x = F.relu(self.bn3(self.conv3(x)))
        # print(self.head(x.view(x.size(0), -1)))
        temp = x.view(x.size(0), -1)
        print(temp)
        return self.head(temp)
        # return self.head(x.view(x.size(0), -1))

In [313]:
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get number of actions from gym action space
n_actions = go_env.action_space.n

policy_net = DQN(BOARD_SIZE, BOARD_SIZE, n_actions).to(device)
target_net = DQN(BOARD_SIZE, BOARD_SIZE, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(20_000)

steps_done = 0

def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            # print(policy_net(state).max(1)[1])
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        # return torch.tensor([[np.random.choice(np.argwhere(valid_moves(state)).flatten())]], dtype=torch.long)
        return torch.tensor([[np.random.choice(np.argwhere(valid_moves(state)).flatten())]], device=device, dtype=torch.long)
        # return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


episode_durations = []


def plot_durations():
    plt.figure(2)
    plt.clf()
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())

In [314]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [315]:
num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = go_env.reset()
    for t in count():
        # Select and perform an action
        action = select_action(state)
        state, reward, done, _ = go_env.step(action.item())
        # reward = torch.tensor([reward])
        reward = torch.tensor([reward], device=device)

        if not done:
            next_state = state
        else:
            next_state = None

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break

        # Update the target network, copying all weights and biases in DQN
        if t % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

print('Complete')
go_env.close()
plt.ioff()
plt.show()

tensor([[0.1507, 0.1318, 0.1507,  ..., 0.1507, 0.1507, 0.1507],
        [0.0090, 0.0102, 0.0090,  ..., 0.0090, 0.0090, 0.0090],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0455, 0.0399, 0.0455,  ..., 0.0455, 0.0455, 0.0455]])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x49 and 288x50)

# Monte Carlo Tree Search

1. Selection
    - Taverse the tree to find greatest UCB-score
2. Expansion
    - If the selected leaf node has been visited before expand by adding weighted game action
3. Rollout
    - Simulate the game until end-condition from the expanded leaf
4. Back-propagation
    - Updating the value of each ancestor node of the expanded leaf


In [None]:
# TODO: GymGo.gogame.move/validmove/elns for å nulle ut actionspace
def get_legal_move(env):
    board_shape = env.state().shape[1:]
    pass_id = np.prod(board_shape)
    action = env.action_space.sample() # pick random action
    action2d = action // board_shape[0], action % board_shape[1], action
    while action2d[2] != pass_id and env.state()[INVALID, action2d[0], action2d[1]] == 1:
        action = env.action_space.sample() # pick random action
        action2d = action // board_shape[0], action % board_shape[1], action
    return action2d[2]

def equals(state_a, state_b):
    for i in range(0, 6):
        if not np.array_equal(state_a[i], state_b[i]):
            return False
    return True


In [None]:
class Node():
    def __init__(self, env, parent, action):
        self.env : gym.Env = env # This env will be altered by the other player
        self.value : int = 0 # Value estimate
        self.trials : int = 0 # Number of trials for this node
        self.parent : Node = parent # Parent node of this node
        self.children : list[Node] = [] # List of children of this node
        self.action : int = action # The step action made by this node
    
    # calculate a Upper Confidence Bound
    def ucb(self, total_trials):
        return self.value + ( UCB_C * sqrt(log(total_trials) / self.trials) )
    
    # Add a new node to a leaf node
    def expansion(self, move_weights):
        if self.env.done:
            return
        board_shape = self.env.state().shape[1:]
        pass_id = np.prod(board_shape)
        for action in range(0, pass_id + 1):
            action2d = action // board_shape[0], action % board_shape[1]
            if action == pass_id or self.env.state()[3, action2d[0], action2d[1]] == 0:
                child_env = copy.deepcopy(self.env)
                child_env.step(action)
                child = Node(child_env, self, action)
                child.value = move_weights[action]
                self.children.append(child)

    # Simulate game from current move until end-condition returning the score
    def rollout(self, move_selection_method):
        if self.env.done:
            return self.env.reward()
        
        rollout_env = copy.deepcopy(self.env)
        rollout_result = 0
        done = False
        while not done:
            random_action = move_selection_method(rollout_env)
            _, reward, done, _ = rollout_env.step(random_action)
            rollout_result += reward
        return rollout_result

In [343]:
class Monte_Carlo_Tree_Search():
    def __init__(self, size, ml_model):
        self.env : gym.Env = gym.make('gym_go:go-v0', size=size, reward_method='heuristic')
        self.env.reset()
        self.number_of_trials : int = 0
        self.root = Node(self.env, None, None)
        self.ml_model = ml_model
    
    def get_move_weights(self, state):
        # Do something to get this from ml_model
        move_weights = np.ones(BOARD_SIZE ** 2 + 1)
        
        board_shape = state.shape[1:]
        for i in range(len(move_weights) - 1):
            action2d = i // board_shape[0], i % board_shape[1], i
            if state[INVALID, action2d[0], action2d[1]] == 1:
                move_weights[i] = 0.0

        return move_weights

    def get_weighted_move(self, env : gym.Env):
        move_weights = self.get_move_weights(env.state())
        return random_weighted_action(move_weights)

    # Update scores of all parent nodes after rollout
    def back_propagation(self, rollout_node: Node, rollout_result):
        current_node = rollout_node
        while current_node != None:
            current_node.trials += 1
            if turn(self.env.state()) == BLACK:
                current_node.value -= rollout_result
            if turn(self.env.state()) == WHITE:                
                current_node.value += rollout_result
            current_node = current_node.parent
        self.number_of_trials += 1
    
    # find and return the leaf node with the highest UCB-score 
    def selection(self):
        selected_child = self.root
        current_node = self.root
        while len(current_node.children) > 0:
            selected_child = current_node.children[0]
            current_best_ucb = -inf

            for child in current_node.children:
                if child.trials == 0:
                    return child

                child_ucb = child.ucb(self.number_of_trials)

                if child_ucb > current_best_ucb:
                    selected_child = child
                    current_best_ucb = child_ucb

            current_node = selected_child

        return selected_child
        
    def run(self, iterations):
        selected_node = self.root
        selected_node.expansion(self.get_move_weights(selected_node.env.state()))
        selected_node = self.root.children[0]

        run = 0
        while run < iterations:
            selected_node = self.selection()
            # print("Run:", run, ": Selection", selected_node.action)

            if selected_node.env.done:
                self.back_propagation(selected_node, selected_node.env.reward())
                run += 1
                continue

            if selected_node.trials > 0:
                # print("Run:", run, ": Expansion:")
                selected_node.expansion(self.get_move_weights(selected_node.env.state()))
                selected_node = selected_node.children[0]

            rollout_result = selected_node.rollout(self.get_weighted_move)
            # print("Run:", run, ": Rollout", rollout_result)
            
            self.back_propagation(selected_node, rollout_result)
            run += 1
    
    def find_node_from_state(self, node: Node, state):
        if equals(node.env.state(), state):
            return node

        for child in node.children:
            if equals(child.env.state(), state):
                return child
            
            res = self.find_node_from_state(child, state)
            if res != None and equals(res.env.state(), state):
                return res

    def get_move_from_env(self, env):
        node = self.find_node_from_state(self.root, env.state())
        
        if node != None:
            print("Found node for state:")
            node.env.render()
            print("Finding best move from:", len(node.children), "available explored moves.")
            best_child = None
            current_best_value = -inf
            for child in node.children:
                if child.value > current_best_value:
                    best_child = child
                    current_best_value = child.value

            if best_child != None:
                return best_child
        
        return None

    def get_training_data_from_tree(self):
        training_data = []
        self.get_training_data_from_node(training_data, self.root)
        return training_data
        
    def get_training_data_from_node(self, training_data : list, current_node):
        node_tuple = (current_node.env.state(), [0] * (BOARD_SIZE ** 2))
        for child in current_node.children:
            node_tuple[child.action] = child.value
            self.get_training_data_from_node(training_data, child)
        training_data.append(node_tuple)




In [344]:
model = Monte_Carlo_Tree_Search(5, None)
model.run(100)

In [345]:
# Sjekker hvor utforska treet er for å teste UCB_C konstanten

def print_tried_children(current_node : Node):
    for child in current_node.children:
        if child.trials == 0:
            continue
        print("Child", child.action, ":")
        print("Number of children:", len(child.children))
        print("Value:", child.value, "Number of trials:", child.trials, "UCB:", child.ucb(model.number_of_trials))

current_node = model.root
while len(current_node.children) > 0:
    most_tried = current_node.children[0]
    for child in current_node.children:
        if child.trials == 0:
            continue
        print("Child", child.action, ":")
        child.env.render()
        print("Number of children:", len(child.children))
        print("Value:", child.value, "Number of trials:", child.trials, "UCB:", child.ucb(model.number_of_trials))
        print_tried_children(child)
        if child.trials > most_tried.trials:
            most_tried = child
        current_node = most_tried

Child 0 :
	0 1 2 3 4 
0	○═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Number of children: 25
Value: -267.0 Number of trials: 25 UCB: -254.1242038422639
Child 1 :
Number of children: 0
Value: 48.0 Number of trials: 1 UCB: 112.37898078868042
Child 2 :
Number of children: 0
Value: 58.0 Number of trials: 1 UCB: 122.37898078868042
Child 3 :
Number of children: 0
Value: 279.0 Number of trials: 1 UCB: 343.37898078868045
Child 4 :
Number of children: 0
Value: 16.0 Number of trials: 1 UCB: 80.37898078868042
Child 5 :
Number of children: 0
Value: -297.0 Number of trials: 1 UCB: -232.62101921131958
Child 6 :
Number of children: 0
Value: 76.0 Number of trials: 1 UCB: 140.37898078868042
Child 7 :
Number of children: 0
Value: 31.0 Number of trials: 1 UCB: 95.37898078868042
Child 8 :
Number of children: 0
Value: -121.0 Number of trials: 1 UCB: -56.62101921131958
Child 9 :
Number of children: 0
Value: 49

In [346]:
def play_game(advesary_function, model : Monte_Carlo_Tree_Search, go_env: gym.Env):
    go_env.reset()
    done = go_env.done
    turn_nr = 0
    while not done:
        action = advesary_function(go_env)
        _, _, done, _ = go_env.step(action)
        go_env.render('terminal')

        if done:
            continue

        node = model.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node != None:
            action = node.action  
        _, _, done, _ = go_env.step(action)
        go_env.render('terminal')
        turn_nr += 1
        if turn_nr > 300:
            break

    if node != None:
        model.back_propagation(node, go_env.reward())

def play_game_no_render(advesary_function, model : Monte_Carlo_Tree_Search, go_env: gym.Env):
    go_env.reset()
    done = go_env.done
    turn_nr = 0
    while not done:
        action = advesary_function(go_env)
        _, _, done, _ = go_env.step(action)

        if done:
            continue

        node = model.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node != None:
            action = node.action  
        _, _, done, _ = go_env.step(action)
        turn_nr += 1
        if turn_nr > 300:
            break

    if node != None:
        model.back_propagation(node, go_env.reward())
    
    return go_env

def play_model_vs_model(model1 : Monte_Carlo_Tree_Search, model2 : Monte_Carlo_Tree_Search, go_env: gym.Env):
    go_env.reset()
    done = go_env.done
    turn_nr = 0
    while not done:
        node1 = model1.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node1 != None:
            action = node1.action  
        _, _, done, _ = go_env.step(action)
        go_env.render('terminal')

        if done:
            continue

        node2 = model2.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node2 != None:
            action = node2.action  
        _, _, done, _ = go_env.step(action)
        go_env.render('terminal')
        turn_nr += 1
        if turn_nr > 300:
            break

    if node1 != None:
        model1.back_propagation(node1, go_env.reward())
    if node2 != None:
        model2.back_propagation(node2, go_env.reward())

def play_model_vs_model_no_render(model1 : Monte_Carlo_Tree_Search, model2 : Monte_Carlo_Tree_Search, go_env: gym.Env):
    go_env.reset()
    done = go_env.done
    turn_nr = 0
    while not done:
        node1 = model1.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node1 != None:
            action = node1.action  
        _, _, done, reward = go_env.step(action)

        if done:
            continue

        node2 = model2.get_move_from_env(go_env)
        action = get_legal_move(go_env)
        if node2 != None:
            action = node2.action  
        _, _, done, reward = go_env.step(action)
        turn_nr += 1
        if turn_nr > 300:
            break
    return go_env

In [347]:
for i in range(1,10):
    env = play_game_no_render(get_legal_move, model, copy.deepcopy(model.env))
    if env.done:
        print("Game finished within 300 turns:")
    else:
        print("Game stopped after 300 turns:")
        
    if env.reward() < 0:
        print("White won!")
    if env.reward() > 0:
        print("Black won!")
    if env.reward() == 0:
        print("It's a draw!")

Found node for state:
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─○─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Finding best move from: 25 available explored moves.
Game finished within 300 turns:
Black won!
Found node for state:
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─○
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Finding best move from: 0 available explored moves.
Game finished within 300 turns:
White won!
Found node for state:
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─○─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─┼─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White Area: 0

Finding best move from: 0 available explored moves.
Game finished within 300 turns:
Black won!
Found node for state:
	0 1 2 3 4 
0	╔═╤═╤═╤═╗
1	╟─┼─┼─┼─╢
2	╟─┼─┼─┼─╢
3	╟─┼─┼─○─╢
4	╚═╧═╧═╧═╝
	Turn: WHITE, Game State (ONGOING|PASSED|END): ONGOING
	Black Area: 25, White