Reinforcement Learning for the Game of the Amazons


In [0]:
%%capture
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'
!pip install -q torch torchvision livelossplot tqdm pandas


In [0]:
from ast import literal_eval
from copy import copy
import csv
from datetime import datetime
from math import sqrt
from os import listdir
from os.path import getctime, join
from random import randint, getrandbits, random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm_notebook as tqdm

In [0]:
class Game:
    """Rules engine for the Game of the Amazons on a 10x10 board.

    ``self.board`` is a ``(3, 10, 10)`` uint8 array: plane 0 holds the white
    amazons, plane 1 the black amazons and plane 2 the arrows.
    ``self.isBlackTurn`` is True when black is to move; white moves first.
    """

    # Offsets of the eight neighbouring cells, which are also the eight
    # directions an amazon or an arrow may travel in.
    _NEIGHBOUR_OFFSETS = ((-1, -1), (-1, 0), (-1, 1), (0, -1),
                          (0, 1), (1, -1), (1, 0), (1, 1))

    def __init__(self):
        # Move validation no longer divides by zero, but the global numpy
        # error state is still set here so that code relying on this side
        # effect keeps behaving the same.
        np.seterr(divide='ignore', invalid='ignore')

        self.board = np.zeros((3, 10, 10), dtype=np.uint8)
        self.isBlackTurn = False
        self.__loadStartingBoard()

    def __loadStartingBoard(self):
        """Place the four white and four black amazons on their start cells."""
        for cell in ((0, 3), (3, 0), (6, 0), (9, 3)):   # White amazons
            self.board[0][cell] = 1

        for cell in ((0, 6), (3, 9), (6, 9), (9, 6)):   # Black amazons
            self.board[1][cell] = 1

    @staticmethod
    def __asCell(pos):
        """Return ``pos`` as a tuple of two plain Python ints."""
        return int(pos[0]), int(pos[1])

    def move(self, fromXY, toXY, shootAT):
        """Play one full turn for the side to move.

        The amazon on ``fromXY`` is moved to ``toXY`` and then shoots an arrow
        at ``shootAT``.  On success the turn passes to the other player.

        Raises:
            ValueError: if the current player has no amazon on ``fromXY`` or
                if either the move or the shot is not a legal queen-style
                move.  The board is left exactly as it was before the call.
        """
        fromXY = self.__asCell(fromXY)
        toXY = self.__asCell(toXY)
        shootAT = self.__asCell(shootAT)
        player = int(self.isBlackTurn)

        ownsAmazon = (self.__isInBoard(fromXY)
                      and self.board[player][fromXY] == 1)
        if ownsAmazon and self.__isValidMove(fromXY, toXY):
            self.board[player][fromXY] = 0
            self.board[player][toXY] = 1

            # The shot is validated with the amazon already on its new cell,
            # so the arrow may pass over (or land on) the vacated cell.
            if self.__isValidMove(toXY, shootAT):
                self.board[2][shootAT] = 1
                self.isBlackTurn = not self.isBlackTurn
                return

            # Illegal shot: undo the amazon move.
            self.board[player][toXY] = 0
            self.board[player][fromXY] = 1

        raise ValueError("That move is invalid.")

    def __isValidMove(self, position, destination):
        """Tell whether a piece may slide from ``position`` to ``destination``.

        The slide is legal when both cells are on the board, they differ, they
        lie on a common row, column or diagonal, and every cell after
        ``position`` up to and including ``destination`` is empty.
        """
        position = self.__asCell(position)
        destination = self.__asCell(destination)
        if not (self.__isInBoard(position) and self.__isInBoard(destination)):
            return False

        deltaRow = destination[0] - position[0]
        deltaCol = destination[1] - position[1]
        if deltaRow == 0 and deltaCol == 0:
            return False    # Goes nowhere
        if deltaRow != 0 and deltaCol != 0 and abs(deltaRow) != abs(deltaCol):
            return False    # Not on a queen line

        stepRow = (deltaRow > 0) - (deltaRow < 0)
        stepCol = (deltaCol > 0) - (deltaCol < 0)

        row, col = position
        for _ in range(max(abs(deltaRow), abs(deltaCol))):
            row += stepRow
            col += stepCol
            if self.board[:, row, col].any():
                return False
        return True

    def isGameFinished(self):
        """Return True when the side to move has no empty cell next to any
        of its amazons (and therefore cannot move)."""
        occupied = self.board.any(axis=0)

        # Check if there exists an empty cell neighbouring an amazon
        for amazon in np.argwhere(self.board[int(self.isBlackTurn)]):
            for dRow, dCol in self._NEIGHBOUR_OFFSETS:
                pos = int(amazon[0]) + dRow, int(amazon[1]) + dCol
                if self.__isInBoard(pos) and not occupied[pos]:
                    return False
        return True

    def __isInBoard(self, pos):
        """Return True when ``pos`` lies inside the 10x10 board."""
        return 0 <= pos[0] < 10 and 0 <= pos[1] < 10

    def calculateReward(self):
        """Count the empty cells reachable from the amazons of the player who
        is NOT to move (treated as the winner), stepping one neighbouring
        cell at a time."""
        winner = int(not self.isBlackTurn)

        rewardCells = np.zeros((10, 10), dtype=np.uint8)
        for amazon in np.argwhere(self.board[winner]):
            rewardCells = self.__rewardHelper(amazon, rewardCells)

        return np.count_nonzero(rewardCells)

    def __rewardHelper(self, start, cells):
        """Flood-fill ``cells`` with 1 for every empty cell connected to
        ``start`` through neighbouring empty cells, and return ``cells``.

        Implemented with an explicit stack instead of recursion so the depth
        of the fill is not bounded by the interpreter's recursion limit.
        """
        frontier = [self.__asCell(start)]
        while frontier:
            row, col = frontier.pop()
            for dRow, dCol in self._NEIGHBOUR_OFFSETS:
                pos = row + dRow, col + dCol
                if not self.__isInBoard(pos) or cells[pos] != 0:
                    continue
                if self.board[:, pos[0], pos[1]].any():
                    continue
                cells[pos] = 1
                frontier.append(pos)
        return cells

    def rollbackTo(self, board, turn):
        """Replace the position with ``board`` and the side to move with
        ``turn``.  ``board`` is stored by reference, not copied."""
        self.board = board
        self.isBlackTurn = turn

In [0]:
class Environment():
    """Wrapper around ``Game`` used by the tree search and the trainer.

    It exposes the position, legal-action masks, a checkpoint that the search
    can roll back to, and a string encoding of states (``toString`` /
    ``parseState``).
    """

    def __init__(self):
        self.game = Game()
        self.currentCheckpoint = None

    def isGameFinished(self):
        """Return whether the side to move is unable to move."""
        return self.game.isGameFinished()

    def getState(self):
        """Return the board as a tuple of its planes.

        The planes are views of the live board, not copies.
        """
        return tuple(self.game.board)

    def isBlackTurn(self):
        """Return True when black is to move."""
        return self.game.isBlackTurn

    def getReward(self):
        """Return the (constant) reward awarded for a finished game."""
        return 1

    def move(self, fromXY, toXY, shotXY):
        """Play a full turn on the underlying game."""
        return self.game.move(fromXY, toXY, shotXY)

    def saveCheckpoint(self):
        """Remember the current position and side to move.

        The board is copied: the game edits its board in place, so keeping a
        reference would let later moves silently alter the checkpoint.
        """
        self.currentCheckpoint = {
            "board": self.game.board.copy(),
            "turn": self.game.isBlackTurn
        }

    def loadCheckpoint(self):
        """Restore the position stored by the last ``saveCheckpoint``."""
        self.game.board = self.currentCheckpoint["board"].copy()
        self.game.isBlackTurn = self.currentCheckpoint["turn"]

    def getSelectionMask(self):
        """Return the 10x10 plane holding the amazons of the side to move."""
        return self.game.board[int(self.game.isBlackTurn)]

    def __occupiedCells(self):
        """Return the set of (row, col) cells holding an amazon or an arrow."""
        occupied = np.argwhere(self.game.board.any(axis=0))
        return {(int(row), int(col)) for row, col in occupied}

    def getMovementMask(self, moveFrom):
        """Return a 10x10 mask (1 = reachable) of the cells the amazon on
        ``moveFrom`` can slide to."""
        return self.__validityPoller(moveFrom, self.__occupiedCells())

    def __validityPoller(self, start, blockers):
        """Mark every cell reachable from ``start`` along the eight queen
        directions, stopping in each direction at the board edge or at the
        first cell contained in ``blockers``."""
        blocked = set(blockers)     # constant-time membership tests
        valid = np.zeros((10, 10))
        for dirX in (-1, 0, 1):
            for dirY in (-1, 0, 1):
                if dirX == 0 and dirY == 0:
                    continue    # Goes nowhere

                x, y = int(start[0]) + dirX, int(start[1]) + dirY
                while 0 <= x < 10 and 0 <= y < 10 and (x, y) not in blocked:
                    valid[x, y] = 1
                    x += dirX
                    y += dirY

        return valid

    def getShotMask(self, newAmazonPos, oldAmazonPos):
        """Return a 10x10 mask of the cells an arrow can reach from
        ``newAmazonPos``, treating ``oldAmazonPos`` as already vacated."""
        occupied = self.__occupiedCells()
        occupied.discard((int(oldAmazonPos[0]), int(oldAmazonPos[1])))
        return self.__validityPoller(newAmazonPos, occupied)

    def toString(self, state=None):
        """Encode a state as a string of digits.

        Every plane except the last contributes the row and column digit of
        each of its non-zero cells (in row-major order); the last plane is
        appended cell by cell.  When ``state`` is omitted the checkpointed
        board is encoded, or the live board if no checkpoint was saved yet.
        """
        if state is None:
            if self.currentCheckpoint is not None:
                state = self.currentCheckpoint["board"]
            else:
                state = self.game.board

        parts = []
        for plane in state[:-1]:
            for row, col in np.argwhere(plane):
                parts.append(str(row) + str(col))

        parts.extend(str(int(cell)) for cell in np.asarray(state[-1]).flat)

        return "".join(parts)

    def parseState(self, string):
        """Decode a string produced by ``toString``.

        Characters 0-15 are eight coordinate pairs: the first four fill the
        first returned array, the last four the second.  Characters 16-115
        are the arrow plane.  Optional characters 116-117 and 118-119 are the
        selected amazon and its destination, appended by the tree search.

        Returns:
            ``(ownAmazons, oppAmazons, arrows, selection, movement)`` where
            the first three are 10x10 uint8 arrays and the last two are
            (row, col) tuples or None.
        """
        # Exactly eight amazons: the ninth pair would be arrow cells.
        amazons = [(int(string[i]), int(string[i + 1]))
                   for i in range(0, 16, 2)]

        ownAmazons = np.zeros((10, 10), dtype=np.uint8)
        for amazon in amazons[:4]:
            ownAmazons[amazon] = 1

        oppAmazons = np.zeros((10, 10), dtype=np.uint8)
        for amazon in amazons[4:]:
            oppAmazons[amazon] = 1

        arrows = np.array([int(char) for char in string[16:116]],
                          dtype=np.uint8).reshape(10, 10)

        selection, movement = None, None

        if len(string) > 116:
            selection = (int(string[116]), int(string[117]))

        if len(string) > 118:
            movement = (int(string[118]), int(string[119]))

        return ownAmazons, oppAmazons, arrows, selection, movement

In [29]:
class NeuralNet(nn.Module):
    """Convolutional network with a policy head and a value head.

    ``forward`` expects a single position shaped ``(1, in_channels, 10, 10)``
    and returns a ``(10, 10)`` policy with entries in [0, 1] (sigmoid) and a
    one-element value tensor in [-1, 1] (tanh).
    """

    # Number of times the residual block is applied in ``forward``.  A single
    # block is built, so every application shares the same weights.
    NUMBER_OF_RESIDUAL_LAYERS = 40

    # Folder that ``save``, ``load`` and ``loadMostRecent`` work in.
    MODEL_DIRECTORY = "/content/gdrive/My Drive/AmazonsData/Models/"

    def __init__(self, in_channels=3):
        super(NeuralNet, self).__init__()
        self.inputLayer = self.__inputLayer(in_channels)
        self.residualBlock = self.__residualBlock()
        self.policyHead = self.__policyHead()
        self.valueHead = self.__valueHead()

    @staticmethod
    def __convBlock(in_channels, out_channels, kernel_size):
        """Return a bias-free, size-preserving convolution followed by its
        batch normalisation."""
        return [nn.Conv2d(in_channels, out_channels=out_channels,
                          kernel_size=kernel_size, padding=kernel_size // 2,
                          bias=False),
                nn.BatchNorm2d(out_channels)]

    def __inputLayer(self, in_channels):
        """Lift the input planes to 300 feature channels."""
        return nn.ModuleList(self.__convBlock(in_channels, 300, 3)
                             + [nn.ReLU()])

    def __residualBlock(self):
        """Two 3x3 convolutions; the skip connection and final ReLU are
        applied in ``forward``."""
        return nn.ModuleList(self.__convBlock(300, 300, 3)
                             + [nn.ReLU()]
                             + self.__convBlock(300, 300, 3))

    def __policyHead(self):
        """Reduce to one channel and squash every cell into [0, 1]."""
        return nn.ModuleList(self.__convBlock(300, 1, 1)
                             + [nn.ReLU(),
                                nn.Linear(10, 10),  # acts on the last axis
                                nn.Sigmoid()])

    def __valueHead(self):
        """Reduce to one channel, max-pool the 10x10 map to a single number
        and squash it into [-1, 1]."""
        return nn.ModuleList(self.__convBlock(300, 1, 1)
                             + [nn.ReLU(),
                                nn.Linear(10, 10),
                                nn.ReLU(),
                                nn.Linear(10, 10),
                                nn.MaxPool2d(10),
                                nn.Tanh()])

    def forward(self, networkInput):
        """Evaluate one position and return ``(policy, value)``.

        The policy is reshaped with ``view(10, 10)``, so the input batch must
        contain exactly one position.
        """
        for layer in self.inputLayer:
            networkInput = layer(networkInput)

        for _ in range(self.NUMBER_OF_RESIDUAL_LAYERS):
            original = networkInput     # For skip connection
            for layer in self.residualBlock:
                networkInput = layer(networkInput)
            networkInput = torch.relu(networkInput + original)

        policy = networkInput
        value = networkInput
        for layer in self.policyHead:
            policy = layer(policy)

        for layer in self.valueHead:
            value = layer(value)

        return policy.view(10, 10), value

    def save(self, name):
        """Write the network weights to ``name`` inside the models folder."""
        torch.save(self.state_dict(), self.MODEL_DIRECTORY + name)

    def __loadPath(self, path):
        """Load weights from ``path`` and switch the network to eval mode."""
        # Map the stored tensors onto the device this network lives on, so
        # weights saved on a GPU runtime also load on a CPU-only runtime.
        device = next(self.parameters()).device
        self.load_state_dict(torch.load(path, map_location=device))
        self.eval()

    def load(self, name):
        """Load the weights stored as ``name`` inside the models folder."""
        self.__loadPath(self.MODEL_DIRECTORY + name)

    def loadMostRecent(self, typeOfNet):
        """Load the most recently created file in the models folder whose
        name contains ``typeOfNet``.

        When the folder is missing or holds no matching file a message is
        printed and the network keeps its current weights.
        """
        directory = self.MODEL_DIRECTORY
        try:
            allPaths = [join(directory, name)
                        for name in listdir(directory) if typeOfNet in name]
        except FileNotFoundError:
            allPaths = []   # No models folder yet is the same as no models

        if len(allPaths) < 1:
            print("There are no saved models in the models folder. Starting fresh..")
        else:
            self.__loadPath(max(allPaths, key=getctime))


# Smoke test: build a network with the default three input channels and try
# to restore the most recently created saved model whose file name
# contains "a".
if __name__ == "__main__":
    net = NeuralNet()
    net.loadMostRecent("a")


There are no saved models in the models folder. Starting fresh..


In [0]:
class MCTS():
    """Monte Carlo tree search over the three decisions of one turn.

    A turn is split into three nodes: choosing an amazon (``nets[0]``),
    choosing where it moves (``nets[1]``) and choosing where it shoots
    (``nets[2]``).  Nodes are keyed by the state string of the environment,
    extended by two digits per decision already taken.

    Attributes:
        qValues: mean backed-up value of every visited ``(node, action)``.
        policies: masked network policy (10x10 tensor) of every expanded node.
        edgeVisitQuantity: visit count per ``(node, action)``.
        nodeVisitQuantity: visit count per node.
        valids: legal actions per node - a list of ``(row, col)`` tuples for
            selection and shot nodes, a 10x10 numpy mask for movement nodes.
    """

    def __init__(self, env, nets, device):
        self.env = env
        self.nets = nets
        self.device = device

        self.qValues = {}
        self.policies = {}
        self.edgeVisitQuantity = {}
        self.nodeVisitQuantity = {}

        self.valids = {}

    @staticmethod
    def __childKey(node, action):
        """Return the key of the node reached from ``node`` by ``action``."""
        return node + str(action[0]) + str(action[1])

    @staticmethod
    def __cells(mask):
        """Return the non-zero cells of ``mask`` as (row, col) int tuples in
        row-major order."""
        return [tuple(cell) for cell in np.argwhere(np.asarray(mask)).tolist()]

    def __evaluate(self, netIndex, planes, mask):
        """Run ``nets[netIndex]`` on ``planes`` and return the policy with
        illegal cells zeroed out, plus the value as a float.

        Gradients are not needed during search; disabling them keeps the
        cached policies from holding on to an autograd graph.
        """
        networkInput = torch.tensor(
            np.stack(planes), dtype=torch.float, device=self.device).unsqueeze(0)
        with torch.no_grad():
            policy, value = self.nets[netIndex](networkInput)

        maskTensor = torch.tensor(
            np.asarray(mask), dtype=policy.dtype, device=self.device)
        return policy * maskTensor, float(value)

    def __edgeScore(self, node, action):
        """Score used to pick an action: ``Q + P * sqrt(N) / (n + 1)`` for a
        visited edge and ``P * sqrt(N)`` for an unvisited one."""
        edge = (node, action)
        exploration = self.policies[node][action] * \
            sqrt(self.nodeVisitQuantity.get(node, 0))

        if edge in self.qValues:
            return self.qValues[edge] + \
                exploration / (self.edgeVisitQuantity.get(edge, 0) + 1)
        return exploration

    def __bestByScore(self, node, actions):
        """Return the first action with the highest score, or None when
        ``actions`` is empty."""
        best, bestScore = None, float("-inf")
        for action in actions:
            score = self.__edgeScore(node, action)
            # `best is None` makes sure an action is still returned when the
            # scores are not comparable (e.g. NaN network output).
            if best is None or score > bestScore:
                best, bestScore = action, score
        return best

    def __recordVisit(self, edge, value):
        """Fold ``value`` into the running mean of ``edge`` and count the
        visit on both the edge and its node."""
        visits = self.edgeVisitQuantity.get(edge, 0)
        if edge in self.qValues:
            # New mean = (old mean * old count + new sample) / new count.
            self.qValues[edge] = (
                self.qValues[edge] * visits + value) / (visits + 1)
        else:
            self.qValues[edge] = value

        self.edgeVisitQuantity[edge] = visits + 1
        self.nodeVisitQuantity[edge[0]] = \
            self.nodeVisitQuantity.get(edge[0], 0) + 1

    def search(self):
        """Run one simulation from the environment's current position.

        The tree is descended (playing the chosen turns on ``self.env``)
        until the game is over or an unexpanded shot node is reached; the
        statistics along the path are then updated.  The caller is expected
        to restore the environment afterwards.

        Returns:
            Three values (one per decision of a turn), negated so that they
            are expressed from the point of view of the previous player.
        """
        if self.env.isGameFinished():
            # Returns 3-tuple reward
            return (self.env.getReward(),) * 3

        values = [0, 0, 0]

        state = self.env.getState()
        stateString = self.env.toString(state)

        # --- Decision 1: which amazon to move -------------------------------
        if stateString not in self.policies:
            selectionMask = self.env.getSelectionMask()
            self.policies[stateString], values[0] = self.__evaluate(
                0, state, selectionMask)
            self.valids[stateString] = self.__cells(selectionMask)

        # Amazons without any reachable cell are dropped from the node.
        movable = []
        for selection in self.valids[stateString]:
            selectionString = self.__childKey(stateString, selection)
            if selectionString not in self.valids:
                self.valids[selectionString] = self.env.getMovementMask(
                    selection)
            if np.any(self.valids[selectionString]):
                movable.append(selection)
        self.valids[stateString] = movable

        bestSelection = self.__bestByScore(stateString, movable)
        if bestSelection is None:
            raise RuntimeError(
                "No amazon can move although the game is not finished.")

        bestSelectionStr = self.__childKey(stateString, bestSelection)
        selectionArr = np.zeros((10, 10), dtype=np.uint8)
        selectionArr[bestSelection] = 1
        bestSelectionArr = state + (selectionArr,)

        # --- Decision 2: where to move it -----------------------------------
        if bestSelectionStr not in self.policies:
            self.policies[bestSelectionStr], values[1] = self.__evaluate(
                1, bestSelectionArr, self.valids[bestSelectionStr])

        bestMove = self.__bestByScore(
            bestSelectionStr, self.__cells(self.valids[bestSelectionStr]))
        bestMoveString = self.__childKey(bestSelectionStr, bestMove)

        # --- Decision 3: where to shoot -------------------------------------
        # New leaf node
        if bestMoveString not in self.policies:
            # np.stack copies, so the live board is not modified here.
            shotPlanes = np.stack(bestSelectionArr)
            # NOTE(review): the amazon is always relocated on plane 0, even
            # when the moving side's amazons live on plane 1.  Agent.train
            # builds its shot-stage input the same way, so both have to be
            # changed together - confirm the intended encoding.
            for plane in (0, 3):
                shotPlanes[plane][bestSelection] = 0
                shotPlanes[plane][bestMove] = 1

            shotMask = self.env.getShotMask(bestMove, bestSelection)
            self.policies[bestMoveString], values[2] = self.__evaluate(
                2, shotPlanes, shotMask)
            self.valids[bestMoveString] = self.__cells(shotMask)
            return [-value for value in values]

        bestShot = self.__bestByScore(
            bestMoveString, self.valids[bestMoveString])

        self.env.move(bestSelection, bestMove, bestShot)

        values = self.search()
        pairs = ((stateString, bestSelection),
                 (bestSelectionStr, bestMove),
                 (bestMoveString, bestShot))

        for pair, value in zip(pairs, values):
            self.__recordVisit(pair, value)

        return [-value for value in values]

    def getRandomMove(self):
        """Sample a full turn in proportion to the visit counts.

        Returns:
            ``((selection, moveTo, shootAt), records)`` where ``records``
            holds one ``[state string, visit distribution, is black's turn]``
            list per decision.
        """
        selectionState = self.env.toString()
        selection, selPolicy = self.__weightedRandomAction(selectionState)

        movementState = self.__childKey(selectionState, selection)
        moveTo, movePolicy = self.__weightedRandomAction(movementState)

        shootAtState = self.__childKey(movementState, moveTo)
        shootAt, shotPolicy = self.__weightedRandomAction(shootAtState)

        isBlackTurn = self.env.isBlackTurn()
        return (selection, moveTo, shootAt), [
            [selectionState, selPolicy, isBlackTurn],
            [movementState, movePolicy, isBlackTurn],
            [shootAtState, shotPolicy, isBlackTurn]]

    def __visitsFrom(self, state):
        """Return ``{action: visit count}`` for the edges leaving ``state``."""
        return {key[1]: count
                for key, count in self.edgeVisitQuantity.items()
                if key[0] == state}

    def __weightedRandomAction(self, state):
        """Draw an action of ``state`` with probability proportional to its
        visit count; return it with the whole distribution.

        Raises:
            ValueError: if no edge leaving ``state`` has been visited.
        """
        visits = self.__visitsFrom(state)
        if not visits:
            raise ValueError(
                f"No searched actions are recorded for state {state!r}.")

        total = sum(visits.values())
        policy = {action: count / total for action, count in visits.items()}

        randomChoice, cumulative, chosen = random(), 0, None
        for action, probability in policy.items():
            # Assigning before the test makes the last action the fallback
            # when rounding leaves the cumulative sum just below the draw.
            chosen = action
            cumulative += probability
            if randomChoice <= cumulative:
                break

        return chosen, policy

    def getBestMove(self):
        """Return the most visited ``(selection, moveTo, shootAt)`` turn."""
        selectionState = self.env.toString()
        selection = self.__bestAction(selectionState)

        movementState = self.__childKey(selectionState, selection)
        moveTo = self.__bestAction(movementState)

        shootAtState = self.__childKey(movementState, moveTo)
        shootAt = self.__bestAction(shootAtState)

        return selection, moveTo, shootAt

    def __bestAction(self, state):
        """Return the most visited action of ``state`` (first one on ties)."""
        visits = self.__visitsFrom(state)
        return max(visits, key=visits.get)


In [0]:
class Agent():
    """Self-play trainer for the three networks (selection, movement, shot).

    ``CURRENT_BEST_NNET`` is the base file name (ending in ``.pth``) of the
    best networks found so far, or None to start from the most recent files.
    The three networks are stored under that name with ``a``, ``b`` and ``c``
    inserted before the extension.
    """

    # Replay buffer of recorded decisions: one ``state|policy|value`` row each.
    ACTIONS_PATH = "/content/gdrive/My Drive/AmazonsData/actions.csv"
    # Only this many of the newest rows are kept in the replay buffer.
    MAX_STORED_ACTIONS = 5000000

    def __init__(self, currentBestNNet=None):
        self.CURRENT_BEST_NNET = currentBestNNet
        self.device = (torch.device("cuda") if torch.cuda.is_available()
                       else torch.device("cpu"))

    def train(self, loops=1, games=0, searchesPerMove=25, numberOfSamples=2048):
        """Alternate self-play and weight updates, then evaluate once.

        Args:
            loops: number of self-play + weight-update rounds.
            games: self-play games per round.
            searchesPerMove: tree-search simulations run before each turn.
            numberOfSamples: recorded decisions trained on per round (fewer
                if the replay buffer is smaller).

        After the last round the trained networks play the current best
        ones; they are saved as the new best when they win at least 55% of
        the evaluation games.
        """
        nnets, optimisers = self.__loadNNets(self.CURRENT_BEST_NNET, True)
        for loop in range(loops):
            print("Self-play phase:")
            for game in tqdm(range(games)):
                self.__selfPlay(nnets, searchesPerMove)

            print("Weight updating phase")
            self.__updateWeights(nnets, optimisers, numberOfSamples)

        wins, losses = self.__compareToCurrentBest(nnets)
        played = wins + losses
        winRate = wins / played if played else 0.0
        print(
            f"Evaluation results: {wins}W and {losses}L --> {100*winRate}%")

        if winRate >= 0.55:
            name = str(datetime.now()) + ".pth"
            print(f"New best network is {name}")
            self.CURRENT_BEST_NNET = name
            self.__saveNNets(nnets, name)

    def __selfPlay(self, nnets, searchesPerMove):
        """Play one game against itself and append every decision, labelled
        with the final outcome, to the replay buffer."""
        env = Environment()
        actionsTaken = []
        while not env.isGameFinished():
            mcts = MCTS(env, nnets, self.device)
            env.saveCheckpoint()
            for search in range(searchesPerMove):
                mcts.search()
                env.loadCheckpoint()    # search plays moves on env

            nextMove, actions = mcts.getRandomMove()
            env.move(*nextMove)
            actionsTaken += actions

        reward = env.getReward()
        # The side to move in the final position is the one that is stuck.
        isBlackWinner = not env.isBlackTurn()

        with open(self.ACTIONS_PATH, "a", newline="") as file:
            writer = csv.writer(file, delimiter="|")
            for state, policy, wasBlackTurn in actionsTaken:
                # The value target is from the point of view of the player
                # who was to move: +reward if that player went on to win.
                # This matches how MCTS.search negates network values when
                # backing them up.  Rows written before this sign was
                # corrected carry the opposite label.
                value = reward if wasBlackTurn == isBlackWinner else -reward
                writer.writerow([state, policy, value])

    def __updateWeights(self, nnets, optimisers, numberOfSamples):
        """Trim the replay buffer and take one optimiser step for each of
        ``numberOfSamples`` randomly drawn, distinct decisions."""
        try:
            # The file has no header row; the state strings are long digit
            # sequences with leading zeros, so they must stay text.
            actions = pd.read_csv(self.ACTIONS_PATH, delimiter="|",
                                  header=None,
                                  names=["state", "policy", "value"],
                                  dtype={"state": str, "policy": str})
        except (FileNotFoundError, pd.errors.EmptyDataError):
            print("No recorded actions to train on.")
            return

        if len(actions.index) > self.MAX_STORED_ACTIONS:
            actions = actions.tail(self.MAX_STORED_ACTIONS)

        actions.to_csv(self.ACTIONS_PATH, sep="|", index=False, header=False)

        sampleCount = min(numberOfSamples, len(actions.index))
        if sampleCount == 0:
            return

        # NOTE(review): networks restored from disk are left in eval mode by
        # NeuralNet, freshly initialised ones are in train mode; neither is
        # changed here - confirm which batch-norm behaviour is wanted.
        env = Environment()
        samples = actions.sample(n=sampleCount)     # without replacement
        for state, policy, value in tqdm(
                samples.itertuples(index=False, name=None), total=sampleCount):
            policy = literal_eval(policy)
            own, opp, arr, sel, mov = env.parseState(state)

            if sel is None:
                planes = (own, opp, arr)
                nnetIndex = 0
            else:
                active = np.zeros((10, 10), dtype=np.uint8)
                if mov is None:
                    active[sel] = 1
                    nnetIndex = 1
                else:
                    active[mov] = 1
                    # NOTE(review): the amazon is always relocated on the
                    # first plane, mirroring MCTS.search - confirm this is
                    # intended for positions where the second plane moves.
                    own[sel] = 0
                    own[mov] = 1
                    nnetIndex = 2
                planes = (own, opp, arr, active)

            networkInput = torch.tensor(
                np.stack(planes), dtype=torch.float,
                device=self.device).unsqueeze(0)

            optimisers[nnetIndex].zero_grad()
            predictedPolicy, predictedValue = nnets[nnetIndex](networkInput)

            policyT = torch.zeros((10, 10),
                                  dtype=torch.float, device=self.device)
            for action, probability in policy.items():
                policyT[action] = probability

            valueT = torch.tensor(float(value),
                                  dtype=torch.float, device=self.device)

            # The policy head ends in a per-cell sigmoid, so every cell is
            # scored with binary cross entropy against its visit fraction.
            xEntropy = nn.functional.binary_cross_entropy(
                predictedPolicy, policyT)
            squareErr = ((predictedValue - valueT) ** 2).mean()

            loss = xEntropy + squareErr
            loss.backward()
            optimisers[nnetIndex].step()

    @staticmethod
    def __netFileName(name, suffix):
        """Insert ``suffix`` before the ``.pth`` extension of ``name``."""
        stem = name[:-len(".pth")] if name.endswith(".pth") else name
        return f"{stem}{suffix}.pth"

    def __loadNNets(self, name, includeOptimisers=False):
        """Build the three networks and load their weights.

        With ``name`` given, the files ``<name>a.pth``, ``<name>b.pth`` and
        ``<name>c.pth`` are loaded; otherwise the most recent file of each
        kind is used.  Returns the networks, plus one Adam optimiser per
        network when ``includeOptimisers`` is True.
        """
        nnets = (NeuralNet(in_channels=3).to(self.device),
                 NeuralNet(in_channels=4).to(self.device),
                 NeuralNet(in_channels=4).to(self.device))

        for net, suffix in zip(nnets, "abc"):
            if name is not None:
                net.load(self.__netFileName(name, suffix))
            else:
                net.loadMostRecent(f"{suffix}.pth")

        if includeOptimisers:
            optimisers = tuple(torch.optim.Adam(
                N.parameters(), lr=0.0001) for N in nnets)
            return nnets, optimisers
        else:
            return nnets

    def __compareToCurrentBest(self, trainedNets, numberOfGames=10, searchesPerMove=25):
        """Play ``numberOfGames`` games between the trained networks and the
        current best ones, with colours drawn at random, and return the
        trained networks' ``(wins, losses)``."""
        print("Evaluating network")
        previousNets = self.__loadNNets(self.CURRENT_BEST_NNET)
        wins, losses = 0, 0

        for game in tqdm(range(numberOfGames)):
            isTrainedBlack = bool(getrandbits(1))
            env = Environment()

            while not env.isGameFinished():
                if env.isBlackTurn() == isTrainedBlack:
                    mcts = MCTS(env, trainedNets, self.device)
                else:
                    mcts = MCTS(env, previousNets, self.device)

                env.saveCheckpoint()
                for search in range(searchesPerMove):
                    mcts.search()
                    env.loadCheckpoint()

                nextMove = mcts.getBestMove()
                env.move(*nextMove)

            # The side to move in the final position has lost.
            isBlackWinner = not env.isBlackTurn()
            if isBlackWinner == isTrainedBlack:
                wins += 1
            else:
                losses += 1

        return wins, losses

    def __saveNNets(self, nnets, name):
        """Save the three networks as ``<name>a.pth``, ``<name>b.pth`` and
        ``<name>c.pth``."""
        for net, suffix in zip(nnets, "abc"):
            net.save(self.__netFileName(name, suffix))

Training Loop


In [0]:
# Run five rounds of self-play and weight updates with ten self-play games per
# round, leaving the search and sample counts of Agent.train at their
# defaults.
Agent().train(loops=5, games=10)

There are no saved models in the models folder. Starting fresh..
There are no saved models in the models folder. Starting fresh..
There are no saved models in the models folder. Starting fresh..
Self-play phase:


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))