In [1]:
import torch 
from tictactoe import *
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import copy 
from pprint import pprint
from collections import deque
from time import time 
from multiprocessing import Pool 

import random 
import torch.optim as optim
from matplotlib import pyplot as plt

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


# Variants of Tic-Tac-Toe (https://en.wikipedia.org/wiki/Tic-tac-toe_variants)

$m,n,k$ game = play on $m$ by $n$ board to try to get $k$ in a row.

We have that TicTacToe extends Game, meaning it must have a checkGameOver function. We modify it so that in the initializer it takes in an additional parameter, 'gameOverChecker', a function that takes a board and logger and returns $0$ if nobody won, $1$ if Player one won, or $2$ if Player $2$ won. 

This allows us to easily implement any variants of the standard 3x3 game that only modify that winning conditions. For example, Misere Tic-tac-toe, or 'inverse' Tic-Tac-Toe, is the game where Player 1 wins if Player 2 gets 3 in a row (we make wrappers for each of the game versions).

Now we would like to support variants of the game where instead of alternating turns, there is some different rule for check whose turn it is. For this, we will add another parameter to TicTacToe constructor, turnChooser. This is passed into the Game superclass. Note that $turnChooser$ is a function that takes in the current player and gives the next player. This lets us support $random$ $turn$ $tic$-$tac$-$toe$.

Now we would like to support larger boards. To do this, we pass in an additional parameter $dimension$. This then creates a board of size $dimension$ by $dimension$. Although this allows us to define boards of arbitrary size, we create $FourByFourTicTacToe$ and $FiveByFiveTicTacToe$. The most logical win condition for an $n$ by $n$ board is $n$ in a row. For this reason we modify the $checkGameOver$ function to take in $n$ - the number of pieces in a row required for a win. Note that other variants are possible - for example, getting a diamond in $4$ by $4$ could also be considered a win. We can then create an $n$ in a row win condition function by using partial functions with $checkGameOver$.

Next, we would like to support games in 3 dimensions. We do this by adding yet another paremeter to the initialization - $threeDims$. In order to make it easier to handle 2D vs 3D games, we will always assume the board is 3D - n by n by n, but for 2D we can just get the n by n board by calling board[0].

We would like easily define combinations of these variants to create custom games. In order to do this, we define a TicTacToeConfig class. Now, TicTacToe simply takes a TicTacToeConfig. This config has all of the default values so we can set any combinations of the ones we want. 


## Visualizing the Board

Looking at the board in the command line is annoying, and we would like some way of seeing what the algorithm is actually doing in a way that is easier to interpret. We will use PyGame to do this. First, we create a `display` method in TicTacToe.

The correct way to do this would be to have some event that is triggered when we make a move to update the display. Due to lazyness, we will just spawn a different thread. This then renders the grid 60 times per second and colors it according to the current board state. 

http://programarcadegames.com/index.php?lang=en&chapter=array_backed_grids
https://www.pygame.org/docs/tut/ChimpLineByLine.html

### Learning the Value Function with a Neural Network

First, we define the neural network. We use tanh to bound the result between -1 and 1 (since this is the bound of our value function). We use standard activation functions, testing first ReLu and then Leaky ReLu.

In [2]:
# Regular Feed forward network with only dense layers 
class DenseNetRELU(nn.Module):
    def __init__(self, dimension):
        super(DenseNetRELU, self).__init__()
        # 9 input features (each of the positions in the board), with a bias
        # 1 hidden layer with 9 inputs, 1 output (the value of the state)
        numStates = dimension * dimension
        self.first = nn.Linear(numStates, numStates, True)
        self.hiddenOne = nn.Linear(numStates, numStates, True)
        self.hiddenTwo = nn.Linear(numStates, 1, True)
        
    def forward(self, x):
        x = F.relu(self.first(x)) 
        x = F.relu(self.hiddenOne(x)) 
        x = F.tanh(self.hiddenTwo(x)) 
        return x
    
def weights_init_uniform(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        m.weight.data.uniform_(0.0, 1.0)
        m.bias.data.fill_(0)
    
class DenseNetLeakyRELU(nn.Module):
    def __init__(self, dimension):
        super(DenseNetLeakyRELU, self).__init__()
        
        numStates = dimension * dimension
        self.first = nn.Linear(numStates, numStates, True)
        self.hiddenOne = nn.Linear(numStates, numStates, True)
        self.hiddenTwo = nn.Linear(numStates, 1, True)
        
    def forward(self, x):
        x = F.leaky_relu(self.first(x)) 
        x = F.leaky_relu(self.hiddenOne(x)) 
        x = F.tanh(self.hiddenTwo(x)) 
        return x
    
# Convolutional neural network (for 2D)
class ConvNet(nn.Module):
    def __init__(self, dimension): 
        super(ConvNet, self).__init__()

    def forward(self, x):
        return 0
    
class ConvNet(nn.Module):
    def __init__(self, dimension):
        super(ConvNet, self).__init__()
        
    def forward(self, x):
        return 0 
        

nets = dict()

for dim in range(3, 11): 
    netOne   = DenseNetRELU(dim)
    netTwo   = DenseNetLeakyRELU(dim)
#     netThree = ConvNet(dim)
#     netFour  = ConvNet(dim)
    
    nets[dim] = [netOne, netTwo, netThree, netFour]

print(nets[10])

NameError: name 'netThree' is not defined

We first define our game tree by defining a node class.

In [3]:
class Node:
    ''' 
    Node is a single board state in our game tree.
    '''
    def __init__(self, board): 
        self.children = []
        self.parent = None
        self.board = board
        self.currTurn = 0

We then make a generic function which will take some starting node and a game constructor (which must create an object that extends the Game abstractclass) and fills out the game tree. It does this more efficiently by using a thread pool. Note that this also returns a list of all the nodes in the tree, which allows us to choose a random node much more easily.

In [4]:
def getGameTree(game, root, num_nodes): 
    p = Pool(4)
    return findAll(game, root, num_nodes)
    
def findAll(game, startingNode, num_nodes):
    '''
    findAll performs BFS from the startingNode. It uses a 
    '''
    visited = [0] * num_nodes
    index = 0 

    # Initialize a queue with the starting node. 
    unvisited = deque()
    unvisited.append(startingNode)
    
    # Continue until there are no more unvisited nodes.
    while len(unvisited) > 0:
        if index % 5000 == 0:
            print(index, len(unvisited))
        # Store the new visited nodes          
        currNode = unvisited.popleft()
        index += 1
        visited[index] = currNode
        
        # If game is over, do not add the children
        game.board = currNode.board
        res = game.checkGameOver()
        if res != 0: 
            continue
            
        # Find all of the children 
        for action in game.getAllActions():
            child = Node(copy.deepcopy(currNode.board))
            child.currTurn = game.turnChooser(currNode.currTurn)
            pieceToPlay = 1 if currNode.currTurn == 0 else 2
            child.board[action[0]][action[1]][action[2]] = pieceToPlay
            child.parent = currNode
            
            unvisited.append(child)
            currNode.children.append(child)
    
    return visited

Now we generate the game tree for all game variants so we have data to train the neural network. Note that for many games the game tree is small enough to store in memory. There are 9 places to place the first piece, then 8 to place the second, etc.., so there are $9! = 362880$ states, many of which are not reachable because someone would win. 

In [5]:
root = Node([[[0,0,0],[0,0,0],[0,0,0]]])
NUM_NODES = 1000000
game = StandardTicTacToe(None, None, None)

vertices = findAll(game, root, NUM_NODES)

NameError: name 'StandardTicTacToe' is not defined

First we implement minimax so that we can evaluate a state. We define player 2 winning as -1, and player 1 winning as 1. If currTurn is 0, it's player 1 to move.

In [6]:
def minimax(state):
    game.board = state.board 
    res = game.getAllActions()
    res2 = game.checkGameOver()
    if res2 == 1:
        return 1 
    elif res2 == 2: 
        return -1
    elif len(res) == 0: 
        return 0
        
    children = state.children
    if state.currTurn == 0: 
        val = -float('inf')
    else: 
        val = float('inf')
        
    for child in children: 
        child_val = minimax(child)
        if state.currTurn == 0: 
            val = max(val, child_val)
        else: 
            val = min(val, child_val)

    return val 

Next we can train it

In [11]:
import multiprocessing as mp

memo = dict() 

st = time() 

trainCount = 500000
criterion = nn.MSELoss() # Using mean square error 
optimizer = optim.SGD(nets[3][0].parameters(), lr=0.01) #  create your optimizer

def train(net, trainCount):
    L = [0] * trainCount
    for poo in range(trainCount):
        if poo % 2000 == 0:
            pass
#             print(f"{poo}/{trainCount} - {poo/trainCount} - {time() - st}s", end="\r")
        optimizer.zero_grad()
        # First, pick some state from the game tree: 
        node = random.choice(vertices)
        # Now give it to the neural net 
        if node == 0:  
            continue
        board = torch.FloatTensor(node.board).reshape(-1)
        stateValue = net(board)

        if node in memo:
            res = memo[node]
        else:
            res = minimax(node)
            memo[node] = res 

        loss = criterion(stateValue, torch.FloatTensor([res]))
        loss.backward()
        optimizer.step() 
        L[poo] = loss
    return L

# with Pool(4) as p:
#         p.map(train, [100, 100, 100, 100])

results = dict() 
for dim in range(3, 4):
    dimNets = nets[dim]
    for netIndex in range(len(dimNets)):
        r = train(dimNets[netIndex], 1000)
        results[(dim, netIndex)] = r

NameError: name 'time' is not defined

In [8]:
L2 = []
for i in range(0, len(L), 2500): 
    if len(L) - i < 2500:
        amount = len(L) - i 
    else: 
        amount = 2500 
    L2.append(sum(L[i:i+2500])/amount)
   
plt.plot(L2)
plt.show()

NameError: name 'L' is not defined

Finally we pass this network into an agent

In [9]:
def netValueFunction(board): 
    tBoard = torch.FloatTensor(board).reshape(-1)
    print(tBoard) 
    out = net(tBoard)
    for i, v in enumerate(out): 
        if v == out.max():
            return i
        
    return None
            
nAgent = ValueAgent(0, netValueFunction, 3)

NameError: name 'ValueAgent' is not defined

### Comparisons of Greedy Policy to Baseline

Here, we compare the greedy policy from the learned value function to an agent that plays random policies and a minimax agent (which plays perfectly).

In [10]:
game = StandardTicTacToe(nAgent, DumbAgent(1), Log(2))

results = []
for i in range(0, 10): 
    foo = game.play()
    if foo == 0: 
        print("Tie")
        results.append(-1)
    else: 
        results.append(foo)
    game.reset()
print(results)

NameError: name 'StandardTicTacToe' is not defined