In [1]:
import numpy as np
import pickle

# Board dimensions shared by every class in this notebook (3x3 grid)
BOARD_ROWS = 3
BOARD_COLS = 3

In [None]:
import sys,os

# Silence warnings only when sys.warnoptions is empty
if len(sys.warnoptions) == 0:
    import warnings
    warnings.simplefilter(action="ignore")

# Absolute path of the working directory; used as the base path when saving policies
currentdir = os.path.abspath(os.getcwd())

In [2]:
# Class that defines the state of the board, also gives players rewards at the end of games
class State:
    """Board state and game loop for a two-player tic-tac-toe match.

    The board is a BOARD_ROWS x BOARD_COLS numpy array in which 1 marks a
    cell taken by p1, -1 a cell taken by p2 and 0 an empty cell. The class
    also hands out rewards to both players once a game has finished.
    """

    def __init__(self, p1, p2):
        """Create an empty board for the two given players.

        p1 and p2 must provide `name`, `chooseAction`, `addState`,
        `feedReward` and `reset`; those are the members this class uses.
        """
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.gameOver = False
        # p1 always moves first: 1 means p1 is to move, -1 means p2
        self.playerTurn = 1

    def getHash(self):
        """Return a string key for the current board.

        The key is the string form of the flattened board and is what the
        players store in their state-value dictionaries.
        """
        return str(self.board.reshape(BOARD_ROWS * BOARD_COLS))

    def reset(self):
        """Clear the board and give the first move back to p1."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.gameOver = False
        self.playerTurn = 1

    def availablePositions(self):
        """Return the (row, col) tuples of all empty cells in row-major order."""
        return [
            (i, j)
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def updateBoard(self, position):
        """Place the current player's mark at `position` and pass the turn."""
        self.board[position] = self.playerTurn
        # Switch to other player
        self.playerTurn = -1 if self.playerTurn == 1 else 1

    def winner(self):
        """Check the board for a finished game and update `self.gameOver`.

        Returns 1 if p1 owns a complete line, -1 if p2 does, 0 for a tie
        (board full, no complete line) and None while the game is still open.
        """
        # Candidate lines in the order rows, columns, diagonals
        lines = [self.board[i, :] for i in range(BOARD_ROWS)]
        lines += [self.board[:, j] for j in range(BOARD_COLS)]
        if BOARD_ROWS == BOARD_COLS:
            # Diagonals only span the whole board when it is square
            lines.append(self.board.diagonal())
            lines.append(np.fliplr(self.board).diagonal())

        for line in lines:
            total = line.sum()
            # A line is won when every cell carries the same player's mark
            if total == len(line):
                self.gameOver = True
                return 1
            if total == -len(line):
                self.gameOver = True
                return -1

        # Tie: nobody owns a line and there is nowhere left to play
        if not self.availablePositions():
            self.gameOver = True
            return 0

        # Game not over
        self.gameOver = False
        return None

    def giveReward(self):
        """Feed end-of-game rewards to both players.

        Does nothing while the game is still in progress, instead of failing
        on the None result returned by `winner`.
        """
        result = self.winner()
        if result is None:
            return
        if result > 0:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result < 0:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            # Tying is worse than winning but better than losing.
            # p1 has the advantage of going first, so it gets less for a tie.
            self.p1.feedReward(.1)
            self.p2.feedReward(.5)

    def playTurn(self, player, human=False):
        """Let `player` make one move and report whether the game ended.

        With human=True the board and the outcome are printed and no rewards
        are given; otherwise a finished game rewards and resets both players.
        In both modes the board is reset once the game is over.
        Returns True if this move ended the game, False otherwise.
        """
        openPositions = self.availablePositions()
        playerAction = player.chooseAction(openPositions, self.board, self.playerTurn)
        # Update board state with action and let the player remember it
        self.updateBoard(playerAction)
        player.addState(self.getHash())
        # Check board for winner
        result = self.winner()
        if human:
            self.showBoard()
        if result is None:
            return False

        if human:
            if result == 1:
                print(f"{self.p1.name} wins!")
            elif result == -1:
                print(f"{self.p2.name} wins!")
            else:
                print("tie!")
        else:
            self.giveReward()
            self.p1.reset()
            self.p2.reset()
        self.reset()
        return True

    def play(self, rounds=100):
        """Play `rounds` complete games between p1 and p2 without any output."""
        for _ in range(rounds):
            # playTurn resets the board when a game ends, so gameOver is
            # False again at the start of every round
            while not self.gameOver:
                # Player1
                if self.playTurn(self.p1):
                    break
                # Player2
                if self.playTurn(self.p2):
                    break

    def playInteractive(self):
        """Play one game, printing the board after every move."""
        while not self.gameOver:
            # Use the players stored on this instance rather than any
            # notebook-level variables that happen to be called p1 / p2
            print(f"{self.p1.name} going...")
            if self.playTurn(self.p1, True):
                break
            print(f"{self.p2.name} going...")
            if self.playTurn(self.p2, True):
                break

    def showBoard(self):
        """Print the board as text, with 'x' for p1 and 'o' for p2."""
        # One '----' per column plus the closing dash (13 dashes for 3 columns)
        separator = '-' * (4 * BOARD_COLS + 1)
        for i in range(BOARD_ROWS):
            print(separator)
            out = '| '
            for j in range(BOARD_COLS):
                value = self.board[i, j]
                if value == 1:
                    token = 'x'
                elif value == -1:
                    token = 'o'
                else:
                    # Empty cell; any unexpected value is also shown blank
                    token = ' '
                out += token + ' | '
            print(out)
        print(separator)

In [3]:
# Super class for players (both human and agent)
class Player:
    """Base class for players (both human and agent).

    Every hook is a no-op here; subclasses override the ones they need.
    """

    def __init__(self, name):
        """Store the display name used in game output."""
        self.name = name

    def chooseAction(self, positions, currentBoard=None, turn=None):
        """Pick a move from `positions`; the base implementation returns None."""
        pass

    def addState(self, state):
        """Record a visited board state; no-op in the base class."""
        pass

    def feedReward(self, reward):
        """Receive the end-of-game reward; no-op in the base class."""
        pass

    def reset(self):
        """Clear per-game data; no-op in the base class.

        Takes `self` so that it can be called on an instance like the other hooks.
        """
        pass

In [4]:
# Class that defines agent players
# Two of these players will be learning and playing with each other
class AgentPlayer:
    """Learning agent that picks moves epsilon-greedily from a state-value table.

    Two of these players learn by playing against each other.
    """

    def __init__(self, name, explRate=.3):
        """Create an agent with an empty value table.

        explRate is the probability of taking a random action; the default
        .3 means 30% random moves and 70% greedy moves.
        """
        self.name = name
        self.explRate = explRate
        # Board hashes of all positions reached in the current game
        self.states = []
        # Learning rate
        self.lr = .2
        self.decayGamma = .9
        # State hash -> estimated value
        self.statesValue = {}

    def getHash(self, board):
        """Return the string key for `board` (string form of the flattened array)."""
        return str(board.reshape(BOARD_ROWS * BOARD_COLS))

    def addState(self, state):
        """Remember a board hash reached during the current game."""
        self.states.append(state)

    def chooseAction(self, openPositions, currentBoard, turn):
        """Pick one of `openPositions` for the player whose mark is `turn`.

        With probability explRate a random open position is returned;
        otherwise the position whose resulting board has the highest stored
        value (unknown boards count as 0, first best position wins ties).

        Raises ValueError if there are no open positions.
        """
        if len(openPositions) == 0:
            raise ValueError("chooseAction requires at least one open position")

        if np.random.uniform(0, 1) <= self.explRate:
            # Take random action
            index = np.random.choice(len(openPositions))
            return openPositions[index]

        action = None
        maxValue = float("-inf")
        for p in openPositions:
            nextBoard = currentBoard.copy()
            nextBoard[p] = turn
            value = self.statesValue.get(self.getHash(nextBoard))
            if value is None:
                value = 0
            if value > maxValue:
                maxValue = value
                action = p
        return action

    def feedReward(self, reward):
        """Backpropagate the end-of-game reward through the visited states.

        Walking the states from last to first, each value is moved towards
        decayGamma * reward by the learning rate, and the updated value then
        serves as the reward for the state before it.
        """
        for state in reversed(self.states):
            if self.statesValue.get(state) is None:
                self.statesValue[state] = 0
            self.statesValue[state] += self.lr * (self.decayGamma * reward - self.statesValue[state])
            reward = self.statesValue[state]

    def reset(self):
        """Forget the states of the finished game before a new round."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to <currentdir>/policies/oldpolicy_<name>.

        The target directory is created when missing and the file is closed
        even if pickling fails.
        """
        path = currentdir + '/policies/oldpolicy_' + str(self.name)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as fw:
            pickle.dump(self.statesValue, fw)

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in `file`.

        NOTE(review): pickle.load can execute arbitrary code, so only load
        policy files that come from a trusted source.
        """
        with open(file, 'rb') as fr:
            self.statesValue = pickle.load(fr)

In [5]:
# Class for human player
# Mostly inherited from super class Player
class HumanPlayer(Player):
    """Player whose moves are typed in at the prompt."""

    def chooseAction(self, positions, currentBoard=None, turn=None):
        """Ask for a row and a column until the pair is one of `positions`.

        Non-integer input restarts the prompt; currentBoard and turn are unused.
        """
        while True:
            try:
                # Row is read first; a bad row skips the column prompt
                picked = (
                    int(input("Input action row-> ")),
                    int(input("Input action column-> ")),
                )
            except ValueError:
                continue
            if picked in positions:
                return picked

In [6]:
# Train two agents against each other
p1, p2 = AgentPlayer("p1"), AgentPlayer("p2")
state = State(p1, p2)
print("training...")
state.play(rounds=1000)

# Persist the learned state values of both agents
p1.savePolicy()
print("saved p1 policy")
p2.savePolicy()
print("saved p2 policy")

training...
saved p1 policy
saved p2 policy


In [9]:
# Human plays second against the trained p1 agent
# explRate=0 switches off exploration so the agent always plays greedily
# NOTE(review): savePolicy writes to <currentdir>/policies/oldpolicy_<name>,
#   while this loads "policy_p1" - confirm that file exists
p2 = HumanPlayer(name="human")
p1 = AgentPlayer(name="computer", explRate=0)
p1.loadPolicy("policy_p1")
state = State(p1, p2)
state.playInteractive()

computer going...
-------------
|   | x |   | 
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
human going...
Input action row-> 1
Input action column-> 1
-------------
|   | x |   | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
computer going...
-------------
| x | x |   | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
human going...
Input action row-> 0
Input action column-> 2
-------------
| x | x | o | 
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
computer going...
-------------
| x | x | o | 
-------------
| x | o |   | 
-------------
|   |   |   | 
-------------
human going...
Input action row-> 2
Input action column-> 0
-------------
| x | x | o | 
-------------
| x | o |   | 
-------------
| o |   |   | 
-------------
human wins!


In [8]:
# Human plays first against the trained p2 agent
# explRate=0 switches off exploration so the agent always plays greedily
# NOTE(review): savePolicy writes to <currentdir>/policies/oldpolicy_<name>,
#   while this loads "policy_p2" - confirm that file exists
p2 = AgentPlayer(name="computer", explRate=0)
p2.loadPolicy("policy_p2")
p1 = HumanPlayer(name="human")
state = State(p1, p2)
state.playInteractive()

human going...


KeyboardInterrupt: 