**Introduction**  
Explain RL  
Explain value action  
Refer to OOP  
Refer to data science blog Jeremy Zhang & Richard Karl bill-the-bot  
First, let two computer agents play each other, then save the learned policy, and finally use that policy to play against a human.

In [1]:
# import libraries & modules
import numpy as np
import pickle

**Class State**  
The state of this game is the board state of both the agent and its opponent (agent or human). Characteristics:  
- Initialise a 3x3 board with zeros indicating available positions   
- Update positions with 1 if player 1 takes a move and -1 if player 2 takes a move  
- An action is one of the positions a player can choose given the current board state  
- Reward is between 0 and 1 and is only given at the end of the game

In [2]:
Board_rows = 3
Board_cols = 3


class State:
    """Tic-tac-toe board together with the game loops that drive two players.

    The board is a ``Board_rows`` x ``Board_cols`` NumPy array in which 0 marks
    a free cell, 1 a cell taken by ``p1`` and -1 a cell taken by ``p2``.
    ``p1`` always moves first.
    """

    def __init__(self, p1, p2):
        """Create an empty board for the players ``p1`` and ``p2``."""
        self.board = np.zeros((Board_rows, Board_cols))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None  # string form of the board, set by getHash()
        self.playerSymbol = 1  # symbol of the player to move; p1 starts

    def getHash(self):
        """Return (and cache in ``boardHash``) the flattened board as a string."""
        self.boardHash = str(self.board.reshape(Board_rows * Board_cols))
        return self.boardHash

    def winner(self):
        """Evaluate the board and update ``isEnd``.

        Returns 1 if p1 has three in a row, -1 if p2 has, 0 if the board is
        full without a winner, and None while the game is still running.
        Rows are checked first, then columns, then the two diagonals.
        """
        row_sums = self.board.sum(axis=1)
        col_sums = self.board.sum(axis=0)
        for line_sum in (*row_sums, *col_sums):
            if line_sum == 3:
                self.isEnd = True
                return 1
            if line_sum == -3:
                self.isEnd = True
                return -1

        # Main diagonal and anti-diagonal.
        diag_sums = (np.trace(self.board), np.trace(np.fliplr(self.board)))
        if 3 in diag_sums:
            self.isEnd = True
            return 1
        if -3 in diag_sums:
            self.isEnd = True
            return -1

        # Tie: no line completed and no free cell left.
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0

        # Game is not over yet.
        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the free cells as a list of ``(row, col)`` tuples."""
        return [
            (i, j)
            for i in range(Board_rows)
            for j in range(Board_cols)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and switch turns."""
        self.board[position] = self.playerSymbol
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def giveReward(self):
        """Feed the end-of-game reward to both players.

        The winner receives 1 and the loser 0. Any other result of
        ``winner()`` (a tie, or None if the game is not over) gives p1 0.1
        and p2 0.5.
        """
        result = self.winner()

        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            # TODO: check why p1's tie reward is lower than p2's.
            # NOTE(review): presumably because p1 moves first - confirm.
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def reset(self):
        """Restore the empty starting position with p1 to move."""
        self.board = np.zeros((Board_rows, Board_cols))
        self.boardHash = None
        self.isEnd = False
        self.playerSymbol = 1

    def _trainingTurn(self, player):
        """Let ``player`` move during self-play; return True if the game ended.

        The resulting board hash is recorded on the player. When the move ends
        the game, rewards are distributed and both players and the board are
        reset for the next round.
        """
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.board, self.playerSymbol)
        self.updateState(action)
        player.addState(self.getHash())

        if self.winner() is None:
            return False

        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def play(self, rounds=100):
        """Let the two agents play ``rounds`` games against each other.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))
            while not self.isEnd:
                # p1 moves first; p2 only moves if p1 did not end the game.
                if self._trainingTurn(self.p1) or self._trainingTurn(self.p2):
                    break

    def _displayedTurn(self, action, player, symbol):
        """Apply ``action``, print the board and report whether the game ended.

        When the game is over the result is printed (``player`` wins if the
        winner equals ``symbol``, otherwise a tie) and the board is reset.
        """
        self.updateState(action)
        self.showBoard()
        win = self.winner()
        if win is None:
            return False

        if win == symbol:
            print(player.name, "wins!")
        else:
            print("tie!")
        self.reset()
        return True

    def playhuman(self):
        """Play one game: the agent (p1) against a human (p2)."""
        while not self.isEnd:
            # Agent (player 1)
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            if self._displayedTurn(p1_action, self.p1, 1):
                break

            # Human (player 2) only needs the list of free cells.
            positions = self.availablePositions()
            p2_action = self.p2.chooseAction(positions)
            if self._displayedTurn(p2_action, self.p2, -1):
                break

    def showBoard(self):
        """Print the board; p1 is shown as 'x', p2 as 'o', free cells as ' '."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for i in range(Board_rows):
            print('-------------')
            cells = (tokens[int(self.board[i, j])] + ' | ' for j in range(Board_cols))
            print('| ' + ''.join(cells))
        print('-------------')

**Class Player**  
A player class which represents our agent, and the player is able to:  
- Choose actions based on the current value estimates of the states they lead to
- Record all the states of the game
- Update the state-value estimates after each game
- Save and load the policy to be able to use it to play against human

In [3]:
class Player:
    """Learning agent that keeps a table of estimated board-state values.

    During a game it records the hashes of the boards it produced; at the end
    of the game ``feedReward`` propagates the reward backwards through them.
    """

    def __init__(self, name, exp_rate=0.3):
        """Create an agent called ``name`` exploring with probability ``exp_rate``."""
        self.name = name
        self.states = []  # board hashes recorded during the current game
        self.lr = 0.2  # learning rate used in feedReward
        self.exp_rate = exp_rate  # probability of choosing a random action
        self.decay_gamma = 0.9  # discount applied to the propagated reward
        self.states_value = {}  # board hash -> estimated value

    def getHash(self, board):
        """Return the flattened ``board`` as a string.

        Used by ``chooseAction`` to look up candidate boards in
        ``states_value``.
        """
        return str(board.reshape(-1))

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player using ``symbol``.

        With probability ``exp_rate`` a random position is returned. Otherwise
        the position whose resulting board has the highest stored value is
        returned (unknown boards count as 0; the last of equally valued
        positions wins).

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("no available positions to choose from")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Explore: take a random action.
            idx = np.random.choice(len(positions))
            return positions[idx]

        # Exploit: evaluate the board each candidate move would produce.
        action = None
        value_max = float("-inf")
        for p in positions:
            next_board = current_board.copy()
            next_board[p] = symbol
            value = self.states_value.get(self.getHash(next_board))
            if value is None:
                value = 0
            if value >= value_max:
                value_max = value
                action = p
        return action

    def addState(self, state):
        """Append a board hash to the states visited in the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Each state's value moves towards ``decay_gamma * reward`` by a
        fraction ``lr``; the updated value then serves as the reward for the
        preceding state.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st)
            if current is None:
                current = 0
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def reset(self):
        """Forget the states recorded during the finished game."""
        self.states = []

    def savePolicy(self, file='policy_p1.pkl'):
        """Pickle the state-value table to ``file``."""
        with open(file, 'wb') as output:
            pickle.dump(self.states_value, output)

    def loadPolicy(self, file):
        """Replace the state-value table with the one pickled in ``file``.

        NOTE: unpickling can execute arbitrary code; only load policy files
        from a trusted source.
        """
        with open(file, 'rb') as pkl_file:
            self.states_value = pickle.load(pkl_file)

**Class HumanPlayer**  
A separate class for a human player who enters actions interactively

In [4]:
class HumanPlayer:
    """Player whose moves are typed in on the console."""

    def __init__(self, name):
        """Create a human player called ``name``."""
        self.name = name

    def chooseAction(self, positions):
        """Prompt for a row and a column until a free position is entered.

        ``positions`` is the list of ``(row, col)`` tuples that may be chosen.
        Non-numeric input and positions that are not available lead to a new
        prompt instead of ending the game.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # int() rejected the text; ask again rather than crash.
                print("Please enter whole numbers for row and column.")
                continue
            action = (row, col)
            if action in positions:
                return action

**Train game**  
During training, the process for each player/agent is:
- Look for available positions
- Choose action
- Update the board state and add the resulting board hash to the player’s states
- Check whether the game has ended and give rewards accordingly

In [5]:
if __name__ == "__main__":
    # Self-play training: two learning agents share one board.
    p1, p2 = Player("p1"), Player("p2")
    st = State(p1, p2)

    print("training...")
    st.play(rounds=10000)

    # Only the first player's value table is written to disk.
    p1.savePolicy()

training...
Rounds 0
Rounds 1000
Rounds 2000
Rounds 3000
Rounds 4000
Rounds 5000
Rounds 6000
Rounds 7000
Rounds 8000
Rounds 9000


Play with human

In [6]:
#p1.states_value

In [7]:
#st.availablePositions()

In [14]:
if __name__ == '__main__':
    # Agent with exploration switched off, using the saved value table.
    p1 = Player(name='computer', exp_rate=0)
    p1.loadPolicy(file='policy_p1.pkl')

    # The human moves second and enters positions on the console.
    p2 = HumanPlayer(name='human')

    st = State(p1=p1, p2=p2)
    st.playhuman()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------


Input your action row: 0
Input your action col: 1


-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
| x |   |   | 
-------------
-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
| x |   | x | 
-------------


Input your action row: 2
Input your action col: 1


-------------
|   | o |   | 
-------------
|   |   |   | 
-------------
| x | o | x | 
-------------
-------------
|   | o |   | 
-------------
|   | x |   | 
-------------
| x | o | x | 
-------------


Input your action row: 01
Input your action col: 1
Input your action row: 0
Input your action col: 1
Input your action row: 0
Input your action col: 0


-------------
| o | o |   | 
-------------
|   | x |   | 
-------------
| x | o | x | 
-------------
-------------
| o | o | x | 
-------------
|   | x |   | 
-------------
| x | o | x | 
-------------
computer wins!
