In [34]:
import numpy as np
import pickle
import random

In [35]:
ROWS = 3
COLS = 3
DECAY = 0.9
LR = 0.2
EPSILON = 0.3

In [36]:
# board = np.zeros(ROWS*COLS)

In [37]:
class Agent:
    def __init__(self, name):
        self.symbol = name
        self.stateHistory = [] #Positions of the board at each step of it's play
        self.stateValueTable = {} #Serves as the policy after training
    
    def addToHistory(self, state):
        self.stateHistory.append(str(state))

    def chooseAction(self, board):
        possible = [idx for idx in range(len(board)) if board[idx] == 0]
        if np.random.uniform(0, 1) <= EPSILON: #Condition for exploration
            action = random.choice(possible)
        else: #Condition for exploitation
            bestVal = -500000
            for move in possible:
                possibleBoard = board.copy()
                possibleBoard[move] = self.symbol
                val = self.stateValueTable.get(str(possibleBoard))
                if val is None:
                    val = 0
                if val > bestVal:
                    bestVal = val
                    action = move
        return action
    
    def chooseBest(self, board):
        possible = [idx for idx in range(len(board)) if board[idx] == 0]
        bestVal = -500000
        for move in possible:
            possibleBoard = board.copy()
            possibleBoard[move] = self.symbol
            val = self.stateValueTable.get(str(possibleBoard))
            if val is None:
                val = 0
            if val > bestVal:
                bestVal = val
                action = move
        return action
    
    def updateStateValue(self, reward): #Backprop REWARD
        for state in reversed(self.stateHistory):
            if self.stateValueTable.get(str(state)) is None:
                self.stateValueTable[str(state)] = 0
            self.stateValueTable[str(state)] += LR*(DECAY*reward - self.stateValueTable[str(state)])
            reward = self.stateValueTable[str(state)]
    
    def reset(self):
        self.stateHistory = []

    def loadPolicy(self, policy):
        with open(str(policy), 'rb') as f:
            self.stateValueTable = pickle.load(f) 



In [38]:
class env():
    board = [0 for _ in range(ROWS*COLS)]

    def updateBoard(self, action, symbol):
        self.board[action] = symbol

    def checkWin(self):
        winner = "NA"

        draw = True
        for i in self.board:
            if i == 0:
                draw = False

        if draw:
            return "DRAW"

        #Win By Vertical 3
        for i in range(0, COLS):
            for j in range(0, ROWS-2):
                if self.board[(j*COLS) + i] == self.board[(j+1)*COLS + i] == self.board[(j+2)*COLS + i]:
                    if self.board[(j*COLS) + i] != 0:
                        winner = self.board[(j*COLS) + i]
        
        #Win by Horizontal 3
        for i in range(0, ROWS):
            for j in range(0, COLS-2):
                if self.board[(i*COLS) + j] == self.board[i*COLS + (j+1)] == self.board[i*COLS + (j+2)]:
                    if self.board[(i*COLS) + j] != 0:
                        winner = self.board[(i*COLS) + j]

        #Win by left->right diagonal
        for i in range(0, COLS-2):
            for j in range(0,  ROWS-2):
                if self.board[(j*COLS) + i] == self.board[((j+1)*COLS) + (i+1)] == self.board[((j+2)*COLS) + (i+2)]:
                    if self.board[(j*COLS) + i] != 0:
                        winner = self.board[(j*COLS) + i]

        #Win by right->left diagonal
        for i in range(COLS-1, 1, -1):
            for j in range(0,  ROWS-2):
                if self.board[(j*COLS) + i] == self.board[((j+1)*COLS) + (i-1)] == self.board[((j+2)*COLS) + (i-2)]:
                    if self.board[(j*COLS) + i] != 0:
                        winner =self.board[(j*COLS) + i]
        
        return winner
    
    def optimalPlay(self, p1: Agent, p2: Agent):
        self.board = [0 for _ in range(ROWS*COLS)]
        while True:
            p1_action = p1.chooseBest(self.board)
            self.updateBoard(p1_action, p1.symbol)
            winner = self.checkWin()
            if winner != "NA":
                return winner, self.board
            else:
                p2_action = p2.chooseBest(self.board)
                self.updateBoard(p2_action, p2.symbol)
                winner = self.checkWin()
                if winner != "NA":
                    return winner, self.board
            
    def playAsHuman(self, p1: Agent, order: int):
        while True:
            if(order == 1): #The agent goes first
                p1_action = p1.chooseBest(self.board)
                self.updateBoard(p1_action, p1.symbol)
                winner = self.checkWin()
                if winner != "NA":
                    return winner, self.board
                else:
                    print("Current Board: ", self.board)
                    p2_action = int(input("enter Position:"))
                    self.updateBoard(p2_action, "O")
                    winner = self.checkWin()
                    if winner != "NA":
                        return winner, self.board

    def play(self, p1: Agent, p2: Agent):
        self.board = [0 for _ in range(ROWS*COLS)]
        complete = False
        while not complete:
            action = p1.chooseAction(self.board)
            self.updateBoard(action, p1.symbol)
            p1.addToHistory(self.board)

            winner = self.checkWin()
            if winner != "NA":
                complete = True
                if winner == p1.symbol:
                    p1.updateStateValue(1)
                    p2.updateStateValue(0)
                elif winner == p2.symbol:
                    p1.updateStateValue(0)
                    p2.updateStateValue(1)
                else:
                    p1.updateStateValue(0.1)
                    p2.updateStateValue(0.5)
                p1.reset()
                p2.reset()
                bc = self.board
                self.board = [0 for _ in range(ROWS*COLS)]
                return winner, bc
                
            else:
                action = p2.chooseAction(self.board)
                self.updateBoard(action, p2.symbol)
                p2.addToHistory(self.board)
                winner = self.checkWin()
                if winner != "NA":
                    complete = True
                    if winner == p1.symbol:
                        p1.updateStateValue(1)
                        p2.updateStateValue(0)
                    elif winner == p2.symbol:
                        p1.updateStateValue(0)
                        p2.updateStateValue(1)
                    else:
                        p1.updateStateValue(0.1)
                        p2.updateStateValue(0.5)
                    p1.reset()
                    p2.reset()
                    bc = self.board
                    self.board = [0 for _ in range(ROWS*COLS)]
                    return winner, bc

In [39]:
def savePolicy(p1 :Agent, p2: Agent):
    with open("Policy_player1", 'wb') as f:
        pickle.dump(p1.stateValueTable, f)
    
    with open("Policy_player2", 'wb') as f:
        pickle.dump(p2.stateValueTable, f)

In [40]:
p1 = Agent("X")
p2 = Agent("O")
bg = env()

In [41]:
def train(p1, p2):
    for i in range(50000):
        if i% 10000 == 0:
            print("Round: " + str(i))
        bg.play(p1,p2)
    print("Demo: ")
    bg.optimalPlay(p1,p2)
    savePolicy(p1,p2)
train(p1,p2)

Round: 0
Round: 10000
Round: 20000
Round: 30000
Round: 40000
Demo: 


In [42]:
agentPlayer1 = Agent("X")
agentPlayer1.loadPolicy("Policy_player1")

play_env = env()
play_env.playAsHuman(agentPlayer1, 1)

Current Board:  [0, 0, 0, 0, 0, 0, 'X', 0, 0]


Current Board:  [0, 'O', 0, 0, 0, 0, 'X', 0, 'X']
Current Board:  [0, 'O', 0, 0, 'X', 0, 'X', 'O', 'X']


('X', ['O', 'O', 'X', 0, 'X', 0, 'X', 'O', 'X'])