In [None]:
#We will take the epsilon greedy strategy here
boardLength = 3
winningLength = 3
import numpy as np
class Environment:
    
    def __init__(self):
        self.board = np.zeros((boardLength, boardLength))
        self.x = 1
        self.o = -1
        self.winner = None
        self.ended = False
        #self.state = 
        
        
    def isEmpty(self, i, j):
        return self.board[i,j] == 0.0
    
    def getReward(self, symbol):
        if not self.gameOver():
            return 0
        elif self.winner == symbol:
            return 1
        else:
            return 0
        
    def getState(self):
        #Returns the state of the board represented as a decimal number
        k = 0
        stateDecimal = 0
        for i in range(boardLength):
            for j in range(boardLength):
                if (self.board[i,j] == 0):
                    num = 0
                elif (self.board[i,j] == self.x):
                    num = 1
                else:
                    num = 2
                stateDecimal += (3**k)*num
                k += 1
        return stateDecimal
                
        
        
    def gameOver(self):
        
        #check for rows
        for i in range(boardLength):
            for player in (self.x, self.o):
                if self.board[i].sum() == player*winningLength:
                    self.winner = player
                    self.ended = True
                    return True
        
        #check for columns
        for i in range(boardLength):
            for player in (self.x, self.o):
                if self.board[:,i].sum() == player*winningLength:
                    self.winner = player
                    self.ended = True
                    return True
        
        #check for diagonals
        for player in (self.x, self.o):
            if self.board.trace() == player*winningLength:
                self.winner = player
                self.ended = True
                return True
            
            if np.fliplr(self.board).trace() == player*winningLength:
                self.winner = player
                self.ended = True
                return True
            
        k = 0
        for i in range(boardLength):
            for j in range(boardLength):
                if not self.isEmpty(i,j):
                    k += 1
        
        if (k == boardLength*boardLength):
            return True
        
        return False
            
    def drawBoard(self):
        for i in range(boardLength):
            print ("-------------")
            for j in range(boardLength):
                print (" ")
                if self.board[i,j] == self.x:
                  print ("x")
                elif self.board[i,j] == self.o:
                  print ("o")
                else:
                  print (" ")
            print ("\n")
        print ("-------------")
    
    def drawBoard1(self):
        print (self.board)
        
    def drawBoard2(self, p):
        print (self.board)
        k = self.getState()
        print(p.value[k])
        

class Agent:
    
    #s
    def __init__(self, eps=0.1, alpha=0.5): 
        self.eps = eps
        self.alpha = alpha
        self.verbose = False
        self.stateHistory = []
        
    def setValue(self, value):
        self.value = value
        
    def setSymbol(self, symbol):
        self.symbol = symbol
        
    def setVerbose(self, verbose):
        self.verbose = verbose
        
    def resetHistory(self):
        self.stateHistory = []
        
    def takeAction(self, env):
        move = []
        r = np.random.random()
        if (r < self.eps):
            if self.verbose:
                print ("Taking random action")
            possibleMoves = []
            for i in range(boardLength):
                for j in range(boardLength):
                    if env.isEmpty(i,j):
                        possibleMoves.append((i,j))
            #print(possibleMoves)
            r = np.random.choice(len(possibleMoves))
            move = possibleMoves[r]
        else:
            x = -1
            y = -1
            maxValue = 0
            for i in range(boardLength):
                for j in range(boardLength):
                    if env.isEmpty(i,j):
                        env.board[i,j] = self.symbol
                        k = env.getState()
                        env.board[i,j] = 0
                        if self.value[k] > maxValue:
                            x = i
                            y = j
                            maxValue = self.value[k]
            move = (x,y)
        
        env.board[move[0], move[1]] = self.symbol
        #env.drawBoard1()
        #print (self.symbol)
        #print (env.board[move[0], move[1]])
        
    def updateStateHistory(self, s):
        self.stateHistory.append(s)
        
    def update(self, env):
        #this is the core part of the program which represents the AI
        #the update equation for the value is V(s) <- v(s) + alpha(V(s') - V(s))
        reward = env.getReward(self.symbol)
        for i in reversed(self.stateHistory):
            self.value[i] = self.value[i] + self.alpha * (reward - self.value[i])
            reward = self.value[i]
        self.resetHistory()
        
        
class Human:
    
    
    def __init__(self):
        pass
        
    def setSymbol(self, symbol):
        self.symbol = symbol
        
    def takeAction(self, env):
        while True:
          # break if we make a legal move
          move = input("Enter coordinates i,j for your next move (i,j=0..2): ")
          i, j = move.split(',')
          i = int(i)
          j = int(j)
          if env.isEmpty(i, j):
            env.board[i,j] = self.symbol
            print (env.board)
            break

    def update(self, env):
        pass

    def updateStateHistory(self, s):
        pass


def playGame(p1, p2, board, draw=False):
    
    currentPlayer = None
    while not board.gameOver():
        if currentPlayer == p1:
            currentPlayer = p2
        else:
            currentPlayer = p1
        
        if draw:
            if draw == 1 and currentPlayer == p1:
                board.drawBoard()
            if draw == 2 and currentPlayer == p2:
                board.drawBoard2(p1)
            
        currentPlayer.takeAction(board)
        
        state = board.getState()
        #update the state in which the board is for both the players
        p1.updateStateHistory(state)
        p2.updateStateHistory(state)
        
        #if draw:
        #    board.drawBoard()
        
        #update the value functions
        p1.update(board)
        p2.update(board)
        
        
if __name__ == '__main__':
    
    p1 = Agent()
    p2 = Agent()
    
    env = Environment()
    p1.setSymbol(env.x)
    p2.setSymbol(env.o)
    p1.setValue([0.5] * 3**9)
    p2.setValue([0.5] * 3**9)
    
    for i in range(10):
        playGame(p1, p2, Environment())
        #print ("-------------")
        
    # play human vs. agent
    # do you think the agent learned to play the game well?
    human = Human()
    human.setSymbol(env.o)
    while True:
        p1.setVerbose(True)
        playGame(p1, human, Environment(), draw=2)
        print (env.board)
        # I made the agent player 1 because I wanted to see if it would
        # select the center as its starting move. If you want the agent
        # to go second you can switch the human and AI.
        answer = input("Play again? [Y/n]: ")
        if answer and answer.lower()[0] == 'n':
            break


In [82]:
    print(p1.value)

[0.5, 5e-324, 0.5, 4.81482486096809e-35, 0.5, 4.930380657631324e-32, 0.5, 5e-324, 0.5, 6.162975822039155e-33, 0.5, 3.944304526105059e-31, 0.5, 0.5, 4.078315292499078e-56, 0.0625, 5e-324, 0.5, 0.5, 1.2037062152420224e-35, 0.5, 0.25, 2.465190328815662e-32, 0.5, 0.5, 0.5, 0.5, 9.62964972193618e-35, 0.5, 1.232595164407831e-32, 0.5, 0.5, 2.465190328815662e-32, 0.125, 3.0814879110195774e-33, 0.5, 0.5, 0.5, 0.25, 0.5, 0.5, 0.5, 0.5, 0.5, 0.25, 0.125, 0.125, 0.5, 0.5, 0.5, 7.888609052210118e-31, 0.5, 7.888609052210118e-31, 0.5, 0.5, 1.504632769052528e-36, 0.5, 0.125, 3.851859888774472e-34, 0.5, 0.5, 0.5, 0.5, 0.25, 0.125, 0.5, 0.5, 0.5, 6.681911775230489e-52, 0.5, 5e-324, 0.5, 0.5, 0.5, 0.5, 0.5, 4.9784122222889134e-60, 0.5, 0.5, 0.5, 0.5, 3.0814879110195774e-33, 0.5, 6.310887241768095e-30, 0.5, 0.5, 1.6155871338926322e-27, 0.0625, 1.4693679385278594e-39, 0.5, 0.5, 0.5, 0.0625, 0.5, 0.5, 0.5, 0.25, 0.5, 0.0625, 0.5, 0.5, 0.5, 0.5, 0.5, 2.5849394142282115e-26, 0.5, 1.2037062152420224e-35, 0.5, 