In [61]:
#We will take the epsilon greedy strategy here
boardLength = 3
winningLength = 3
import numpy as np
class Environment:
    
    def __init__(self):
        self.board = np.zeros((boardLength, boardLength))
        self.x = 1
        self.o = -1
        self.winner = None
        self.ended = False
        #self.state = 
        
        
    def isEmpty(self, i, j):
        return self.board[i,j] == 0.0
    
    def getReward(self, symbol):
        if not self.gameOver():
            return 0
        elif self.winner == symbol:
            return 1
        else:
            return 0
        
    def getState(self):
        #Returns the state of the board represented as a decimal number
        k = 0
        stateDecimal = 0
        for i in range(boardLength):
            for j in range(boardLength):
                if (self.board[i,j] == 0):
                    num = 0
                elif (self.board[i,j] == self.x):
                    num = 1
                else:
                    num = 2
                stateDecimal += (3**k)*num
                k += 1
        return stateDecimal
                
        
        
    def gameOver(self):
        
        #check for rows
        for i in range(boardLength):
            for player in (self.x, self.o):
                if self.board[i].sum() == player*winningLength:
                    self.winner = player
                    self.ended = True
                    return True
        
        #check for columns
        for i in range(boardLength):
            for player in (self.x, self.o):
                if self.board[:,i].sum() == player*winningLength:
                    self.winner = player
                    self.ended = True
                    return True
        
        #check for diagonals
        for player in (self.x, self.o):
            if self.board.trace() == player*winningLength:
                self.winner = player
                self.ended = True
                return True
            
            if np.fliplr(self.board).trace == player*winningLength:
                self.winner = player
                self.ended = True
                return True

        return False
            
    def drawBoard(self):
        for i in range(boardLength):
            print ("-------------")
            for j in range(boardLength):
                print (" ")
                if self.board[i,j] == self.x:
                  print ("x")
                elif self.board[i,j] == self.o:
                  print ("o")
                else:
                  print (" ")
            print ("\n")
        print ("-------------")
    
    def drawBoard1(self):
        print (self.board)

class Agent:
    
    #s
    def __init__(self, eps=0.1, alpha=0.5): 
        self.eps = eps
        self.alpha = alpha
        self.verbose = False
        self.stateHistory = []
        
    def setValue(self, value):
        self.value = value
        
    def setSymbol(self, symbol):
        self.symbol = symbol
        
    def setVerbose(self, verbose):
        self.verbose = verbose
        
    def resetHistory(self):
        self.stateHistory = []
        
    def takeAction(self, env):
        move = []
        r = np.random.random()
        if (r < self.eps):
            if self.verbose:
                print ("Taking random action")
            possibleMoves = []
            for i in range(boardLength):
                for j in range(boardLength):
                    if env.isEmpty(i,j):
                        possibleMoves.append((i,j))
            print(possibleMoves)
            r = np.random.choice(len(possibleMoves))
            move = possibleMoves[r]
        else:
            x = -1
            y = -1
            maxValue = 0
            for i in range(boardLength):
                for j in range(boardLength):
                    if env.isEmpty(i,j):
                        k = env.getState()
                        if self.value[k] > maxValue:
                            x = i
                            y = j
                            maxValue = self.value[k]
            move = (x,y)
        
        env.board[move[0], move[1]] = self.symbol
        env.drawBoard1()
        #print (self.symbol)
        #print (env.board[move[0], move[1]])
        
    def updateStateHistory(self, s):
        self.stateHistory.append(s)
        
    def update(self, env):
        #this is the core part of the program which represents the AI
        #the update equation for the value is V(s) <- v(s) + alpha(V(s') - V(s))
        reward = env.getReward(self.symbol)
        for i in reversed(self.stateHistory):
            self.value[i] = self.value[i] + self.alpha * (reward - self.value[i])
            reward = self.value[i]
        self.resetHistory()
        
        
    


def playGame(p1, p2, board, draw=False):
    
    currentPlayer = None
    while not board.gameOver():
        if currentPlayer == p1:
            currentPlayer = p2
        else:
            currentPlayer = p1
        
        if draw:
            if draw == 1 and currentPlayer == p1:
                board.drawBoard()
            if draw == 2 and currentPlayer == p2:
                board.drawBoard()
            
        currentPlayer.takeAction(board)
        
        state = board.getState()
        #update the state in which the board is for both the players
        p1.updateStateHistory(state)
        p2.updateStateHistory(state)
        
        if draw:
            board.drawBoard()
        
        #update the value functions
        p1.update(board)
        p2.update(board)
        
        
if __name__ == '__main__':
    
    p1 = Agent()
    p2 = Agent()
    
    env = Environment()
    p1.setSymbol(env.x)
    p2.setSymbol(env.o)
    p1.setValue([0.5] * 3**9)
    p2.setValue([0.5] * 3**9)
    
    for i in range(30):
        playGame(p1, p2, Environment())
        print ("-------------")

[[ 1.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[(0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)]
[[ 1.  0.  0.]
 [ 0.  0.  0.]
 [-1.  0.  0.]]
[(0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 1), (2, 2)]
[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [-1.  0.  0.]]
[[ 1. -1.  0.]
 [ 1.  0.  0.]
 [-1.  0.  0.]]
[[ 1. -1.  1.]
 [ 1.  0.  0.]
 [-1.  0.  0.]]
[[ 1. -1.  1.]
 [ 1. -1.  0.]
 [-1.  0.  0.]]
[[ 1. -1.  1.]
 [ 1. -1.  1.]
 [-1.  0.  0.]]
[[ 1. -1.  1.]
 [ 1. -1.  1.]
 [-1. -1.  0.]]
-------------
[[ 1.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  1.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  1.]
 [-1.  0.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  1.]
 [-1.  1.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 0.  0.  0.]]
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 1.  0.  0.]]
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 1. -1.  0.]]
[[ 1. -1.  1.]
 [-1.  1. -1.]
 [ 1. -1.  1.]]
-------------
[[ 1.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
[[ 1. -1.  0.]
 [ 0.  

ValueError: a must be greater than 0

In [67]:
ex = [[1.0, -1.0, -1.0],
 [ 1.0, -1.0,  1.0],
 [-1.0,  1.0,  0.0]]

pla = -1
if np.fliplr(ex).trace == pla*winningLength:
    print("true")
else:
    print(pla*winningLength)
    print ("false")

-3
false
