This is an exercise in reinforcement learning.  An <b>agent</b> (our AI) makes moves in an <b>environment</b> (our gameboard) using a <b>policy</b> to determine the best move in a given <b>state</b>.

In [1]:
import numpy as np
import random as r

In [2]:
# Creating the environment for our agent to interact with.  A gameboard for tic tac toe.
class GameBoard:
    """Square tic-tac-toe board that the agents play on.

    Each cell of ``grid`` holds 0 (empty), 1 ("X") or -1 ("O").

    Attributes set by the evaluation methods:
        game_over: True once a line is complete or no empty cell is left.
        game_has_winner: True when some row, column or diagonal is complete.
        outcome: 1 if "X" won, -1 if "O" won, 0 otherwise.
    """

    def __init__(self, dim=3):
        # np.int was removed in NumPy 1.24; it was only an alias of the
        # builtin int, so the dtype is unchanged.
        self.grid = np.zeros((dim, dim), dtype=int)
        self.dim = dim
        self.game_over = False
        self.game_has_winner = False
        self.outcome = 0  # default value to be replaced by 1 for "X" or -1 for "O"

    def available_moves(self):
        """Return the (row, col) coordinates of every empty cell as a list."""
        return list(zip(*np.where(self.grid == 0)))

    def update_state(self, coords, val):
        """Write ``val`` (1 for "X", -1 for "O", 0 to clear) at ``coords``."""
        self.grid[coords] = val

    def get_state(self):
        """Return the underlying grid array (not a copy)."""
        return self.grid

    def view_state(self):
        """Print the grid."""
        print(self.grid)

    def _line_sums(self):
        """Return the sums of every column, row and both diagonals."""
        return np.concatenate((
            self.grid.sum(axis=0),  # columns
            self.grid.sum(axis=1),  # rows
            [np.trace(self.grid), np.trace(np.flipud(self.grid))],
        ))

    def evaluate_position(self):
        """Recompute ``game_over`` and ``game_has_winner`` from the grid.

        A line whose sum is ``dim`` or ``-dim`` is completely filled by one
        player.  A full board only counts as a draw when no such line exists;
        the win check therefore takes precedence, so a winning move placed on
        the last empty square is still reported as a win.
        """
        has_line = bool(np.any(np.abs(self._line_sums()) == self.dim))
        board_full = not np.any(self.grid == 0)
        self.game_has_winner = has_line
        self.game_over = has_line or board_full

    def determine_winner(self):
        """Re-evaluate the position and store the winner in ``outcome``."""
        self.evaluate_position()
        if not self.game_has_winner:
            self.outcome = 0
            return
        # A winner exists: it is "X" if some line sums to +dim, otherwise "O".
        x_won = np.any(self._line_sums() == self.dim)
        self.outcome = 1 if x_won else -1

    def result(self):
        """Print a one-line summary of the current game status."""
        self.determine_winner()
        if not self.game_over:
            print("Game in progress")
        elif not self.game_has_winner:
            print("It's a draw")
        elif self.outcome == 1:
            print("X won!")
        else:
            print("O won!")

In [4]:
# defining our agent that will be playing the game
class Agent:
    """A player that picks moves with a policy and learns state values.

    ``move`` is the mark the agent places: 1 for "X", -1 for "O".
    ``game_list`` collects one dict per move of the current game; each dict
    is expected to carry a ``'new_state'`` key and receives a ``'reward'``
    key from ``calculate_rewards``.  ``epsilon`` is stored but not used by
    any policy defined in this class.
    """

    def __init__(self, epsilon=0.1, discount_factor=0.8, learning_rate=0.5, move=1): # move=1 for X, -1 for O
        self.epsilon = epsilon
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.move = move
        self.game_list = []

    def reset(self):
        """Forget the moves recorded for the previous game."""
        self.game_list = []

    def take_action(self, game, value_table, policy):
        """Ask ``policy(game, value_table)`` for a move and play it."""
        action = policy(game, value_table)
        game.update_state(action, self.move)

    def calculate_rewards(self, game):
        """Attach a discounted reward to every recorded move.

        The final reward is +1 if this agent won, -1 if it lost and 0
        otherwise; the last move gets it in full and each earlier move gets
        it multiplied by one more factor of ``discount_factor``.
        """
        reward = self.move * game.outcome
        for steps_from_end, entry in enumerate(reversed(self.game_list)):
            entry['reward'] = self.discount_factor ** steps_from_end * reward

    def update_value_table(self, game, value_table):
        """Add ``learning_rate * reward`` to the value of each visited state."""
        self.calculate_rewards(game)
        for item in self.game_list:
            key = item['new_state']
            value_table[key] = value_table.get(key, 0) + self.learning_rate * item['reward']

    # value table is a dictionary of states and values.  game.get_state is a numpy array that can't be used
    # as a dictionary key.  We encode the state as a string for it to be usable.  If the agent plays O's,
    # we will multiply the matrix by move (-1) first, so that the state is the same for both X and O
    def encode_state(self, game):
        """Return the board as a string key, seen from this agent's side.

        Format: ``'+1:'`` followed by the row/column digits of the agent's
        own marks, then ``'-1:'`` followed by those of the opponent's marks.
        Indices are concatenated without a separator.
        """
        state = self.move * game.get_state()

        def digits(value):
            rows, cols = np.where(state == value)
            return ''.join(f"{row}{col}" for row, col in zip(rows, cols))

        return f"+1:{digits(1)}-1:{digits(-1)}"

    def random_policy(self, game, value_table):
        """Pick one of the empty squares uniformly at random."""
        return r.choice(game.available_moves())

    def greedy_policy(self, game, value_table):
        """Pick the empty square whose resulting state has the highest value.

        Every candidate move is tried on the board, scored through
        ``value_table`` and undone again.  States that are not in the table
        yet are inserted with value 0.  Ties are broken at random.  Must
        only be called while at least one square is empty.
        """
        available_moves = game.available_moves()
        move_scores = []
        for available_move in available_moves:
            game.update_state(available_move, self.move)
            encoded_state = self.encode_state(game)
            move_scores.append(value_table.setdefault(encoded_state, 0))
            game.update_state(available_move, 0)  # undo the trial move
        best_score = max(move_scores)
        best_moves = [
            candidate
            for candidate, score in zip(available_moves, move_scores)
            if score == best_score
        ]
        return r.choice(best_moves)
        

In [5]:
# Self-play training: two agents share one value table and play num_epochs games.
value_table = {}
agents = [Agent(move=1), Agent(move=-1)]
# Policies are methods of Agent, not module-level functions, so the policy is
# selected by name and looked up on whichever agent is about to move.
policy_name = "random_policy"
num_epochs = 1000

for i in range(num_epochs):
    game = GameBoard()
    turn = 0  # agents[0] ("X") moves first

    while not game.game_over:
        agent = agents[turn]
        # record the board (from this agent's point of view) before and after the move
        move_summary = {'old_state': agent.encode_state(game)}
        agent.take_action(game, value_table, getattr(agent, policy_name))
        move_summary['new_state'] = agent.encode_state(game)
        agent.game_list.append(move_summary)
        game.evaluate_position()
        turn = 1 - turn  # hand the move to the other agent

    # the game is finished: score it, let both agents learn, then clear their histories
    game.determine_winner()
    for agent in agents:
        agent.update_value_table(game, value_table)
        agent.reset()
    if (i + 1) % 100 == 0:
        print(f"Epoch: {i+1}")

Epoch: 100
Epoch: 200
Epoch: 300
Epoch: 400
Epoch: 500
Epoch: 600
Epoch: 700
Epoch: 800
Epoch: 900
Epoch: 1000


In [8]:
# number of distinct encoded board states that have an entry in the value table
len(value_table)

3189

In [7]:
# Two-player game on the console: the players alternate, X moves first.
new_game = GameBoard()
move = 1 # 1 being the input for X
while not new_game.game_over:
    new_game.view_state()
    try:
        row = int(input("Row of move:"))
        col = int(input("Column of move:"))
    except ValueError:
        print("Please enter whole numbers.")
        continue
    # Rejects occupied squares as well as out-of-range (including negative)
    # indices, so a bad entry can neither overwrite a mark nor crash the loop.
    if (row, col) not in new_game.available_moves():
        print("That square is not available.")
        continue
    new_game.update_state((row, col), move)
    new_game.evaluate_position()
    move *= -1  # the other player moves next

new_game.view_state()
new_game.result()

[[0 0 0]
 [0 0 0]
 [0 0 0]]
Row of move:1
Column of move:1
[[0 0 0]
 [0 1 0]
 [0 0 0]]
Row of move:0
Column of move:0
[[-1  0  0]
 [ 0  1  0]
 [ 0  0  0]]
Row of move:1
Column of move:2
[[-1  0  0]
 [ 0  1  1]
 [ 0  0  0]]
Row of move:1
Column of move:0
[[-1  0  0]
 [-1  1  1]
 [ 0  0  0]]
Row of move:2
Column of move:0
[[-1  0  0]
 [-1  1  1]
 [ 1  0  0]]
Row of move:0
Column of move:2
[[-1  0 -1]
 [-1  1  1]
 [ 1  0  0]]
Row of move:0
Column of move:1
[[-1  1 -1]
 [-1  1  1]
 [ 1  0  0]]
Row of move:2
Column of move:1
[[-1  1 -1]
 [-1  1  1]
 [ 1 -1  0]]
Row of move:2
Column of move:2
[[-1  1 -1]
 [-1  1  1]
 [ 1 -1  1]]
It's a draw


In [8]:
# GameBoard has no `winner` attribute; the result is kept in `outcome`
# (1 for "X", -1 for "O", 0 when there is no winner).
new_game.outcome

0