This is an exercise in reinforcement learning.  An <b>agent</b> (our AI) makes moves in an <b>environment</b> (our gameboard) using a <b>policy</b> to determine the best move in a given <b>state</b>.

In [1]:
import numpy as np
import random as r

# defining string constants for specifying policies to use
RANDOM_POLICY = "random_policy"
GREEDY_POLICY = "greedy_policy"
GREEDY_BUT_TYTHING_POLICY = "greedy_but_tything_policy"

In [2]:
# Creating the environment for our agent to interact with.  A gameboard for tic tac toe.
class GameBoard:
    """Square tic-tac-toe board that the agents act on.

    Cells hold 1 for "X", -1 for "O" and 0 when empty.  A player wins by
    filling an entire row, column or one of the two main diagonals, i.e.
    when that line sums to ``dim`` (X) or ``-dim`` (O).

    Attributes are plain and public:
      grid            -- (dim, dim) integer numpy array with the marks
      dim             -- side length of the board
      game_over       -- True once evaluate_position() has seen a win or a full board
      game_has_winner -- True once evaluate_position() has seen a completed line
      outcome         -- 0 by default, set to 1 ("X") or -1 ("O") by determine_winner()
    """

    def __init__(self, dim=3):
        # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int``
        # selects the same default integer dtype on every NumPy version.
        self.grid = np.zeros((dim, dim), dtype=int)
        self.dim = dim
        self.game_over = False
        self.game_has_winner = False
        self.outcome = 0  # default value to be replaced by 1 for "X" or -1 for "O"

    def _line_sums(self):
        """Return the sums of every row, every column and both diagonals as one array."""
        diagonals = [np.trace(self.grid), np.trace(np.flipud(self.grid))]
        return np.concatenate(
            (np.sum(self.grid, axis=1), np.sum(self.grid, axis=0), diagonals)
        )

    def available_moves(self):
        """Return the (row, col) tuples of all empty cells, in row-major order."""
        rows, cols = np.where(self.grid == 0)
        return list(zip(rows, cols))

    # enter a move, 1 being "X", -1 being "O"
    def update_state(self, coords, val):
        """Write ``val`` into the cell at ``coords`` without any legality check.

        Writing 0 clears a cell again.
        """
        self.grid[coords] = val

    def get_state(self):
        """Return the underlying grid array itself (not a copy)."""
        return self.grid

    def view_state(self):
        """Print the grid."""
        print(self.grid)

    def evaluate_position(self):
        """Update ``game_over`` / ``game_has_winner`` from the current grid.

        The flags are only ever switched on here; a position that is neither
        won nor full leaves them as they were.
        """
        # first check if there is a winner:
        # a line adding up to dim or -dim means X or O has won, respectively
        if np.any(np.abs(self._line_sums()) == self.dim):
            self.game_over = True
            self.game_has_winner = True
        # otherwise a completely filled grid is a draw
        elif not np.any(self.grid == 0):
            self.game_over = True
            self.game_has_winner = False

    def determine_winner(self):
        """Re-evaluate the position and, if somebody won, store who in ``outcome``."""
        self.evaluate_position()
        if self.game_has_winner:
            # a line summing to +dim belongs to X; any other completed line is O's
            x_won = np.any(self._line_sums() == self.dim)
            self.outcome = 1 if x_won else -1

    def result(self):
        """Determine the winner and print a one-line summary of the game status."""
        self.determine_winner()
        if self.game_over and self.game_has_winner:
            if self.outcome == 1:
                print("X won!")
            else:
                print("O won!")
        elif self.game_over:
            print("It's a draw")
        else:
            print("Game in progress")

In [4]:
# defining our agent that will be playing the game
class Agent:
    """A player that picks moves with a policy and learns state values from finished games.

    ``move`` is the mark the agent places: 1 for X, -1 for O.  ``game_list``
    collects one dict per move made in the current game; each dict must hold
    a 'new_state' key and receives a 'reward' key in calculate_rewards().
    """

    def __init__(self, epsilon=0.1, discount_factor=0.8, learning_rate=0.1, tythe_rate = 0.1,
                 move=1): # move=1 for X, -1 for O
        self.epsilon = epsilon  # stored, but not read by any method of this class
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.tythe_rate = tythe_rate
        self.move = move
        self.game_list = []

    def reset(self):
        """Forget the moves recorded for the current game."""
        self.game_list = []

    def take_action(self, game, value_table, policy_string):
        """Choose a move with the named policy and play it on ``game``.

        Raises:
            ValueError: if ``policy_string`` is not one of the policy constants
                (previously this surfaced as an UnboundLocalError).
        """
        policies = {
            RANDOM_POLICY: self.random_policy,
            GREEDY_POLICY: self.greedy_policy,
            GREEDY_BUT_TYTHING_POLICY: self.greedy_but_tything_policy,
        }
        try:
            policy = policies[policy_string]
        except KeyError:
            raise ValueError(f"Unknown policy: {policy_string!r}") from None
        action = policy(game, value_table)
        game.update_state(action, self.move)

    def calculate_rewards(self, game):
        """Attach a discounted reward to every recorded move.

        The final reward is ``move * game.outcome`` (positive when this agent's
        mark won).  The last move gets it undiscounted; each earlier move is
        multiplied by one more factor of ``discount_factor``.
        """
        reward = self.move * game.outcome
        for steps_from_end, item in enumerate(reversed(self.game_list)):
            item['reward'] = self.discount_factor**steps_from_end * reward

    def update_value_table(self, game, value_table):
        """Move the value of every state reached this game towards its reward.

        Unseen states start at 0; each value is shifted by ``learning_rate``
        times the difference between the reward and the current value.
        """
        self.calculate_rewards(game)
        for item in self.game_list:
            key = item['new_state']
            current = value_table.get(key, 0)
            value_table[key] = current + self.learning_rate * (item['reward'] - current)

    # value table is a dictionary of states and values.  game.get_state is a numpy array that can't be used
    # as a dictionary key.  We encode the state as a string for it to be usable.  If the agent plays O's,
    # we will multiply the matrix by move (-1) first, so that the state is the same for both X and O
    def encode_state(self, game):
        """Return the board as a string key, seen from this agent's side.

        Format: '+1:' followed by the row and column digits of every cell
        holding the agent's own mark, then '-1:' followed by those of the
        opponent's marks.
        """
        # NOTE(review): coordinates are concatenated without separators, so keys
        # are only unambiguous while every index is a single digit (dim <= 10).
        state = self.move * game.get_state()

        def coords(mark):
            rows, cols = np.where(state == mark)
            return ''.join(f"{row}{col}" for row, col in zip(rows, cols))

        return f"+1:{coords(1)}-1:{coords(-1)}"

    def random_policy(self, game, value_table):
        """Return a uniformly random available move; ``value_table`` is unused."""
        return r.choice(game.available_moves())

    def greedy_policy(self, game, value_table):
        """Return the available move whose resulting state has the highest value.

        States missing from ``value_table`` count as 0.  They are only read,
        never inserted, so evaluating moves does not grow the table.  Ties go
        to the first such move in available_moves() order.
        """
        available_moves = game.available_moves() # i.e. get unfilled positions in grid
        move_scores = []
        for available_move in available_moves:
            game.update_state(available_move, self.move) # getting position after potential move is made
            move_scores.append(value_table.get(self.encode_state(game), 0)) # value of new position
            game.update_state(available_move, 0) # resetting to the original game position
        best_index = move_scores.index(max(move_scores))
        return available_moves[best_index]

    def greedy_but_tything_policy(self, game, value_table):
        """Play randomly with probability ``tythe_rate``, greedily otherwise."""
        if r.random() < self.tythe_rate:
            return self.random_policy(game, value_table)
        return self.greedy_policy(game, value_table)

In [5]:
def run_self_play(agents, value_table, policy, num_epochs, label, board_dim=5, report_every=10000):
    """Let two agents play ``num_epochs`` games against each other and learn from each.

    agents      -- pair [X agent, O agent]; the first entry always moves first
    value_table -- shared dict of encoded state -> value, updated in place
    policy      -- policy constant handed to Agent.take_action for both agents
    label       -- prefix of the progress line printed every ``report_every`` games
    """
    for epoch in range(1, num_epochs + 1):
        game = GameBoard(board_dim)
        turn = 0  # index into agents; flips between 0 and 1 after every move

        while not game.game_over:
            agent = agents[turn]
            move_summary = {'old_state': agent.encode_state(game)}
            agent.take_action(game, value_table, policy)
            move_summary['new_state'] = agent.encode_state(game)
            agent.game_list.append(move_summary)
            game.evaluate_position()
            turn = 1 - turn

        game.determine_winner()
        # both agents learn from the finished game before their move logs are cleared
        for agent in agents:
            agent.update_value_table(game, value_table)
        for agent in agents:
            agent.reset()
        if epoch % report_every == 0:
            print(f"{label} Epoch: {epoch}")


value_table = {}
agents = [Agent(move=1), Agent(move=-1)]
policy = RANDOM_POLICY
# policy = GREEDY_POLICY
# policy = GREEDY_BUT_TYTHING_POLICY
num_epochs = 1000000

run_self_play(agents, value_table, policy, num_epochs, "Random")



# now that we've prepopulated the value table with a random policy, let's see if we can make it better by training it 
# with the semi-greedy tything policy
# value table will not be re-initialized, so that we can use its existing values

agents = [Agent(move=1), Agent(move=-1)]
policy = GREEDY_BUT_TYTHING_POLICY
num_epochs = 1000000

run_self_play(agents, value_table, policy, num_epochs, "Semi-Greedy")

Random Epoch: 10000
Random Epoch: 20000
Random Epoch: 30000
Random Epoch: 40000
Random Epoch: 50000
Random Epoch: 60000
Random Epoch: 70000
Random Epoch: 80000
Random Epoch: 90000
Random Epoch: 100000
Random Epoch: 110000
Random Epoch: 120000
Random Epoch: 130000
Random Epoch: 140000
Random Epoch: 150000
Random Epoch: 160000
Random Epoch: 170000
Random Epoch: 180000
Random Epoch: 190000
Random Epoch: 200000
Random Epoch: 210000
Random Epoch: 220000
Random Epoch: 230000
Random Epoch: 240000
Random Epoch: 250000
Random Epoch: 260000
Random Epoch: 270000
Random Epoch: 280000
Random Epoch: 290000
Random Epoch: 300000
Random Epoch: 310000
Random Epoch: 320000
Random Epoch: 330000
Random Epoch: 340000
Random Epoch: 350000
Random Epoch: 360000
Random Epoch: 370000
Random Epoch: 380000
Random Epoch: 390000
Random Epoch: 400000
Random Epoch: 410000
Random Epoch: 420000
Random Epoch: 430000
Random Epoch: 440000
Random Epoch: 450000
Random Epoch: 460000
Random Epoch: 470000
Random Epoch: 480000
R

KeyboardInterrupt: 

In [6]:
# number of distinct encoded states that currently have an entry in the value table
len(value_table)

13347668

In [7]:
# ok, let's see how well the ai does trained on a random policy, then a semi-greedy policy using the same value table
# computer goes first
game = GameBoard(5)
agent = Agent()  # default move=1, so the computer plays X and opens the game
policy = GREEDY_POLICY

while not game.game_over:
    agent.take_action(game, value_table, policy)
    game.evaluate_position()
    if game.game_over:
        break
    game.view_state()
    # keep asking until the human names an empty square on the board, so that
    # a typo can neither crash the cell nor overwrite an existing mark
    while True:
        try:
            row = int(input("Row of move:"))
            col = int(input("Column of move:"))
        except ValueError:
            print("Please enter whole numbers.")
            continue
        if (row, col) in game.available_moves():
            break
        print("That square is not available.")
    game.update_state((row, col), -1)
    game.evaluate_position()

game.view_state()
game.result()

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 1 0 0 0]
 [0 0 0 0 0]]
Row of move:2
Column of move:2
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  0 -1  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  1]]
Row of move:2
Column of move:1
[[ 0  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1 -1 -1  0  0]
 [ 0  1  0  0  0]
 [ 0  0  0  0  1]]
Row of move:3
Column of move:2
[[ 1  0  0  0  0]
 [ 0  0  0  0  0]
 [ 1 -1 -1  0  0]
 [ 0  1 -1  0  0]
 [ 0  0  0  0  1]]
Row of move:4
Column of move:2
[[ 1  1  0  0  0]
 [ 0  0  0  0  0]
 [ 1 -1 -1  0  0]
 [ 0  1 -1  0  0]
 [ 0  0 -1  0  1]]
Row of move:0
Column of move:2
[[ 1  1 -1  1  0]
 [ 0  0  0  0  0]
 [ 1 -1 -1  0  0]
 [ 0  1 -1  0  0]
 [ 0  0 -1  0  1]]
Row of move:1
Column of move:2
[[ 1  1 -1  1  0]
 [ 0  0 -1  0  0]
 [ 1 -1 -1  0  0]
 [ 0  1 -1  0  0]
 [ 0  0 -1  0  1]]
O won!


In [93]:
# ok, let's see how well the ai does trained on a random policy, then a semi-greedy policy using the same value table
# human goes first
game = GameBoard(5)
agent = Agent(move=-1)  # the computer plays O and answers the human's moves
policy = GREEDY_POLICY

while not game.game_over:
    game.view_state()
    # keep asking until the human names an empty square on the board, so that
    # a typo can neither crash the cell nor overwrite an existing mark
    while True:
        try:
            row = int(input("Row of move:"))
            col = int(input("Column of move:"))
        except ValueError:
            print("Please enter whole numbers.")
            continue
        if (row, col) in game.available_moves():
            break
        print("That square is not available.")
    game.update_state((row, col), 1)
    game.evaluate_position()
    if game.game_over:
        break
    agent.take_action(game, value_table, policy)
    game.evaluate_position()

game.view_state()
game.result()

[[0 0 0]
 [0 0 0]
 [0 0 0]]
Row of move:1
Column of move:1
[[ 0  0  0]
 [ 0  1  0]
 [ 0  0 -1]]
Row of move:2
Column of move:1
[[ 0 -1  0]
 [ 0  1  0]
 [ 0  1 -1]]
Row of move:0
Column of move:2
[[ 0 -1  1]
 [ 0  1  0]
 [-1  1 -1]]
Row of move:1
Column of move:0
[[ 0 -1  1]
 [ 1  1 -1]
 [-1  1 -1]]
Row of move:0
Column of move:0
[[ 1 -1  1]
 [ 1  1 -1]
 [-1  1 -1]]
It's a draw


In [76]:
# side length of the board currently bound to `game`
game.dim

3

In [74]:
# scratch variable: get_state() returns the board's grid array itself, not a copy
asdf = game.get_state()

In [75]:
# display the grid fetched above (1 = X, -1 = O, 0 = empty)
asdf

array([[-1, -1,  1],
       [ 1,  1,  1],
       [-1,  1, -1]])

In [82]:
# column sums of the grid; GameBoard treats a sum of dim or -dim as a win for X or O
np.sum(asdf, axis = 0)

array([-1,  1,  1])

In [7]:
new_game = GameBoard()
move = 1 # 1 being the input for X
while not new_game.game_over:
    new_game.view_state()
    # keep asking until the player names an empty square on the board, so that
    # a typo can neither crash the cell nor overwrite an existing mark
    while True:
        try:
            row = int(input("Row of move:"))
            col = int(input("Column of move:"))
        except ValueError:
            print("Please enter whole numbers.")
            continue
        if (row, col) in new_game.available_moves():
            break
        print("That square is not available.")
    new_game.update_state((row, col), move)
    new_game.evaluate_position()
    move *= -1  # hand the turn to the other mark

new_game.view_state()
new_game.result()

[[0 0 0]
 [0 0 0]
 [0 0 0]]
Row of move:1
Column of move:1
[[0 0 0]
 [0 1 0]
 [0 0 0]]
Row of move:0
Column of move:0
[[-1  0  0]
 [ 0  1  0]
 [ 0  0  0]]
Row of move:1
Column of move:2
[[-1  0  0]
 [ 0  1  1]
 [ 0  0  0]]
Row of move:1
Column of move:0
[[-1  0  0]
 [-1  1  1]
 [ 0  0  0]]
Row of move:2
Column of move:0
[[-1  0  0]
 [-1  1  1]
 [ 1  0  0]]
Row of move:0
Column of move:2
[[-1  0 -1]
 [-1  1  1]
 [ 1  0  0]]
Row of move:0
Column of move:1
[[-1  1 -1]
 [-1  1  1]
 [ 1  0  0]]
Row of move:2
Column of move:1
[[-1  1 -1]
 [-1  1  1]
 [ 1 -1  0]]
Row of move:2
Column of move:2
[[-1  1 -1]
 [-1  1  1]
 [ 1 -1  1]]
It's a draw


In [8]:
# GameBoard stores the result in `outcome` (1 = X, -1 = O, 0 = no winner); it has no `winner` attribute
new_game.outcome

0