A lot of the code is shared between this notebook and the previous one, so I won't explain it twice.

In [1]:

from kaggle_environments import make, evaluate

# Create the game environment
env = make("tictactoe", debug=True)

# List of available default agents
print(list(env.agents))

q={} #q-table
seen = {} #a map storing all states that have been seen before
for i in range (3**10):
    q[i] = [0] * 9
# Sets q-table values to 0.
import numpy as np

['random', 'reaction']


In [2]:
def hash (array):
    x = 0
    for i in range (9):
        x += 3**i * array[i]
    return x
# Converts tictactoe board to number (which the q-table takes as an input).

In [3]:
def result (board): #this method takes in the final board state of a game that has finished and returns whether the result is a win for board 1 or board 2 or if the result is a draw.
    # 100 means board 1 won
    # 0 means draw
    # -100 means board 2 won
    for i in range (3):
        if board[i*3:i*3+3].count(1) == 3: return 100
        if board[i*3:i*3+3].count(2) == 3: return -100
    for i in range(3):
        if board[i] == board[i+3] == board[i+6] == 1: return 100
        if board[i] == board[i+3] == board[i+6] == 2: return -100 
    if board[0] == board[4] == board[8] == 1: return 100
    if board[2] == board[4] == board[6] == 1: return 100
    if board[0] == board[4] == board[8] == 2: return -100
    if board[2] == board[4] == board[6] == 2: return -100
    return 0

In [4]:
def checkDone(board): #Checks if the game is done
    if board.count(0) == 0: return True
    for i in range (3):
        if board[i*3:i*3+3].count(1) == 3 or board[i*3:i*3+3].count(2) == 3: return True
    for i in range(3):
        if board[i] == board[i+3] == board[i+6] != 0:
            return True
    if board[0] == board[4] == board[8] != 0: return True
    if board[2] == board[4] == board[6] != 0: return True
    return False
    

To get an in depth explanation of q-learning, click on this link: https://www.simplilearn.com/tutorials/machine-learning-tutorial/what-is-q-learning#:~:text=Q%2Dlearning%20is%20a%20model,next%20action%20to%20be%20taken. 

For the purposes of this notebook, I'll explain q-learning in simpler terms and how I apply it to tic-tac-toe. To implement q-learning, a q-table is needed. Think of a q-table as a 2d list with each row representing a different position and each column representing a different move. Each value in the q-table signifies how good or bad a particular position is. A more positive value indicates a better position, and 0 indicates a draw. The values in this table are filled recursively, with the final state positions being given either a very positive score (5000) for a win, 0 for a draw, and a very negative score for a loss (-5000). For every move that is illegal, a very, very, negative score is assigned (-1000000) and the game terminates. Every other value in the table is updated by a linear combination of the current value and the maximum value after the corresponding move.


The agent starts off with a q-table filed with zeroes, and slowly updates it as it plays against a random opponent.


In [5]:
def q_learning_agent(obs, config):
    gamma = 0.8
    alpha = 0.2
    epsilon = max(0.2,0.5 - 0.5/games * t)
    mark = obs.mark
    valid_moves = [i for i in range (len(obs.board)) if obs.board[i] == 0]
    scores = []
    move = -1
    reward = 0
    copyBoard = [1]*9
    if np.random.uniform(0, 1)< epsilon:
        move= np.random.randint(9)
    else:
        bestScore = max(q[hash(obs.board)])
        move = q[hash(obs.board)].index(bestScore)
    if move not in valid_moves:
        reward -= 1000000
    else:
        copyBoard = obs.board.copy()
        copyBoard[move] = mark
        if (checkDone(copyBoard)):
            if result(copyBoard) == 100:
                reward += 5000
        else: 
            newmark = max(1, (mark + 1)%3)
            copyBoard[random_feeder(copyBoard)] = newmark 
            if (checkDone(copyBoard)):
                if (result(copyBoard)) == 0:
                    reward -= 50
                else: reward -= 5000
    #update q-table
    q[hash(obs.board)][move] = (1-alpha)*q[hash(obs.board)][move] + alpha * (reward + gamma*max(q[hash(copyBoard)]))  
    return move


In [6]:
global rand_move
def random_feeder (board): #feeds a random move to  random_agent.
    global rand_move
    valid_moves = [i for i in range (len(board)) if board[i] == 0]
    rand_move = int(np.random.choice(valid_moves))
    return rand_move

def random_agent (obs, config): #Used to train the agent
    #abc = random_feeder(obs.board) #COMMENT THIS OUT
    return rand_move

In [None]:
t=0
games = 100000
for i in range (games):
    t+=1
    env.run([q_learning_agent, random_agent])
    env.run([random_agent, q_learning_agent])

In [8]:
def q_learning_player(obs, config):
    seen[hash(obs.board)] = 1
    bestScore = max(q[hash(obs.board)])
    move = q[hash(obs.board)].index(bestScore)
    return move

import numpy as np
def get_win_percentages(agent1, agent2, n_rounds=100):
    # Use default Connect Four setup
    config = {'rows': 3, 'columns': 3, 'inarow': 3}
    # Agent 1 goes first (roughly) half the time          
    outcomes = evaluate("tictactoe", [agent1, agent2], config, [], n_rounds//2)
    # Agent 2 goes first (roughly) half the time      
    outcomes += [[b,a] for [a,b] in evaluate("tictactoe", [agent2, agent1], config, [], n_rounds-n_rounds//2)]
    print("Agent 1 Win Percentage:", np.round(outcomes.count([1,-1])/len(outcomes), 2))
    print("Agent 2 Win Percentage:", np.round(outcomes.count([-1,1])/len(outcomes), 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))
def run(agent1, agent2, n_rounds=100):
    config = {'rows': 3, 'columns': 3, 'inarow': 3}
    outcomes = evaluate("tictactoe", [agent1, agent2], config, [], n_rounds//2)


In [9]:
from kaggle_environments import make, evaluate
env = make("tictactoe")
get_win_percentages(agent1='random', agent2=q_learning_player, n_rounds=1000)


Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.95
Number of Invalid Plays by Agent 1: 0
Number of Invalid Plays by Agent 2: 0


As we can see, the win percentage is 0.05 higher than the previous minimax agent, which is a significant 50% reduction in draws.