In [1]:
import itertools

from tqdm import tqdm

from TicTacToeGame import TicTacToeGame
from TicTacToeRandomSolver import TicTacToeRandomSolver
from TicTacToeQLearningSolver import TicTacToeQLearningSolver

In [2]:
def evaluate_agent(game_instance, agent, opponent, num_games=1000):
    """Play a series of games between ``agent`` and ``opponent`` and tally the results.

    ``agent.take_turn()`` is called whenever ``game_instance.player_X_turns``
    is truthy, otherwise ``opponent.take_turn()`` is called. After each game
    the outcome is read from ``game_instance.X_wins`` / ``game_instance.O_wins``
    and the board is reset with ``game_instance.play_again()``.

    Side effect: ``agent.in_training`` is set to ``False`` and left that way.

    Note: the game is not reset before the first game. If ``game_instance`` is
    already in a game-over state on entry, that existing result is counted as
    the first game.

    Args:
        game_instance: Game object shared by both players.
        agent: Player that moves on X's turns.
        opponent: Player that moves on O's turns.
        num_games: Number of games to play. Defaults to 1000.

    Returns:
        dict: Counts keyed by ``'X'``, ``'O'`` and ``'tie'``.
    """
    # Evaluation mode: switch the agent out of training before playing.
    agent.in_training = False
    scores = {'X': 0, 'O': 0, 'tie': 0}
    for _ in range(num_games):
        # Alternate turns until the game reports it is over.
        while not game_instance.is_gameover():
            current_player = agent if game_instance.player_X_turns else opponent
            current_player.take_turn()
        if game_instance.X_wins:
            scores['X'] += 1
        elif game_instance.O_wins:
            scores['O'] += 1
        else:
            scores['tie'] += 1
        game_instance.play_again()
    return scores


In [3]:
def train_agent(agent, episodes_range):
    """Put ``agent`` into training mode and train it over ``episodes_range``.

    Args:
        agent: Object exposing an ``in_training`` attribute and a
            ``train(episodes_range)`` method.
        episodes_range: Passed through unchanged to ``agent.train``.

    Returns:
        None. The return value of ``agent.train`` is discarded.
    """
    # The flag is raised before train() is called, so training runs with it set.
    agent.in_training = True
    agent.train(episodes_range)

In [4]:
# Hyperparameter grid for the Q-learning agent: every combination of the four
# lists below is trained and evaluated once, and one result row is appended to
# scores_arr per combination.
learning_rate_arr = [0.1, 0.3, 0.5, 0.7, 0.9]
discount_factor_arr = [0.1, 0.3, 0.5, 0.7, 0.9]
exploration_rate_arr = [0.1, 0.3, 0.5, 0.7, 0.9]
decay_rate_arr = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2]
scores_arr = []

total_iterations = len(learning_rate_arr) * len(discount_factor_arr) * len(exploration_rate_arr) * len(decay_rate_arr)

# The context manager closes the progress bar even if one of the runs raises.
with tqdm(total=total_iterations, desc='Total Progress') as pbar:
    # itertools.product varies the right-most list fastest, which is the same
    # visiting order as four nested for-loops over these lists.
    for learning_rate, discount_factor, exploration_rate, decay_rate in itertools.product(
        learning_rate_arr, discount_factor_arr, exploration_rate_arr, decay_rate_arr
    ):
        # Fresh game, agent and opponent for every combination.
        game_instance = TicTacToeGame()
        agent = TicTacToeQLearningSolver(game_instance, learning_rate, discount_factor, exploration_rate, decay_rate)
        opponent = TicTacToeRandomSolver(game_instance)
        train_agent(agent, range(1000))
        scores = evaluate_agent(game_instance, agent, opponent)
        scores_arr.append((learning_rate, discount_factor, exploration_rate, decay_rate, scores['X'], scores['O'], scores['tie']))
        pbar.update(1)

Total Progress: 100%|██████████| 625/625 [32:57<00:00,  3.16s/it]   


In [5]:
# Tabulate the grid-search results (one row per hyperparameter combination),
# ordered so the combinations with the most X wins come first.
import pandas as pd

df = pd.DataFrame(
    scores_arr,
    columns=['learning_rate', 'discount_factor', 'exploration_rate', 'decay_rate', 'X_wins', 'O_wins', 'ties'],
).sort_values(by='X_wins', ascending=False)

# Persist the sorted table; the index is not written to the file.
df.to_csv('q_learning_results.csv', index=False)

# Preview the top rows.
df.head()

Unnamed: 0,learning_rate,discount_factor,exploration_rate,decay_rate,X_wins,O_wins,ties
10,0.1,0.1,0.5,1e-06,687,246,67
81,0.1,0.7,0.3,1e-05,686,245,69
50,0.1,0.5,0.1,1e-06,685,244,71
300,0.5,0.5,0.1,1e-06,682,240,78
387,0.7,0.1,0.5,0.0001,680,268,52


In [9]:
# Create a single game instance (constructed with use_gui=False); both solvers
# below are bound to this same object.
game_instance = TicTacToeGame(use_gui=False)

# Q-learning player plays against random player.
# Only the game is passed to the Q-learning solver here, so its hyperparameters
# fall back to whatever defaults the class defines.
q_learning_instance = TicTacToeQLearningSolver(game_instance)
random_opponent = TicTacToeRandomSolver(game_instance)

In [10]:
# Baseline: score the agent against the random opponent before any training.
untrained_scores = evaluate_agent(game_instance, q_learning_instance, random_opponent)
scores_history = [untrained_scores]

# Each round trains on the next consecutive block of episode indices and then
# re-evaluates, so scores_history ends up with the baseline plus one entry per round.
train_episodes = 10000
for i in range(10):
    train_agent(
        q_learning_instance,
        episodes_range=range(i * train_episodes, (i + 1) * train_episodes),
    )
    scores = evaluate_agent(game_instance, q_learning_instance, random_opponent)
    scores_history.append(scores)
    print('Q-learning vs Random - Game {}: X wins: {}, O wins: {}, tie: {}'.format(i, scores['X'], scores['O'], scores['tie']))

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:01<00:00, 884.79it/s]


Q-learning vs Random - Game 0: X wins: 0, O wins: 1000, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 1000.92it/s]


Q-learning vs Random - Game 1: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 1670.94it/s]


Q-learning vs Random - Game 2: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 1817.38it/s]


Q-learning vs Random - Game 3: X wins: 0, O wins: 1000, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 1606.36it/s]


Q-learning vs Random - Game 4: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 2144.90it/s]


Q-learning vs Random - Game 5: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 2294.46it/s]


Q-learning vs Random - Game 6: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 2230.92it/s]


Q-learning vs Random - Game 7: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 2079.01it/s]


Q-learning vs Random - Game 8: X wins: 1000, O wins: 0, tie: 0


100%|██████████| 1000/1000 [00:00<00:00, 1490.76it/s]


Q-learning vs Random - Game 9: X wins: 1000, O wins: 0, tie: 0


In [7]:
# Display the evaluation tallies collected so far (baseline plus one per training round).
# NOTE(review): the saved output for this cell has an earlier execution count and a
# single entry, so it looks stale — re-run after the training loop cell.
scores_history

[{'X': 1000, 'O': 0, 'tie': 0}]