### Q-Learning for the Tic-tac-toe game

In [None]:
from tictactoe import TicTacToe, QLearningAgent

In [None]:
class QLearningSimulation:
    """Run a fixed number of training episodes for an agent in an environment.

    The environment must provide ``reset()``, ``get_available_actions()`` and
    ``step(action)`` (returning ``(next_state, reward, done)``); the agent must
    provide ``choose_action(state, available_actions)`` and
    ``update_q(state, action, reward, next_state, done)``.
    """

    def __init__(self, episodes, env, agent):
        """Store the episode count, the environment and the agent."""
        self.episodes, self.env, self.agent = episodes, env, agent

    def train(self):
        """Play ``self.episodes`` episodes, updating the agent after every step."""
        for _ in range(self.episodes):
            current = self.env.reset()
            while True:
                choices = self.env.get_available_actions()
                move = self.agent.choose_action(current, choices)
                following, reward, finished = self.env.step(move)
                # The agent sees the full transition, including the terminal flag.
                self.agent.update_q(current, move, reward, following, finished)
                if finished:
                    break
                current = following

In [None]:
# Hyperparameters for the training run.
episodes = 1000
learning_rate = 0.01
discount_factor = 0.89
epsilon = 0.5

# Build the environment, the agent and the simulation that ties them together.
env = TicTacToe()
agent = QLearningAgent(
    learning_rate=learning_rate,
    discount_factor=discount_factor,
    epsilon=epsilon,
)
simulator = QLearningSimulation(episodes=episodes, env=env, agent=agent)

# Run every training episode; the agent's update_q is called after each move.
simulator.train()

#### Play the game against a random strategy

In [None]:
import random

def random_strategy(available_actions):
    """Return one element of ``available_actions`` chosen at random.

    The selection is delegated to ``random.choice``, so ``available_actions``
    must be a non-empty sequence.
    """
    picked = random.choice(available_actions)
    return picked

def play_game(env, agent, strategy):
    """Play a single game and return the reward reported by the last step.

    ``agent.choose_action`` picks the move whenever ``env.current_player`` is 1;
    on every other turn ``strategy(available_actions)`` picks it.

    NOTE(review): the returned value is whatever ``env.step`` reported for the
    final move, regardless of who made it. Confirm against ``TicTacToe.step``
    that this reward is expressed from the agent's point of view.
    """
    state = env.reset()
    while True:
        moves = env.get_available_actions()
        agent_to_move = env.current_player == 1
        if agent_to_move:
            chosen = agent.choose_action(state, moves)
        else:
            chosen = strategy(moves)
        state, reward, finished = env.step(chosen)
        if finished:
            return reward

# Evaluate with a fresh agent that reuses the trained agent's `q` attribute
# (the same object is shared, not copied).
# NOTE(review): QLearningAgent() is built with its default arguments here, so
# whether it still explores during evaluation depends on those defaults - confirm.
test_agent = QLearningAgent()
test_agent.q = agent.q

env = TicTacToe()
num_of_games = 1000

# Play the evaluation games against the random strategy, keeping each result.
outcomes = [
    play_game(env=env, agent=test_agent, strategy=random_strategy)
    for _ in range(num_of_games)
]

# A result of 1 is counted as a win and 0.5 as a draw.
wins = [result for result in outcomes if result == 1]
draws = [result for result in outcomes if result == 0.5]

print(f'Number of wins: {len(wins)}')
print(f'Number of draws: {len(draws)}')
print(f'Percentage optimal play: {100 * (len(wins) + len(draws))/len(outcomes)}')
