Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)


In [1]:
import numpy as np
from random import choice, randint, random
from copy import deepcopy
from itertools import combinations
from collections import defaultdict
from tqdm.auto import tqdm
from matplotlib import pyplot as plt

## State definition


In [2]:
MAGIC_SQUARE = [2, 7, 6, 9, 5, 1, 4, 3, 8]
GOAL_SUM = 15
BOARD_SIZE = 3

X = 0
O = 1


class BoardState:
    def __init__(self, x=None, o=None) -> None:
        self.x_plays = x if x is not None else set()
        self.o_plays = o if o is not None else set()

    def __str__(self) -> str:
        board_string = ""
        for i in range(BOARD_SIZE):
            for j in range(BOARD_SIZE):
                index = i * BOARD_SIZE + j
                if MAGIC_SQUARE[index] in self.x_plays:
                    board_string += "| X "
                elif MAGIC_SQUARE[index] in self.o_plays:
                    board_string += "| O "
                else:
                    board_string += "|   "
            board_string += "|\n"
        return board_string

    def x_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.x_plays, BOARD_SIZE))

    def o_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.o_plays, BOARD_SIZE))

    def is_over(self) -> bool:
        return not self.get_available() or self.x_win() or self.o_win()

    def value(self) -> int:
        if self.x_win():
            return 1
        elif self.o_win:
            return -1
        else:
            return 0

    def play(self, square: int, player: int) -> None:
        if player == 0:
            self.x_plays.add(square)
        else:
            self.o_plays.add(square)

    def get_available(self) -> set:
        return set(MAGIC_SQUARE) - self.x_plays - self.o_plays

    def get_hashable(self) -> tuple[frozenset]:
        return frozenset(self.x_plays), frozenset(self.o_plays)

## Random game


In [128]:
def random_game():
    history = list()
    state = BoardState()
    available = list(state.get_available())
    while available:
        x = choice(available)
        state.play(x, X)
        available.remove(x)
        history.append(deepcopy(state))
        if state.is_over():
            break

        o = choice(available)
        state.play(o, O)
        available.remove(o)
        history.append(deepcopy(state))
        if state.is_over():
            break

    return history

In [129]:
last = random_game()[-1]
print(last)

| O | O | X |
| X | X | X |
| O |   |   |



## Montecarlo reinforcement learning

Code adapted from the lecture.
Here, every episode, a game is played randomly, and the value of each state the game was in is updated according to $V(s_t) \leftarrow V(s_t) + \alpha [G_t - V(s_t)]$.


In [130]:
value_dict = defaultdict(float)
hit_state = defaultdict(int)

LEARNING_RATE = 1e-4
EPISODES = 1_000_000

for steps in tqdm(range(EPISODES)):
    history = random_game()
    final_reward = history[-1].value()
    for state in history:
        hashable_state = state.get_hashable()
        hit_state[hashable_state] += 1
        value_dict[hashable_state] = value_dict[hashable_state] + LEARNING_RATE * (
            final_reward - value_dict[hashable_state]
        )

  0%|          | 0/1000000 [00:00<?, ?it/s]

In [131]:
top = sorted(value_dict.items(), key=lambda e: e[1], reverse=True)[:10]
top

[((frozenset({1, 3, 4, 7, 8}), frozenset({2, 5, 6, 9})), 0.38995631623385146),
 ((frozenset({5}), frozenset()), 0.3875577017337914),
 ((frozenset({1, 3, 4, 5, 7}), frozenset({2, 6, 8, 9})), 0.3862848672236049),
 ((frozenset({1, 2, 3, 4, 8}), frozenset({5, 6, 7, 9})), 0.38603931978670464),
 ((frozenset({1, 4, 5, 7, 9}), frozenset({2, 3, 6, 8})), 0.38474858302110837),
 ((frozenset({1, 2, 4, 6, 7}), frozenset({3, 5, 8, 9})), 0.3843177347071323),
 ((frozenset({2, 4, 7, 8, 9}), frozenset({1, 3, 5, 6})), 0.38394819602019487),
 ((frozenset({2, 6, 7, 8, 9}), frozenset({1, 3, 4, 5})), 0.38320845311068497),
 ((frozenset({4, 5, 6, 7, 8}), frozenset({1, 2, 3, 9})), 0.3828382486217784),
 ((frozenset({1, 2, 3, 5, 7}), frozenset({4, 6, 8, 9})), 0.3825913221927066)]

In [132]:
print(BoardState(set(top[0][0][0]), set(top[0][0][1])))

| O | X | O |
| O | O | X |
| X | X | X |



## Q-Learning

Here I implemented a Q-Learning agent, applying the formula $Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha(R_{t+1} + \gamma max_a Q(S_{t+1}, a) - Q(S_t, A_t))$ to update the q-table. The q-table is updated every action (instead of every episode like in Montecarlo), which means that we can take advantage of that to reward specific behaviours such as blocking an opponent's win.


In [33]:
def do_action(state: BoardState, action: int, player: int) -> BoardState:
    new_state = deepcopy(state)
    new_state.play(action, player)
    return new_state

In [34]:
def reward(current_state: BoardState, next_state: BoardState, action: int) -> float:
    if is_block(current_state, action):
        return 5
    elif didnt_block(current_state, action):
        return -5
    elif next_state.x_win():
        return 10
    elif next_state.o_win():
        return -10
    return 0


def is_block(state: BoardState, action: int) -> float:
    opponent_tiles = state.o_plays
    return sum(opponent_tiles) + action == GOAL_SUM


def didnt_block(state: BoardState, action: int) -> float:
    opponent_tiles = state.o_plays
    available_actions = state.get_available()
    block = any(sum(opponent_tiles) + a == GOAL_SUM for a in available_actions)
    return block and (sum(opponent_tiles) + action != GOAL_SUM)

In [35]:
q_table = defaultdict(lambda: 0.0)


DISCOUNT_RATE = 0.99
LEARNING_RATE = 0.01
EPISODES = 500_000
EPSILON_DECAY = 0.995
epsilon = 0.25
start_state = BoardState()
episode_rewards = []


for episode in tqdm(range(EPISODES)):
    current_state = start_state
    total_reward = 0

    while not (current_state.x_win() or current_state.o_win()) and len(
        current_state.get_available()
    ):
        if random() < episode:  # explore
            new_action = randint(0, BOARD_SIZE**2)
        else:  # exploit
            hashable_state = current_state.get_hashable()
            possible_actions = list(current_state.get_available())
            q_values = [q_table[(hashable_state, a)] for a in possible_actions]
            new_action = possible_actions[np.argmax(q_values)]

        new_state = do_action(current_state, new_action, X)
        new_reward = reward(current_state, new_state, new_action)
        hashable_state_action = (current_state.get_hashable(), new_action)

        current_q = q_table[hashable_state_action]

        possible_states_actions = [
            (do_action(new_state, a, O).get_hashable(), a)
            for a in new_state.get_available()
        ]

        next_actions_and_states = [
            q_table[state_action] for state_action in possible_states_actions
        ]

        if next_actions_and_states:
            max_next_q = -np.max(next_actions_and_states)
        else:
            max_next_q = 0

        target_q = new_reward + DISCOUNT_RATE * max_next_q
        q_table[hashable_state_action] += LEARNING_RATE * (target_q - current_q)

        total_reward += new_reward

        if new_state.get_available():
            new_state = do_action(new_state, choice(list(new_state.get_available())), O)
        current_state = new_state

    epsilon *= EPSILON_DECAY
    episode_rewards.append(total_reward)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [36]:
from pprint import pprint

pprint(q_table)

defaultdict(<function <lambda> at 0x7faa3c164ea0>,
            {((frozenset(), frozenset()), 0): 0.0,
             ((frozenset(), frozenset()), 1): 0.0,
             ((frozenset(), frozenset()), 2): 0.0,
             ((frozenset(), frozenset()), 3): 0.0,
             ((frozenset(), frozenset()), 4): 0.0,
             ((frozenset(), frozenset()), 5): 0.0,
             ((frozenset(), frozenset()), 6): 0.0,
             ((frozenset(), frozenset()), 7): 0.0,
             ((frozenset(), frozenset()), 8): 0.0,
             ((frozenset(), frozenset()), 9): 0.0,
             ((frozenset({1}), frozenset({2})), 2): 0.0,
             ((frozenset({1}), frozenset({3})), 3): 0.0,
             ((frozenset({1}), frozenset({4})), 4): 0.0,
             ((frozenset({1}), frozenset({5})), 5): 0.0,
             ((frozenset({1}), frozenset({6})), 6): -0.6123948850051602,
             ((frozenset({1}), frozenset({7})), 7): -0.5680756414193536,
             ((frozenset({1}), frozenset({8})), 8): -0.4780896249

In [37]:
def q_move(state: BoardState) -> int:
    possible = state.get_available()
    best_q = float("-inf")
    best_move = None

    for move in possible:
        move_q = q_table[(state.get_hashable(), move)]
        if move_q > best_q:
            best_q = move_q
            best_move = move

    return best_move


def random_move(state: BoardState) -> int:
    possible = list(state.get_available())
    return choice(possible)


def game(selector_x: callable, selector_o: callable) -> BoardState:
    current_state = BoardState()
    current_player = X

    while not (current_state.is_over()):
        if current_player == X:
            action = selector_x(current_state)
        else:
            action = selector_o(current_state)

        temp_state = deepcopy(current_state)
        temp_state.play(action, current_player)
        current_state = temp_state

        current_player = 1 - current_player

    return current_state

In [39]:
SIMULATIONS = 5_000


def simulate_games(n_games: int, move_x: callable, move_y: callable) -> tuple[int]:
    x_wins = 0
    o_wins = 0
    draws = 0
    for _ in range(n_games):
        game_state = game(move_x, move_y)
        if game_state.x_win():
            x_wins += 1
        elif game_state.o_win():
            o_wins += 1
        else:
            draws += 1
    return x_wins, draws, o_wins


q_wins, draws, opponent_wins = simulate_games(SIMULATIONS, q_move, random_move)

total_games = q_wins + opponent_wins + draws
winning_percentage = q_wins / total_games if total_games != 0 else 0
draw_percentage = draws / total_games if total_games != 0 else 0
loss_percentage = opponent_wins / total_games if total_games != 0 else 0

print(
    f"Q-LEARNING AGAINST RANDOM Wins: {q_wins}, Draws: {draws}, Losses: {opponent_wins}"
)
print(
    f"Winning Percentage: {winning_percentage:.2%}\nDraw Percentage: {draw_percentage:.2%}\nLoss Percentage: {loss_percentage:.2%}"
)

Q-LEARNING AGAINST RANDOM Wins: 3347, Draws: 430, Losses: 1223
Winning Percentage: 66.94%
Draw Percentage: 8.60%
Loss Percentage: 24.46%
