Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)


In [16]:
import numpy as np
from random import choice
from copy import deepcopy
from itertools import combinations
from collections import defaultdict
from tqdm.auto import tqdm

## State definition


In [17]:
MAGIC_SQUARE = [2, 7, 6, 9, 5, 1, 4, 3, 8]
GOAL_SUM = 15
BOARD_SIZE = 3

X = 0
O = 1


class BoardState:
    def __init__(self, x=None, o=None) -> None:
        self.x_plays = x if x is not None else set()
        self.o_plays = o if o is not None else set()

    def __str__(self) -> str:
        board_string = ""
        for i in range(BOARD_SIZE):
            for j in range(BOARD_SIZE):
                index = i * BOARD_SIZE + j
                if MAGIC_SQUARE[index] in self.x_plays:
                    board_string += "| X "
                elif MAGIC_SQUARE[index] in self.o_plays:
                    board_string += "| O "
                else:
                    board_string += "|   "
            board_string += "|\n"
        return board_string

    def x_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.x_plays, BOARD_SIZE))

    def o_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.o_plays, BOARD_SIZE))

    def is_over(self) -> bool:
        return not self.get_available() or self.x_win() or self.o_win()

    def value(self) -> int:
        if self.x_win():
            return 1
        elif self.o_win:
            return -1
        else:
            return 0

    def play(self, square: int, player: int) -> None:
        if player == 0:
            self.x_plays.add(square)
        else:
            self.o_plays.add(square)

    def get_available(self) -> set:
        return set(MAGIC_SQUARE) - self.x_plays - self.o_plays

    def get_hashable(self) -> tuple[frozenset]:
        return frozenset(self.x_plays), frozenset(self.o_plays)

## Random game


In [18]:
def random_game():
    history = list()
    state = BoardState()
    available = list(state.get_available())
    while available:
        x = choice(available)
        state.play(x, X)
        available.remove(x)
        history.append(deepcopy(state))
        if state.is_over():
            break

        o = choice(available)
        state.play(o, O)
        available.remove(o)
        history.append(deepcopy(state))
        if state.is_over():
            break

    return history

In [19]:
last = random_game()[-1]
print(last)

| X | O | O |
| X | O | X |
| O | X |   |



## Montecarlo reinforcement learning

Code adapted from the lecture.
Here, every episode, a game is played randomly, and the value of each state the game was in is updated according to $V(s_t) \leftarrow V(s_t) + \alpha [G_t - V(s_t)]$.


In [20]:
value_dict = defaultdict(float)
hit_state = defaultdict(int)

LEARNING_RATE = 1e-4
EPISODES = 1_000_000

for steps in tqdm(range(EPISODES)):
    history = random_game()
    final_reward = history[-1].value()
    for state in history:
        hashable_state = state.get_hashable()
        hit_state[hashable_state] += 1
        value_dict[hashable_state] = value_dict[hashable_state] + LEARNING_RATE * (
            final_reward - value_dict[hashable_state]
        )

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [02:33<00:00, 6502.16it/s]


In [21]:
top = sorted(value_dict.items(), key=lambda e: e[1], reverse=True)[:10]
top

[((frozenset({5}), frozenset()), 0.3883419319156682),
 ((frozenset({1, 3, 5, 6, 9}), frozenset({2, 4, 7, 8})), 0.38689830625820437),
 ((frozenset({3, 4, 6, 8, 9}), frozenset({1, 2, 5, 7})), 0.38677566752395254),
 ((frozenset({1, 4, 5, 6, 7}), frozenset({2, 3, 8, 9})), 0.38462551427770875),
 ((frozenset({1, 3, 4, 5, 7}), frozenset({2, 6, 8, 9})), 0.3843177347071323),
 ((frozenset({1, 2, 5, 7, 8}), frozenset({3, 4, 6, 9})), 0.3831467677874637),
 ((frozenset({4, 5, 6, 8, 9}), frozenset({1, 2, 3, 7})), 0.3825295751502216),
 ((frozenset({2, 3, 5, 6, 8}), frozenset({1, 4, 7, 9})), 0.3825295751502216),
 ((frozenset({1, 5, 7, 8, 9}), frozenset({2, 3, 4, 6})), 0.3824060625386687),
 ((frozenset({1, 2, 4, 5, 6}), frozenset({3, 7, 8, 9})), 0.3822825252208876)]

In [22]:
print(BoardState(set(top[0][0][0]), set(top[0][0][1])))

|   |   |   |
|   | X |   |
|   |   |   |



## Q-Learning


In [23]:
def do_action(state: BoardState, action: int, player: int) -> BoardState:
    new_state = deepcopy(state)
    new_state.play(action, player)
    return new_state

In [31]:
q_table = defaultdict(lambda: 0.0, {})


DISCOUNT_RATE = 0.9
LEARNING_RATE = 1e-4
EPISODES = 1_000_000
start_state = BoardState()

for episode in tqdm(range(EPISODES)):
    current_state = start_state
    total_reward = 0

    while not (current_state.x_win() or current_state.o_win()) and len(
        current_state.get_available()
    ):
        new_action = choice(list(current_state.get_available()))
        new_state = do_action(current_state, new_action, X)
        new_reward = new_state.value()
        hashable_state = new_state.get_hashable()

        current_q = q_table[(hashable_state, new_action)]

        possible_states = [
            ((hashable_state, a), do_action(new_state, a, O).get_hashable())
            for a in list(new_state.get_available())
        ]

        next_actions_and_states = [
            q_table[state_action] for _, state_action in possible_states
        ]

        if next_actions_and_states:
            max_next_q = -max(next_actions_and_states)
        else:
            break

        target_q = new_reward + DISCOUNT_RATE * max_next_q
        q_table[(hashable_state, new_action)] = q_table.get(
            (hashable_state, new_action), 0
        ) + LEARNING_RATE * (target_q - current_q)

        total_reward += new_reward

        new_state = do_action(new_state, choice(list(new_state.get_available())), O)
        current_state = new_state

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [07:30<00:00, 2218.28it/s]

Training complete.





In [34]:
def q_move(state: BoardState) -> int:
    possible = list(state.get_available())
    best_q = float("-inf")
    best_move = None

    for move in possible:
        move_q = q_table[(state.get_hashable(), move)]
        if move_q > best_q:
            best_q = move_q
            best_move = move

    return best_move


def random_move(state: BoardState) -> int:
    possible = list(state.get_available())
    return choice(possible)


def game(selector_x: callable, selector_o: callable) -> BoardState:
    current_state = BoardState()
    current_player = X

    while not (current_state.is_over()):
        if current_player == X:
            action = selector_x(current_state)
        else:
            action = selector_o(current_state)

        temp_state = deepcopy(current_state)
        temp_state.play(action, current_player)
        current_state = temp_state

        current_player = 1 - current_player

    return current_state

In [41]:
SIMULATIONS = 5_000


def simulate_games(n_games: int, move_x: callable, move_y: callable) -> tuple[int]:
    x_wins = 0
    o_wins = 0
    draws = 0
    for _ in range(n_games):
        game_state = game(move_x, move_y)
        if game_state.x_win():
            x_wins += 1
        elif game_state.o_win():
            o_wins += 1
        else:
            draws += 1
    return x_wins, draws, o_wins


q_wins, draws, opponent_wins = simulate_games(SIMULATIONS, q_move, random_move)

total_games = q_wins + opponent_wins + draws
winning_percentage = q_wins / total_games if total_games != 0 else 0
draw_percentage = draws / total_games if total_games != 0 else 0
loss_percentage = opponent_wins / total_games if total_games != 0 else 0

print(
    f"Q-LEARNING AGAINST RANDOM Wins: {q_wins}, Draws: {draws}, Losses: {opponent_wins}"
)
print(
    f"Winning Percentage: {winning_percentage:.2%}\nDraw Percentage: {draw_percentage:.2%}\nLoss Percentage: {draw_percentage:.2%}"
)

Q-LEARNING AGAINST RANDOM Wins: 2518, Draws: 926, Losses: 1556
Winning Percentage: 50.36%
Draw Percentage: 18.52%
Loss Percentage: 18.52%
