Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.


# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

- Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
- Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

- Reviews will be assigned on Monday, December 4
- You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)


In [68]:
import numpy as np
from random import choice
from copy import deepcopy
from itertools import combinations
from collections import defaultdict
from tqdm.auto import tqdm

## State definition


In [80]:
MAGIC_SQUARE = [2, 7, 6, 9, 5, 1, 4, 3, 8]
GOAL_SUM = 15
BOARD_SIZE = 3


class BoardState:
    def __init__(self) -> None:
        self.x_plays = set()
        self.o_plays = set()

    def __init__(self, x: set, o: set) -> None:
        self.x_plays = x
        self.o_plays = o

    def __str__(self) -> str:
        board_string = ""
        for i in range(BOARD_SIZE):
            for j in range(BOARD_SIZE):
                index = i * BOARD_SIZE + j
                if MAGIC_SQUARE[index] in self.x_plays:
                    board_string += "| X "
                elif MAGIC_SQUARE[index] in self.o_plays:
                    board_string += "| O "
                else:
                    board_string += "|   "
            board_string += "|\n"
        return board_string

    def x_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.x_plays, BOARD_SIZE))

    def o_win(self) -> bool:
        return any(sum(c) == GOAL_SUM for c in combinations(self.o_plays, BOARD_SIZE))

    def value(self) -> int:
        if self.x_win():
            return 1
        elif self.o_win:
            return -1
        else:
            return 0

    def play_x(self, square: int) -> None:
        self.x_plays.add(square)

    def play_o(self, square: int) -> None:
        self.o_plays.add(square)

    def get_available(self) -> set:
        return set(MAGIC_SQUARE) - self.x_plays - self.o_plays

    def get_hashable(self) -> tuple[frozenset]:
        return frozenset(self.x_plays), frozenset(self.o_plays)

In [53]:
print(BoardState())

|   |   |   |
|   |   |   |
|   |   |   |



## Random game


In [56]:
def random_game():
    history = list()
    state = BoardState()
    available = list(state.get_available())
    while available:
        x = choice(available)
        state.play_x(x)
        available.remove(x)
        history.append(deepcopy(state))
        if state.x_win() or not available:
            break

        o = choice(available)
        state.play_o(o)
        available.remove(o)
        history.append(deepcopy(state))
        if state.o_win() or not available:
            break

    return history

In [67]:
last = random_game()[-1]
print(last)

|   | X | X |
| O | O | O |
| X | X | O |



## Montecarlo reinforcement learning


In [78]:
value_dict = defaultdict(float)
hit_state = defaultdict(int)

LEARNING_RATE = 1e-4
EPISODES = 1_000_000

for steps in tqdm(range(EPISODES)):
    history = random_game()
    final_reward = history[-1].value()
    for state in history:
        hashable_state = state.get_hashable()
        hit_state[hashable_state] += 1
        value_dict[hashable_state] = value_dict[hashable_state] + LEARNING_RATE * (
            final_reward - value_dict[hashable_state]
        )

  0%|          | 0/1000000 [00:00<?, ?it/s]

100%|██████████| 1000000/1000000 [02:19<00:00, 7177.39it/s]


In [93]:
sorted(value_dict.items(), key=lambda e: e[1], reverse=True)[:10]

[((frozenset({5}), frozenset()), 0.3980218853775202),
 ((frozenset({1, 2, 3, 6, 7}), frozenset({4, 5, 8, 9})), 0.39013931103828087),
 ((frozenset({1, 3, 4, 5, 7}), frozenset({2, 6, 8, 9})), 0.3881845331165637),
 ((frozenset({1, 3, 4, 7, 8}), frozenset({2, 5, 6, 9})), 0.38769486041605083),
 ((frozenset({1, 2, 4, 5, 6}), frozenset({3, 7, 8, 9})), 0.38603931978670464),
 ((frozenset({1, 2, 3, 5, 7}), frozenset({4, 6, 8, 9})), 0.3852406119187445),
 ((frozenset({3, 4, 7, 8, 9}), frozenset({1, 2, 5, 6})), 0.3850561472106083),
 ((frozenset({2, 3, 4, 6, 9}), frozenset({1, 5, 7, 8})), 0.3840714002204728),
 ((frozenset({3, 4, 6, 8, 9}), frozenset({1, 2, 5, 7})), 0.3826530630604873),
 ((frozenset({4, 5, 6, 8, 9}), frozenset({1, 2, 3, 7})), 0.38203537644687197)]