# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [1]:
import pickle
import itertools
import numpy as np
from tqdm.auto import tqdm
from collections import defaultdict, namedtuple
from copy import deepcopy
from enum import Enum
import random

In [2]:
# Board state: `x` and `o` are the sets of cells taken by each player.
State = namedtuple('State', ['x', 'o'])


class Player(Enum):
    """The two sides of a tic-tac-toe game."""
    # No trailing comma here: `X = 1,` would make the member value the
    # tuple (1,) instead of the integer 1.
    X = 1
    O = 0


# Reward values handed to the learner at the end of a training game
WIN_SCORE = 2
LOSE_SCORE = -5
DRAW_SCORE = -1

# Number of self-training games
TRAINING_EPOCHS = 500_000

# Number of games played when evaluating the trained agent
NUM_TEST_GAMES = 10_000

#### The game index map relates two numberings of the board: the usual row-by-row numbering (left) and a second one in which the numbers follow a spiral order 🌀 (right)
 0 1 2 ➜ 0 1 2 

 3 4 5 ➜ 7 8 3  

 6 7 8 ➜ 6 5 4

In [3]:
# Spiral number -> row-major cell index. The tuple below is the board read
# row by row, each cell holding its spiral number, so enumerating it yields
# (row-major index, spiral number) pairs.
GAME_INDEX_MAP = {
    spiral: cell
    for cell, spiral in enumerate((0, 1, 2,
                                   7, 8, 3,
                                   6, 5, 4))
}

# 3x3 magic square (row-major); it is summed over a player's cells to
# detect a win (three cells adding up to 15).
MAGIC = [
    4, 9, 2,
    3, 5, 7,
    8, 1, 6,
]

In [4]:
class TicTacToe:
    """A tic-tac-toe game whose cells are numbered in spiral order.

    Attributes:
        state: ``State`` named tuple holding the set of cells taken by X and
            the set taken by O.
        move: number of moves played so far.
        my_player: the side from whose point of view ``evaluate_match`` scores.
    """

    def __init__(self):
        self.state = State(set(), set())
        self.move = 0
        self.my_player = Player.X

    def sum_magic(self, comb: tuple) -> int:
        """Return the sum of the magic-square values of the cells in *comb*."""
        return sum(MAGIC[GAME_INDEX_MAP[i]] for i in comb)

    def check_win(self, player: Player) -> bool:
        """Return True if *player* holds three cells whose magic values sum to 15."""
        # Nobody can own three cells before the fifth move of the game.
        if self.move < 5:
            return False
        if player == Player.X:
            cells = self.state.x
        elif player == Player.O:
            cells = self.state.o
        else:
            # Unknown player: report "no win" instead of falling through to None.
            return False
        return any(self.sum_magic(comb) == 15 for comb in itertools.combinations(cells, 3))

    def check_draw(self) -> bool:
        """Return True if all nine moves were played and neither side won."""
        return (self.move == 9
                and not self.check_win(Player.X)
                and not self.check_win(Player.O))

    def move_done(self, move: int, player: Player) -> None:
        """Record that *player* took cell *move* and bump the move counter.

        A *player* that is neither X nor O still increments the counter but
        marks no cell.
        """
        self.move += 1
        if player == Player.X:
            self.state.x.add(move)
        elif player == Player.O:
            self.state.o.add(move)

    def evaluate_match(self) -> "int | None":
        """Score the current position from ``my_player``'s point of view.

        Returns:
            1 if ``my_player`` has won, -1 if the opponent has won, 0 for a
            draw, and None while the game is still undecided.
        """
        opponent = Player.O if self.my_player == Player.X else Player.X
        if self.check_win(self.my_player):
            return 1
        if self.check_win(opponent):
            return -1
        if self.check_draw():
            return 0
        return None

    def good_print(self):
        """Print the game board in a nice way, using the spiral order."""
        # Labels of the free cells, listed row by row (spiral numbers).
        num = ['0️⃣', '1️⃣', '2️⃣', '7️⃣', '8️⃣', '3️⃣', '6️⃣', '5️⃣', '4️⃣']
        # Translate both players' cells to row-major indices once, instead of
        # rebuilding the lists for every cell of the board.
        x_cells = {GAME_INDEX_MAP[i] for i in self.state.x}
        o_cells = {GAME_INDEX_MAP[i] for i in self.state.o}
        for r in range(3):
            print('|', end='')
            for c in range(3):
                val = r * 3 + c
                if val in x_cells:
                    print('✖️|', end='')
                elif val in o_cells:
                    print('⭕|', end='')
                else:
                    print(f'{num[val]}|', end="")
            print()
        print()

### Symmetry function

These two symmetry functions are used to check whether a rotated version of a state is already in the Q table; if there is one, the Q value of that stored state is updated and no new state is added.
This approach reduces the size of the Q table and speeds up the learning process.


In [5]:
def rotate_90_right(value):
    """Return the cell index that *value* maps to under this rotation step.

    Values below 6 are shifted up by two, 6 and 7 wrap around to 0 and 1,
    and anything larger (such as 8) is returned unchanged.
    """
    if value < 6:
        return value + 2
    if value <= 7:
        # 6 -> 0 and 7 -> 1
        return value - 6
    return value


def rotate_state_90_right(policy: tuple[tuple[frozenset, frozenset], int]) -> tuple[tuple[frozenset, frozenset], int]:
    """Rotate a ``((x_cells, o_cells), action)`` pair by one rotation step.

    Every cell of both players and the action itself are passed through
    ``rotate_90_right``; the result has the same shape as the input.
    """
    (x_cells, o_cells), action = policy
    turned_state = (
        frozenset(rotate_90_right(cell) for cell in x_cells),
        frozenset(rotate_90_right(cell) for cell in o_cells),
    )
    return turned_state, rotate_90_right(action)

In [6]:
class ReinforcedPlayer:
    """Tic-tac-toe agent that learns a table of (state, action) values.

    Attributes:
        Q: table mapping ``((x_cells, o_cells), action)`` to a value. Only one
            of the four rotations of a pair is ever stored.
        epsilon: used both as the probability of a random move in
            ``choose_action`` and as the step size in ``update_Q_value``.
        training_epochs: number of games played by ``training``.
    """

    def __init__(self):
        self.Q = defaultdict(float)
        self.epsilon = 0.01
        self.training_epochs = TRAINING_EPOCHS

    def _stored_key(self, state, action):
        """Return the Q-table key that represents ``(state, action)``.

        The three rotations of the pair are looked up first (90, 180, 270),
        then the pair itself. If none of them is in the table yet, the
        un-rotated pair is inserted with value 0.0 and returned.
        """
        policy = ((frozenset(state.x), frozenset(state.o)), action)
        policy_90 = rotate_state_90_right(policy)
        policy_180 = rotate_state_90_right(policy_90)
        policy_270 = rotate_state_90_right(policy_180)

        for candidate in (policy_90, policy_180, policy_270, policy):
            if candidate in self.Q:
                return candidate

        self.Q[policy] = 0.0
        return policy

    def get_Q_value(self, state, action):
        """Return the stored value of ``(state, action)`` or of one of its rotations.

        An entry with value 0.0 is created when the pair is not in the table
        under any rotation.
        """
        return self.Q[self._stored_key(state, action)]

    def update_Q_value(self, state, action: int, reward):
        """Move the value of ``(state, action)`` towards *reward*.

        The entry that represents the pair (possibly a rotation of it) is
        updated in place as ``q + epsilon * (reward - q)``.
        """
        key = self._stored_key(state, action)
        current = self.Q[key]
        self.Q[key] = current + self.epsilon * (reward - current)

    def choose_action(self, state, available_moves):
        """Pick a move from *available_moves* for the given *state*.

        With probability ``epsilon`` a random move is returned; otherwise the
        move with the highest Q value, ties being broken at random.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(available_moves)

        Q_values = [self.get_Q_value(state, action) for action in available_moves]
        max_Q = max(Q_values)
        best_moves = [i for i, q in enumerate(Q_values) if q == max_Q]
        # Only draw a random number when there really is a tie.
        i = random.choice(best_moves) if len(best_moves) > 1 else best_moves[0]
        return available_moves[i]

    def training(self):
        """Play ``training_epochs`` games, learn from each one, then save the agent."""
        for _ in tqdm(range(self.training_epochs)):
            new_game = TicTacToe()
            trajectory, reward = self.random_game(new_game)
            # Every (state, action) pair of the game is credited with the final reward.
            for state, action in trajectory:
                self.update_Q_value(state=state, action=action, reward=reward)
        save_model(self,
                   f"EPOCHS_{str(self.training_epochs)}-LOSE_{LOSE_SCORE}-WIN_{WIN_SCORE}-DRAW_{DRAW_SCORE}-NUOVA_RAPP")

    def random_game(self, game: TicTacToe):
        """Play one game against a random opponent.

        The agent's side (X or O) and the side that moves first are both
        chosen at random. The agent moves through ``choose_action``; the
        opponent moves uniformly at random.

        Returns:
            A tuple ``(trajectory, reward)``. ``trajectory`` is the list of
            ``(state, move)`` pairs seen by the agent, each state being a copy
            taken before the move. ``reward`` is ``WIN_SCORE`` if the agent
            won, ``LOSE_SCORE`` if it lost and ``DRAW_SCORE`` otherwise.
        """
        trajectory = []
        available_moves = list(range(0, 9))
        # A random player starts (turn is flipped before the first move).
        turn = np.random.choice([0, 1])

        players = [Player.X, Player.O]
        # Train on a random side so the agent can play both X and O.
        game.my_player = random.choice(players)

        while available_moves and not game.check_draw():
            turn = 1 - turn

            if game.my_player == players[turn]:
                move = self.choose_action(game.state, available_moves)
                # Copy the state: the game keeps mutating its sets afterwards.
                trajectory.append((deepcopy(game.state), move))
            else:
                # Convert to a plain int so that no numpy scalar ends up in
                # the stored states (and hence in the pickled Q table).
                move = int(np.random.choice(available_moves))

            available_moves.remove(move)
            game.move_done(move, players[turn])

            if game.check_win(players[turn]):
                if game.my_player == players[turn]:
                    return trajectory, WIN_SCORE
                return trajectory, LOSE_SCORE

        return trajectory, DRAW_SCORE

In [7]:
# Helpers to save and load the trained player, to avoid retraining the model every time

def save_model(model: ReinforcedPlayer, text: str = None):
    """Pickle *model* to ``models/agent-{text}.pkl``.

    The ``models`` directory is created if it is missing, so that a finished
    training run is not lost to a ``FileNotFoundError`` at save time.

    Args:
        model: the trained player to serialize.
        text: label inserted into the file name.
    """
    import os  # local import: only needed to make sure the target directory exists

    os.makedirs('models', exist_ok=True)
    # Serialize the object and write it to a file
    with open(f'models/agent-{text}.pkl', 'wb') as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_model(path: str) -> ReinforcedPlayer:
    """Read a pickled player back from *path* and return it.

    NOTE: unpickling can execute arbitrary code, so only load files that you
    created yourself or otherwise trust.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)

In [8]:
# Set to True to train a fresh agent; training() also saves it to disk.
# NOTE(review): with train = False nothing defines `rr`; load a saved agent
# first, for example:
#   rr = load_model('models/agent-EPOCHS_1000000-LOSE_-5-WIN_2-DRAW_-1-NUOVA_RAPP.pkl')
#   rr2 = load_model('models/agent-EPOCHS_100000-LOSE_-5-WIN_2-DRAW_-1-NUOVA_RAPP.pkl')
train = True

if train:
    rr = ReinforcedPlayer()
    rr.training()
    # rr2 = load_model('models/agent-EPOCHS_10000000.pkl')

  0%|          | 0/500000 [00:00<?, ?it/s]

In [10]:
# Evaluate the agent `rr` over NUM_TEST_GAMES games against an opponent that
# moves uniformly at random. The agent's side and the starting side are
# picked at random for every game.
wins = 0
draws = 0
for _ in range(NUM_TEST_GAMES):
    game = TicTacToe()
    turn = np.random.choice([0, 1])
    players = [Player.X, Player.O]
    rr_player = random.choice(players)
    available_moves = list(range(0, 9))
    while available_moves and not game.check_draw():
        turn = 1 - turn
        rr_to_move = players[turn] == rr_player
        if rr_to_move:
            move = rr.choose_action(game.state, available_moves)
        else:
            # Alternative opponents: a human (`int(input("Enter your move: "))`)
            # or a second agent (`rr2.choose_action(game.state, available_moves)`).
            move = random.choice(available_moves)

        available_moves.remove(move)
        game.move_done(move, players[turn])
        if not game.check_win(players[turn]):
            continue
        # The side that just moved has won; only the agent's wins are counted.
        if rr_to_move:
            wins += 1
        break

    if game.check_draw():
        draws += 1

# Every game that is neither a win nor a draw was lost by the agent.
print(f'wins : {wins}, draws: {draws}, lost: {NUM_TEST_GAMES - wins - draws}')


wins : 9196, draws: 708, lost: 96
