Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [122]:
from tqdm import tqdm
import numpy as np
from copy import deepcopy
import pickle
import random

# Game Setup:

In [123]:
class TicTacToe:
    def __init__(self):
        self.board = np.zeros((3, 3), dtype=int)  # 0 = empty, 1 = X, -1 = O
        self.current_player = random.choice([1,-1]) 

    def reset(self):
        self.board = np.zeros((3, 3), dtype=int)
        self.current_player *= -1

    def make_move(self, row, col):
        if self.board[row, col] == 0:
            self.board[row, col] = self.current_player
            self.current_player *= -1
            return True
        return False

    def is_winner(self, player):
        # check rows, cols and diags
        for i in range(3):
            if np.all(self.board[i, :] == player) or np.all(self.board[:, i] == player):
                return True
        if np.all(np.diag(self.board) == player) or np.all(np.diag(np.fliplr(self.board)) == player):
            return True
        return False

    def is_draw(self):
        return np.all(self.board != 0)

    def get_available_moves(self):
        return [(i, j) for i in range(3) for j in range(3) if self.board[i, j] == 0]

    def print_board(self):
        symbols = {0: '⬜️', 1: '❌', -1: '🟢'}
        for row in self.board:
            print(' '.join([symbols[cell] for cell in row]))
        print()

    def print_result(self, player1, player2):
        if self.is_winner(player1.symbol):
            print("❌ has won!")
        elif self.is_winner(player2.symbol):
            print("🟢 has won!")
        elif self.is_draw():
            print("Draw!")

## Random Player

In [124]:
class RandomPlayer:
    def __init__(self, symbol):
        self.symbol = symbol  # 1 per X, -1 per O

    def choose_move(self, game):
        available_moves = game.get_available_moves()
        if available_moves: 
            return random.choice(available_moves) 
        else:
            return None 

## Q-Learning Player:

In [125]:
class QLearningPlayer:
    def __init__(self, symbol, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.symbol = symbol  # 1 per X, -1 per O
        self.alpha = alpha    # learning rate
        self.gamma = gamma    # discount factor
        self.epsilon = epsilon  # exploring prob (epsilon greedy) 
        self.q_table = {}  # state -> action -> value

    def get_state(self, game):
        return str(game.board.reshape(9))

    def choose_move(self, game):
        state = self.get_state(game)
        if np.random.rand() < self.epsilon:
            # exploring
            return random.choice(game.get_available_moves())
        else:
            # exploiting
            self.q_table.setdefault(state, {})
            if not self.q_table[state]:
                # if none -> rand
                return random.choice(game.get_available_moves())
            best_move = max(self.q_table[state], key=self.q_table[state].get)
            return eval(best_move)

    def update_q_values(self, prev_state, action, reward, next_state):
        self.q_table.setdefault(prev_state, {})
        self.q_table.setdefault(next_state, {})
        prev_q = self.q_table[prev_state].get(str(action), 0)
        max_next_q = max(self.q_table[next_state].values(), default=0)
        self.q_table[prev_state][str(action)] = prev_q + self.alpha * (reward + self.gamma * max_next_q - prev_q)

    def update_q_table(self, game_history):
        # game_history = list (state, action, reward, next_state)
        for i in range(len(game_history) - 1):
            state, action, reward, next_state = game_history[i]
            self.update_q_values(state, action, reward, next_state)

        # update q-values for final_state
        final_state, final_action, final_reward, _ = game_history[-1]
        self.update_q_values(final_state, final_action, final_reward, final_state)

    def receive_reward(self, game):
        # rewars values
        if game.is_winner(self.symbol):
            return 1  # win
        elif game.is_draw():
            return 0  # draw, maybe -0.5
        else:
            return -1  # loss

    def save_q_table(self, filename='q_table.pkl'):
        with open(filename, 'wb') as f:
            pickle.dump(self.q_table, f)
        print(f"Q-table saved in {filename}.")

    def load_q_table(self, filename='q_table.pkl'):
        with open(filename, 'rb') as f:
            self.q_table = pickle.load(f)
        print(f"Q-table loaded from {filename}.")

## Training session:

In [135]:
def play_game(q_player, random_player, episodes=100000):
    for episode in range(episodes):
        game = TicTacToe()
        game_history = []  # init game history

        while True:
            current_player = q_player if game.current_player == q_player.symbol else random_player
            move = current_player.choose_move(game)
            if move is None:  # if none break loop
                break

            # save current_state if qlplayer 
            if isinstance(current_player, QLearningPlayer):
                prev_state = current_player.get_state(game)
            
            game.make_move(*move)

            if isinstance(current_player, QLearningPlayer):
                next_state = current_player.get_state(game)
                reward = 0 
                game_history.append((prev_state, move, reward, next_state))

            # winner check
            if game.is_winner(q_player.symbol) or game.is_winner(random_player.symbol) or game.is_draw():
                final_reward = q_player.receive_reward(game)  # final reward
                
                # update reward for qlplayer
                if isinstance(current_player, QLearningPlayer) and game_history:
                    prev_state, move, _, next_state = game_history[-1]
                    game_history[-1] = (prev_state, move, final_reward, next_state)
                break

        # update qtable with complete history
        if game_history:
            q_player.update_q_table(game_history)

        game.reset()  # next game

    print("Learning completed after {} episodes".format(episodes))


In [136]:
player1 = QLearningPlayer(symbol=1)
player2 = RandomPlayer(symbol=-1)
#player1.load_q_table('q_table_final.pkl')
play_game(player1, player2)
player1.save_q_table('q_table_final.pkl')


Learning completed after 100000 episodes
Q-table salvata in q_table_final.pkl.


# Testing:

In [137]:
def test_win_rate(q_player, random_player, episodes=1000):
    q_player.epsilon = 0  # no exploration
    results = {"wins": 0, "losses": 0, "draws": 0}
    q_player.load_q_table('q_table_final.pkl')

    for _ in tqdm(range(episodes), desc="Testing win rate"):
        game = TicTacToe()
        while True:
            current_player = q_player if game.current_player == q_player.symbol else random_player
            move = current_player.choose_move(game)
            game.make_move(*move)
            
            if game.is_winner(q_player.symbol):
                results["wins"] += 1
                break
            elif game.is_winner(random_player.symbol):
                results["losses"] += 1
                break
            elif game.is_draw():
                results["draws"] += 1
                break

        game.reset()

    win_rate = results["wins"] / episodes
    print(f"Win rate: {win_rate*100:.2f}%")
    print(f"Total games: {episodes}, Wins: {results['wins']}, Losses: {results['losses']}, Draws: {results['draws']}")

test_win_rate(player1, player2, episodes=1000)

Q-table caricata da q_table_final.pkl.


Testing win rate: 100%|██████████| 1000/1000 [00:01<00:00, 756.95it/s]

Win rate: 77.60%
Total games: 1000, Wins: 776, Losses: 165, Draws: 59





## Single Match Mode:

In [216]:
# single game
game = TicTacToe()
player1 = QLearningPlayer(symbol=1)  # X
player2 = RandomPlayer(symbol=-1)  # O
player1.load_q_table('q_table_final.pkl')

while not game.is_draw() and not game.is_winner(player1.symbol) and not game.is_winner(player2.symbol):
    if game.current_player == player1.symbol:
        move = player1.choose_move(game)
    else:
        move = player2.choose_move(game)
    
    if move:
        game.make_move(*move)
    else:
        break 
game.print_board()
game.print_result(player1, player2)


Q-table caricata da q_table_final.pkl.
🟢 🟢 ❌
🟢 🟢 ❌
❌ ❌ ❌

Il giocatore ❌ ha vinto!


In [219]:
def Me_vs_opponent(game, opponent):
    if isinstance(opponent, QLearningPlayer):
        opponent.epsilon = 0
    
    current_player = opponent if random.choice([True, False]) else "Me"
    
    print(f"{current_player} starts the game.")
    
    while True:
        game.print_board()
        
        if current_player == "Me":
            move = None
            while move is None:
                try:
                    Me_move = int(input("Choose your move (1-9): ")) - 1
                    row, col = divmod(Me_move, 3)
                    if game.make_move(row, col):
                        move = (row, col)
                    else:
                        print("Invalid move, please try again.")
                except ValueError:
                    print("Please enter a number between 1 and 9.")
                    
            if game.is_winner(-1):  # assuming Me is 🟢 (-1)
                print("I'm the best.")
                break
            current_player = opponent
        else:
            print("Let me think... 🤔")
            row, col = opponent.choose_move(game)
            game.make_move(row, col)
            print(f"Opponent chose move {row * 3 + col + 1}")
            
            if game.is_winner(1):  # assuming QLearningPlayer is ❌ (1)
                print("Opponent wins.")
                break
            current_player = "Me"
        
        if game.is_draw():
            print("Draw!")
            break
    
    game.print_board()

In [221]:
game = TicTacToe()
opponent = QLearningPlayer(symbol=1)
opponent.load_q_table('q_table_final.pkl')

Me_vs_opponent(game, opponent)

Q-table caricata da q_table_final.pkl.
Me starts the game.
⬜️ ⬜️ ⬜️
⬜️ ⬜️ ⬜️
⬜️ ⬜️ ⬜️

⬜️ 🟢 ⬜️
⬜️ ⬜️ ⬜️
⬜️ ⬜️ ⬜️

QLearningPlayer is thinking...
QLearningPlayer chose move 5
⬜️ 🟢 ⬜️
⬜️ ❌ ⬜️
⬜️ ⬜️ ⬜️

⬜️ 🟢 🟢
⬜️ ❌ ⬜️
⬜️ ⬜️ ⬜️

QLearningPlayer is thinking...
QLearningPlayer chose move 9
⬜️ 🟢 🟢
⬜️ ❌ ⬜️
⬜️ ⬜️ ❌

Congratulations! You've won.
🟢 🟢 🟢
⬜️ ❌ ⬜️
⬜️ ⬜️ ❌

