Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [2]:
from itertools import combinations
from itertools import permutations

import numpy as np
from tqdm import tqdm

In [3]:
class TicTacToe:
    """Tic-tac-toe board played by player 1 ("X") and player 2 ("O").

    ``board`` is a 3x3 array holding 0 for an empty cell and the player
    number otherwise. ``player`` is the side to move; it flips after every
    accepted move.

    Wins are detected with a magic square: every row, column and diagonal of
    ``TICTACTOE_MAP`` sums to 12, so a player has won when three of the cells
    they own carry labels adding up to 12.
    """

    def __init__(self):
        self.board = np.zeros((3, 3))
        self.TICTACTOE_MAP = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]])
        self.player = 1

    def print_state(self):
        """Print the board ("-" empty, "X" player 1, "O" player 2) and a blank line."""
        for row in self.board:
            symbols = ("-" if cell == 0 else "X" if cell == 1 else "O" for cell in row)
            # Every symbol, including the last one, is followed by a space.
            print(*symbols, end=" \n")
        print()

    def state(self):
        """Return the live board array (not a copy; callers must copy to snapshot)."""
        return self.board

    def move(self, action):
        """Place the current player's mark at ``action`` (a ``(row, col)`` index).

        Returns True and hands the turn to the other player when the cell was
        empty; returns False and leaves the game untouched otherwise.
        """
        if self.board[action] != 0:
            return False
        self.board[action] = self.player
        self.player = 3 - self.player  # 1 <-> 2
        return True

    def next_actions(self):
        """Return the empty cells as ``(row, col)`` tuples, or ``[]`` once a player has won."""
        if self.check_win(1) or self.check_win(2):
            return []
        rows, cols = np.where(self.board == 0)
        return list(zip(rows, cols))

    def check_win(self, player):
        """Return True when ``player`` owns a complete row, column or diagonal."""
        owned = self.TICTACTOE_MAP[self.board == player].tolist()
        # Order inside a triple is irrelevant, so unordered combinations are
        # enough (a sixth of the work of enumerating permutations).
        return any(sum(triple) == 12 for triple in combinations(owned, 3))

    def reward(self, player):
        """Return 1 if ``player`` has won, -1 if the opponent has won, 0 otherwise."""
        if self.check_win(player):
            return 1
        if self.check_win(3 - player):
            return -1
        return 0

    def finished(self):
        """Return True when the game is over (a win or a full board)."""
        # next_actions() is already empty after a win, so one call covers
        # both the win and the draw case.
        return not self.next_actions()

In [4]:
class Qlearning:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are stored in ``q_table``, a dict keyed by ``(state, action)``.
    Pairs that have never been updated are worth 0 and are not stored.
    """

    def __init__(self, alpha, gamma, epsilon):
        self.alpha = alpha      # learning rate: step size towards the new target
        self.gamma = gamma      # discount applied to the best next-state value
        self.epsilon = epsilon  # probability of picking a random action
        self.q_table = {}

    def set_epsilon(self, epsilon):
        """Replace the exploration probability."""
        self.epsilon = epsilon

    def get_q_value(self, state, action):
        """Return the stored Q-value of ``(state, action)``, or 0 if unseen.

        Reading never adds an entry, so the table only grows through
        ``update``.
        """
        return self.q_table.get((state, action), 0)

    def choice_action(self, state, actions):
        """Pick an action epsilon-greedily among ``actions``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise one of the actions with the highest Q-value, ties broken at
        random.

        Raises:
            ValueError: if ``actions`` is empty.
        """
        if len(actions) == 0:
            raise ValueError("choice_action() needs at least one action")
        if np.random.uniform() < self.epsilon:
            return actions[np.random.choice(range(len(actions)))]
        q_values = np.array([self.get_q_value(state, action) for action in actions])
        best = np.where(q_values == np.max(q_values))[0]
        return actions[np.random.choice(best)]

    def update(self, state, action, reward, next_state, next_actions):
        """Move Q(state, action) towards ``reward + gamma * max Q(next_state, .)``.

        An empty ``next_actions`` (terminal position) contributes a future
        value of 0.
        """
        q_value = self.get_q_value(state, action)
        best_next = max(
            (self.get_q_value(next_state, next_action) for next_action in next_actions),
            default=0,
        )
        self.q_table[(state, action)] = q_value + self.alpha * (
            reward + self.gamma * best_next - q_value
        )

### Policy For Player 1 ###

In [195]:
# Train the first player ("X") against an opponent that moves uniformly at
# random. Exploration decays linearly from 1 to 0.1 over the whole run.
Q1 = Qlearning(0.5, 0.9, 1)
games = 70000
epsilon = np.linspace(1, 0.1, num=games, endpoint=True)

for i in tqdm(range(games)):
    Q1.set_epsilon(epsilon[i])
    game = TicTacToe()

    while not game.finished():
        # Agent's turn: snapshot the position it is about to move from.
        state = game.state().copy()
        action = Q1.choice_action(str(state), game.next_actions())
        game.move(action)

        # Random reply, unless the agent's move already ended the game.
        if not game.finished():
            replies = game.next_actions()
            game.move(replies[np.random.choice(range(len(replies)))])

        # The reward stays 0 until somebody has won, so reading it after the
        # (possible) reply covers both the "agent ended it" and the
        # "opponent replied" cases.
        reward = game.reward(1)
        next_state = game.state().copy()
        next_actions = game.next_actions()
        Q1.update(str(state), action, reward, str(next_state), next_actions)

100%|██████████| 70000/70000 [02:05<00:00, 558.62it/s]


In [201]:
# Evaluate the greedy policy (epsilon = 0) as first player against an
# opponent that moves uniformly at random.
Q1.set_epsilon(0)
win = 0
lose = 0
tie = 0
games = 1000

for _ in range(games):
    game = TicTacToe()

    while not game.finished():
        state = game.state()
        actions = game.next_actions()
        if game.player == 1:
            action = Q1.choice_action(str(state), actions)
        else:
            action = actions[np.random.choice(range(len(actions)))]
        game.move(action)

    if game.check_win(1):
        win += 1
    elif game.check_win(2):
        lose += 1
    else:
        tie += 1

# One decimal place keeps binary floating-point noise (e.g.
# 90.10000000000001) out of the report.
print(f"Percentage of wins: {win / games * 100:.1f}% \nPercentage of loses: {lose / games * 100:.1f}% \nPercentage of ties: {tie / games * 100:.1f}%")

Percentage of wins: 99.2% 
Percentage of loses: 0.0% 
Percentage of ties: 0.8%


### Policy For Player 2 ###

In [5]:
# Train the second player ("O") against an opponent that moves uniformly at
# random. Exploration decays linearly from 1 to 0 over the whole run.
Q2 = Qlearning(0.5, 0.9, 1)
games = 100000
epsilon = np.linspace(1, 0, num=games, endpoint=True)

for i in tqdm(range(games)):
    Q2.set_epsilon(epsilon[i])
    game = TicTacToe()

    # The random opponent opens, so the agent always moves as player 2.
    actions_2 = game.next_actions()
    action_2 = actions_2[np.random.choice(range(len(actions_2)))]
    game.move(action_2)

    while not game.finished():
        # Agent's turn: snapshot the position it is about to move from.
        state = game.state().copy()
        action = Q2.choice_action(str(state), game.next_actions())
        game.move(action)

        # Random reply, unless the agent's move already ended the game.
        if not game.finished():
            actions_2 = game.next_actions()
            action_2 = actions_2[np.random.choice(range(len(actions_2)))]
            game.move(action_2)

        # The reward stays 0 until somebody has won, so reading it after the
        # (possible) reply covers both cases.
        reward = game.reward(2)
        next_state = game.state().copy()
        next_actions = game.next_actions()
        Q2.update(str(state), action, reward, str(next_state), next_actions)
            

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [02:36<00:00, 639.61it/s]


In [7]:
# Evaluate the greedy policy (epsilon = 0) as second player against an
# opponent that moves uniformly at random.
Q2.set_epsilon(0)

win = 0
lose = 0
tie = 0

games = 1000
for _ in range(games):
    game = TicTacToe()
    while not game.finished():
        state = game.state()
        actions = game.next_actions()
        if game.player == 2:
            action = Q2.choice_action(str(state), actions)
        else:
            action = actions[np.random.choice(range(len(actions)))]
        game.move(action)

    # From the agent's point of view a player-2 win is a win.
    if game.check_win(2):
        win += 1
    elif game.check_win(1):
        lose += 1
    else:
        tie += 1

# One decimal place keeps binary floating-point noise (e.g.
# 90.10000000000001) out of the report.
print(f"Percentage of wins: {win / games * 100:.1f}% \nPercentage of loses: {lose / games * 100:.1f}% \nPercentage of ties: {tie / games * 100:.1f}%")

Percentage of wins: 90.10000000000001% 
Percentage of loses: 0.6% 
Percentage of ties: 9.3%


### Min-Max Experiment ###

In [248]:
class Qlearning:
    """Tabular Q-learning agent for two-sided (min-max) play.

    One table, keyed by ``(state, action)``, is shared by both sides:
    player 1 selects and backs up the maximum Q-value, any other player the
    minimum. Pairs that have never been updated are worth 0 and are not
    stored.
    """

    def __init__(self, alpha, gamma, epsilon):
        self.alpha = alpha      # learning rate: step size towards the new target
        self.gamma = gamma      # discount applied to the backed-up next value
        self.epsilon = epsilon  # probability of picking a random action
        self.q_table = {}

    def set_epsilon(self, epsilon):
        """Replace the exploration probability."""
        self.epsilon = epsilon

    def get_q_value(self, state, action):
        """Return the stored Q-value of ``(state, action)``, or 0 if unseen.

        Reading never adds an entry, so the table only grows through
        ``update``.
        """
        return self.q_table.get((state, action), 0)

    def choice_action(self, state, actions, player):
        """Pick an action epsilon-greedily for ``player``.

        With probability ``epsilon`` a uniformly random action is returned.
        Otherwise player 1 gets an action with the highest Q-value and any
        other player one with the lowest, ties broken at random.

        Raises:
            ValueError: if ``actions`` is empty.
        """
        if len(actions) == 0:
            raise ValueError("choice_action() needs at least one action")
        if np.random.uniform() < self.epsilon:
            return actions[np.random.choice(range(len(actions)))]
        q_values = np.array([self.get_q_value(state, action) for action in actions])
        target = np.max(q_values) if player == 1 else np.min(q_values)
        return actions[np.random.choice(np.where(q_values == target)[0])]

    def update(self, state, action, reward, next_state, next_actions, player):
        """Move Q(state, action) towards ``reward + gamma * V(next_state)``.

        ``player`` decides how ``next_state`` is valued: the maximum Q-value
        over ``next_actions`` for player 1, the minimum for any other player.
        An empty ``next_actions`` (terminal position) is worth 0.
        """
        q_value = self.get_q_value(state, action)
        pick = max if player == 1 else min
        future = pick(
            (self.get_q_value(next_state, next_action) for next_action in next_actions),
            default=0,
        )
        self.q_table[(state, action)] = q_value + self.alpha * (
            reward + self.gamma * future - q_value
        )

In [249]:
# Self-play training of a single shared table: both sides move through the
# same agent, and every reward is expressed from player 1's point of view.
Q1 = Qlearning(0.5, 0.9, 1)
games = 100000
epsilon = np.linspace(1, 0, num=games, endpoint=True)

for i in tqdm(range(games)):
    Q1.set_epsilon(epsilon[i])
    game = TicTacToe()

    while not game.finished():
        # Side to move picks its action from the current position.
        state = game.state().copy()
        action = Q1.choice_action(str(state), game.next_actions(), game.player)
        game.move(action)

        # After the move, game.player is the side that acts in next_state,
        # which is the player the update is told about.
        reward = game.reward(1)
        next_state = game.state().copy()
        next_actions = game.next_actions()
        Q1.update(str(state), action, reward, str(next_state), next_actions, game.player)

  0%|          | 0/100000 [00:00<?, ?it/s]

100%|██████████| 100000/100000 [05:15<00:00, 316.65it/s]


In [259]:
# Evaluate the greedy policy (epsilon = 0) learned by self-play, playing as
# first player against an opponent that moves uniformly at random.
Q1.set_epsilon(0)
win = 0
lose = 0
tie = 0
games = 1000

for _ in range(games):
    game = TicTacToe()

    while not game.finished():
        state = game.state()
        actions = game.next_actions()
        if game.player == 1:
            action = Q1.choice_action(str(state), actions, game.player)
        else:
            action = actions[np.random.choice(range(len(actions)))]
        game.move(action)

    if game.check_win(1):
        win += 1
    elif game.check_win(2):
        lose += 1
    else:
        tie += 1

# One decimal place keeps binary floating-point noise (e.g.
# 3.5000000000000004) out of the report.
print(f"Percentage of wins: {win / games * 100:.1f}% \nPercentage of loses: {lose / games * 100:.1f}% \nPercentage of ties: {tie / games * 100:.1f}%")

Percentage of wins: 96.5% 
Percentage of loses: 0.0% 
Percentage of ties: 3.5000000000000004%
