%%latex
# Terminology
In the game of tic-tac-toe we have a model of the environment, but this is not always possible.
- Agent
- Environment
- State: the condition of the environment that the agent can sense
- Action: something the agent can do that affects the state; taking an action leads to the next state
- Reward
- Episode: one run of the game; an agent needs several episodes to be trained
- Episodic task: we can play an episode again and again
- Continuous task: it never ends; it is not made of episodes
- Terminal state

# Value Function
It's a measure of the future reward we could get when we are in the current state.<br>
For each episode, we update only the states that have been visited by the agent, and we update them backwards, from the last state to the first. The formula is:
$$V(s) \leftarrow V(s) + \alpha\,\bigl(V(s') - V(s)\bigr)$$

In [1]:
%qtconsole

In [2]:
import sys
import numpy as np
from tqdm import tqdm

In [3]:
def get_state_from_hash(h):
    """Decode a board hash back into a 3x3 board array.

    The hash is written as a base-3 number and left-padded with zeros to
    nine digits, one per cell in row-major order. Each digit is then shifted
    down by one, so the cells of the returned float array are -1, 0 or 1.
    """
    digits = to_base_n(h, 3).rjust(9, '0')
    board = np.array([float(d) for d in digits]).reshape(3, 3)
    return board - 1
    
def state_hash(state):
    """Encode a 3x3 board as an integer in the range 0 .. 3**9 - 1.

    Each cell value (-1, 0 or 1) is shifted to a base-3 digit (0, 1 or 2) and
    the nine digits, taken in row-major order, are read as one base-3 number.

    The digits are combined arithmetically rather than by parsing the
    array's printed representation, so the result does not depend on NumPy's
    print options.
    """
    digits = (np.asarray(state) + 1).reshape(9).astype(int)
    code = 0
    for digit in digits:
        code = code * 3 + int(digit)
    return code
        
def to_base_n(n, base):
    """Return the digits of the non-negative integer ``n`` written in ``base``.

    Digits above 9 use the upper-case letters A-F, so ``base`` must lie
    between 2 and 16.

    Raises:
        ValueError: if ``n`` is negative or ``base`` is outside 2..16.
    """
    convert_string = '0123456789ABCDEF'
    if not 2 <= base <= len(convert_string):
        raise ValueError('base must be between 2 and 16')
    if n < 0:
        # A negative value would otherwise index the digit table from its end.
        raise ValueError('n must be non-negative')
    digits = []
    while True:
        n, remainder = divmod(n, base)
        digits.append(convert_string[remainder])
        if n == 0:
            break
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
    
class Human:
    """Interactive player that reads its moves from standard input.

    It exposes the same methods as ``Agent`` so that ``play_game`` can drive
    either kind of player; the learning hooks do nothing.
    """

    def __init__(self, player_number, name, eps = 0.1, alpha = 0.5, verbose = False):
        """Store the player's identity.

        ``eps``, ``alpha`` and ``verbose`` are accepted but not used; they
        keep the signature parallel to ``Agent.__init__``.
        """
        self.name = name
        self.player_number = player_number
        # Player 1 marks cells with +1, any other player with -1.
        self.sign = 1 if player_number == 1 else -1

    @staticmethod
    def _parse_move(text):
        """Return ``(i, j)`` parsed from ``'i,j'``, or ``None`` if malformed."""
        try:
            i, j = text.split(',')
            return int(i), int(j)
        except ValueError:
            return None

    def take_action(self, env):
        """Prompt until a legal move is entered, then play it on ``env``.

        Input that is not two comma-separated integers, coordinates outside
        0..2 and cells that are already taken all lead to a new prompt.
        """
        while True:
            move = self._parse_move(input('Enter coordinates i,j for the next move: '))
            if move is None:
                # Ask again instead of letting a typo abort the game.
                continue
            i, j = move
            if 0 <= i <= 2 and 0 <= j <= 2 and env.state[i, j] == 0:
                env.new_action(self, i, j)
                break

    def update_state_history(self, state):
        """Do nothing: a human keeps no state history."""
        pass

    def update_v(self, env):
        """Do nothing: a human has no value function to update."""
        pass

    def reset_history(self):
        """Do nothing: there is no history to clear."""
        pass

    def initialize_V(self, env):
        """Do nothing: a human has no value function to initialise."""
        pass
    
    
class Agent:
    """Epsilon-greedy tic-tac-toe player that learns a state-value table.

    ``self.v`` is created by ``initialize_V`` and indexed by the
    ``state_hash`` of a board. ``update_v`` adjusts it at the end of a game
    from the states recorded through ``update_state_history``.
    """

    def __init__(self, player_number, name, eps = 0.1, alpha = 0.5, verbose = False):
        """Create an agent.

        Args:
            player_number: 1 for the player marking cells with +1; any other
                value gives the -1 mark.
            name: label used when printing.
            eps: probability of taking a random action instead of the greedy one.
            alpha: step size of the value update.
            verbose: if true, print the decisions and the candidate values.
        """
        self.player_number = player_number  # same attribute Human exposes
        self.sign = 1 if player_number == 1 else -1
        self.name = name
        self.state_history = []
        self.eps = eps
        self.alpha = alpha
        self.verbose = verbose

    def take_action(self, env):
        """Play one move on ``env``: random with probability ``eps``, else greedy.

        The greedy move is the one whose resulting board has the highest
        value in ``self.v``; ties go to the first such move in the order
        returned by ``env.get_possible_actions()``.
        """
        r = np.random.rand()
        possible_actions = env.get_possible_actions()
        if r < self.eps:
            if self.verbose:
                print('Taking random action')
            action = possible_actions[np.random.choice(len(possible_actions))]
        else:
            if self.verbose:
                print('Taking greedy action')
            action = None
            # Start from -inf rather than -1: values in the table can reach
            # -1, and a move must still be selected in that case.
            best_value = float('-inf')
            values = []
            for a in possible_actions:
                # Try the move, read the value of the resulting board, then
                # undo it and hand the turn back to this agent.
                env.new_action(self, a[0], a[1])
                state = state_hash(env.get_state())
                env.state[a[0], a[1]] = 0
                env.next_player = self
                value = self.v[state]
                values.append(value)
                if value > best_value:
                    best_value = value
                    action = a
            if self.verbose:
                self._print_values(env, values)

        env.new_action(self, action[0], action[1])

    def _print_values(self, env, values):
        """Print the board with each empty cell replaced by its candidate value.

        ``values`` must be in the same row-major order as the empty cells.
        """
        k = 0
        print('-------------')
        for i in range(3):
            print('|', end = '')
            for j in range(3):
                if env.state[i,j] == 0:
                    symbol = '{0:.2f}'.format(values[k])
                    k += 1
                else:
                    symbol = env.get_symbol(env.state[i,j])
                print(' {} |'.format(symbol), end = '')
            print()
            print('-------------')

    def update_state_history(self, state):
        """Record the hash of ``state`` as the latest visited state."""
        self.state_history.append(state_hash(state))

    def update_v(self, env):
        """Update the value table from the recorded game, then clear the history.

        The last recorded state takes the reward reported by ``env``; every
        earlier state is then moved towards the value of its successor,
        walking the history backwards:
        ``V(s) = V(s) + alpha * (V(s') - V(s))``.
        Does nothing when no state has been recorded.
        """
        if not self.state_history:
            # Nothing was played, so there is nothing to learn from.
            return
        reward = env.reward(self)
        s_prime = self.state_history[-1]
        self.v[s_prime] = reward
        if self.verbose:
            print('Updating the V, last state reward: {}'.format(reward))
        for s in self.state_history[-2::-1]:
            self.v[s] = self.v[s] + self.alpha * (self.v[s_prime] - self.v[s])
            s_prime = s
        self.reset_history()

    def reset_history(self):
        """Forget the states recorded so far."""
        self.state_history = []

    def initialize_V(self, env):
        """Build the initial value table with one entry per board hash.

        Boards where the game is not over start at 0.5, boards won by this
        agent at 1, and boards that are drawn or won by the opponent at 0.
        ``env.state`` is overwritten while enumerating the boards and is left
        as an empty board afterwards.
        """
        # NOTE(review): a lost board starts at 0 here while env.reward() gives
        # -1 for a loss; update_v overwrites terminal entries once visited.
        v = np.zeros(env.number_of_states)
        for i in range(env.number_of_states):
            env.state = get_state_from_hash(i)
            winner = env.get_winner()
            if env.game_over():
                if winner is None or winner.sign == - self.sign:
                    v[i] = 0
                elif winner.sign == self.sign:
                    v[i] = 1
            else:
                v[i] = 0.5
        env.state = np.zeros((3,3))
        self.v = v
    
    
class Environment:
    """Tic-tac-toe board shared by two players.

    ``state`` is a 3x3 array holding 0 for an empty cell and the ``sign`` of
    the player who took it otherwise. ``next_player`` is the player whose
    turn it is.
    """

    def __init__(self, first_player, second_player):
        """Start from an empty board with ``first_player`` to move."""
        self.state = np.zeros((3,3))
        self.p1 = first_player
        self.p2 = second_player
        self.next_player = first_player
        # Three possible contents for each of the nine cells.
        self.number_of_states = 3**9

    def get_state(self):
        """Return the board array itself (not a copy)."""
        return self.state

    def get_symbol(self, n):
        """Return the character drawn for cell value ``n``: 'x', 'o' or a blank."""
        if n == 1:
            return 'x'
        if n == -1:
            return 'o'
        return ' '

    def draw_board(self):
        """Print whose turn it is followed by the current board."""
        print('\nIt\'s the turn of the player ' + self.next_player.name)
        print('-------------')
        for row in self.state:
            print('|', end = '')
            for cell in row:
                print(' {} |'.format(self.get_symbol(cell)), end = '')
            print()
            print('-------------')

    def new_action(self, player, x, y):
        """Mark cell ``(x, y)`` for ``player`` and pass the turn to the other player.

        Terminates the program through ``sys.exit`` when the cell is outside
        the board or already taken, or when it is not ``player``'s turn.
        """
        if x < 0 or x > 2 or y < 0 or y > 2 or self.state[x,y] != 0:
            sys.exit('Invalid action')
        if player != self.next_player:
            sys.exit('Invalid turn')
        self.state[x, y] = player.sign
        self.next_player = self.p1 if player == self.p2 else self.p2

    def get_next_player(self):
        """Return the player whose turn it is.

        This accessor cannot be called ``next_player``: the instance
        attribute of that name would hide the method.
        """
        return self.next_player

    def get_possible_actions(self):
        """Return the ``(row, column)`` pairs of the empty cells in row-major order."""
        return list(zip(*np.where(self.state == 0)))

    def get_winner(self):
        """Return the player owning a complete line, or ``None`` if there is none.

        A line summing to 3 belongs to the first player and one summing to
        -3 to the second. Rows are examined before columns and diagonals,
        and -3 before 3, which only matters for boards holding more than one
        complete line.
        """
        row_sums = self.state.sum(axis = 1)
        other_sums = np.append(
            self.state.sum(axis = 0),
            [self.state.trace(), self.state[::-1].trace()],
        )
        for sums in (row_sums, other_sums):
            if np.any(sums == -3):
                return self.p2
            if np.any(sums == 3):
                return self.p1
        return None

    def game_over(self):
        """Return True when the board is full or somebody has won."""
        return not np.any(self.state == 0) or self.get_winner() is not None

    def reward(self, player):
        """Return 1 if ``player`` has won, -1 if the opponent has won, else 0.

        The result is 0 both for a draw and while the game is still running,
        since neither case has a winner.
        """
        winner = self.get_winner()
        if winner is None:
            return 0
        return 1 if winner == player else -1

    def reset(self):
        """Clear the board and give the first move back to the first player."""
        self.state = np.zeros((3,3))
        self.next_player = self.p1
    
    
def play_game(p1, p2, env, draw = False):
    """Run one game on ``env`` until it is over, then let both players learn.

    Whoever ``env.next_player`` designates makes each move. After every move
    the resulting board is passed to ``update_state_history`` of both
    players. Once the game has ended, ``update_v`` is called on ``p1`` and
    then on ``p2``. With ``draw`` set, the board is printed before the first
    move and again after each move.
    """
    players = (p1, p2)
    if draw:
        env.draw_board()
    while not env.game_over():
        env.next_player.take_action(env)

        # Both players record the position reached after this move.
        board = env.get_state()
        for player in players:
            player.update_state_history(board)

        if draw:
            env.draw_board()

    # The game has ended: let each player update its value function.
    for player in players:
        player.update_v(env)

# Test

In [4]:
# Disabled smoke test, kept as a bare string so that it does not run: one game
# between two fresh agents (the first one verbose), followed by the result.
'''
p1, p2 = Agent(1, 'p1', verbose = True), Agent(2, 'p2')
env = Environment(p1, p2)
p1.initialize_V(env)
p2.initialize_V(env)
play_game(p1, p2, env, draw = True)
winner = env.get_winner()
if winner is None:
    print('The game ended in a draw')
else:
    print('The winner is ' + env.get_winner().name)
'''

"\np1, p2 = Agent(1, 'p1', verbose = True), Agent(2, 'p2')\nenv = Environment(p1, p2)\np1.initialize_V(env)\np2.initialize_V(env)\nplay_game(p1, p2, env, draw = True)\nwinner = env.get_winner()\nif winner is None:\n    print('The game ended in a draw')\nelse:\n    print('The winner is ' + env.get_winner().name)\n"

# Train the agents

In [5]:
# Self-play training: two agents play T episodes against each other and both
# update their value tables after every episode.
p1_train, p2_train = Agent(1, 'p1_train', verbose = False), Agent(2, 'p2_train', verbose = False)
env = Environment(p1_train, p2_train)
p1_train.initialize_V(env)
p2_train.initialize_V(env)
T = 10000
for t in range(T):
    if t%500 == 0:
        print('Episode {}/{}'.format(t, T))
    # Play on the one environment that reset() clears afterwards, instead of
    # building a throw-away Environment for every episode.
    play_game(p1_train, p2_train, env)
    env.reset()

Episode 0/10000
Episode 500/10000
Episode 1000/10000
Episode 1500/10000
Episode 2000/10000
Episode 2500/10000
Episode 3000/10000
Episode 3500/10000
Episode 4000/10000
Episode 4500/10000
Episode 5000/10000
Episode 5500/10000
Episode 6000/10000
Episode 6500/10000
Episode 7000/10000
Episode 7500/10000
Episode 8000/10000
Episode 8500/10000
Episode 9000/10000
Episode 9500/10000


# Try to play

In [6]:
# Play interactively against the trained second agent, with exploration
# switched off and its candidate values printed.
human = Human(1, 'Andrea')
p2_train.verbose = True
p2_train.eps = 0
env = Environment(human, p2_train)
while True:
    # Pass the players in the same order the environment was built with.
    play_game(human, p2_train, env, draw = True)
    # Announce the result before the board is cleared.
    winner = env.get_winner()
    if winner is None:
        print('The game ended in a draw')
    else:
        print('The winner is ' + winner.name)
    env.reset()
    answer = input('Play again? [Y/n]: ')
    if answer and answer.lower()[0] == 'n':
        break


It's the turn of the player Andrea
-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Enter coordinates i,j for the next move: 1,1

It's the turn of the player p2_train
-------------
|   |   |   |
-------------
|   | x |   |
-------------
|   |   |   |
-------------
Taking greedy action
-------------
| -0.18 | -0.72 | 0.15 |
-------------
| -0.67 | x | -0.84 |
-------------
| -0.43 | -0.73 | -0.29 |
-------------

It's the turn of the player Andrea
-------------
|   |   | o |
-------------
|   | x |   |
-------------
|   |   |   |
-------------
Enter coordinates i,j for the next move: 1,2

It's the turn of the player p2_train
-------------
|   |   | o |
-------------
|   | x | x |
-------------
|   |   |   |
-------------
Taking greedy action
-------------
| -0.62 | -0.95 | o |
-------------
| 0.46 | x | x |
-------------
| -0.95 | -0.62 | -0.68 |
-------------

It's the turn of the player Andrea
-------------
|   |   | o |
-------------
