In [144]:
import numpy as np
import random
from tic_env import TictactoeEnv, OptimalPlayer

In [145]:
# Build a fresh environment and display its initial attributes.
env = TictactoeEnv()
vars(env)

{'grid': array([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]),
 'end': False,
 'winner': None,
 'player2value': {'X': 1, 'O': -1},
 'num_step': 0,
 'current_player': 'X'}

In [205]:
class QPlayer:
    """Epsilon-greedy tic-tac-toe player backed by a tabular Q-function.

    Attributes:
        epsilon: probability of playing a uniformly random legal move.
        alpha: learning rate (stored only; not used by this class itself).
        gamma: discount factor (stored only; not used by this class itself).
        player_name: 'X' or 'O'.
        states, actions, states_opponent, actions_opponent: trajectory
            buffers; this class only initialises them to empty lists.
        states_value: dict mapping a state key (see ``get_state_key``) to an
            array of action values indexed by cell number 0..8.
    """

    def __init__(self, epsilon, alpha = 0.05, gamma = 0.99, player_name = 'O'):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.player_name = player_name
        self.states = []
        self.actions = []
        self.states_opponent = []
        self.actions_opponent = []
        self.states_value = dict()

    def set_player(self, player_name = 'X', j=-1):
        """Set the player's symbol.

        If ``j`` is given (anything other than -1) it overrides
        ``player_name``: even ``j`` selects 'X', odd ``j`` selects 'O'.
        """
        self.player_name = player_name
        if j != -1:
            self.player_name = 'X' if j % 2 == 0 else 'O'

    def empty(self, grid):
        '''Return the indices (0..8, row-major) of all empty cells of a 3x3 grid.'''
        return [i for i in range(9) if grid[i // 3, i % 3] == 0]

    def get_state_key(self, grid):
        "Transforms the grid configuration into a string -- needed for the key of the dictionary"
        return str(grid.reshape(3 * 3))

    def select_optimal_action(self, grid):
        """Return the empty cell with the highest stored Q-value.

        Falls back to a random legal move when the state has no entry in
        ``states_value``. The result is always a plain Python ``int`` cell
        index, the same type ``select_random_action`` returns.
        """
        avail_actions = self.empty(grid)
        key = self.get_state_key(grid)
        if key not in self.states_value:
            return self.select_random_action(grid)
        restricted_vector = self.states_value[key][avail_actions]
        # np.argmax yields a position inside the restricted vector; translate
        # it back to the board cell it stands for.
        best_position = int(np.argmax(restricted_vector))
        return avail_actions[best_position]

    def select_random_action(self, grid):
        """Return a uniformly random empty cell index."""
        return random.choice(self.empty(grid))

    def act(self, grid):
        """Choose a move: greedy with probability 1 - epsilon, random otherwise."""
        exploit = np.random.binomial(1, 1 - self.epsilon)
        if exploit == 1:
            return self.select_optimal_action(grid)
        return self.select_random_action(grid)
        
class QLearning:
    def __init__(self, epsilon1, epsilon2, alpha = 0.05, gamma = 0.99):
        self.q_player = QPlayer(epsilon1, alpha, gamma)
        self.opponent = OptimalPlayer(epsilon2)
        self.env = TictactoeEnv()
        
    def game(self):
        self.env.reset()
        if self.env.current_player == 'X' : self.q_player.states.append(np.zeros([3,3]))
        else : self.q_player.states_opponent.append(np.zeros([3,3]))
        while not self.env.end : 
            if self.env.current_player == 'O' : 
                move = self.q_player.act(self.env.grid)
                self.env.step(move)
                temp = self.env.grid.copy()
                self.q_player.states.append(temp)
                self.q_player.actions.append(move)

            else : 
                move = self.opponent.act(self.env.grid)
                self.env.step(move)
                temp = self.env.grid.copy()
                self.q_player.states_opponent.append(temp)
                self.q_player.actions_opponent.append(3*move[0]+move[1])
        self.update_q('O',self.q_player.states,self.q_player.actions)
        self.update_q('X',self.q_player.states_opponent,self.q_player.actions_opponent)
        
        
    #def give_reward(self):
        
    def update_q(self, player, states, actions):
        r = self.env.reward(player)
        i = len(actions)-1
        key = self.q_player.get_state_key(self.env.grid)
        self.q_player.states_value[key] = np.zeros([9,1])
        
        prov = states[::-1]
        
        for j in range(1,len(prov)):
            st = prov[j]
            st_prec = prov[j-1]
            key_prec = self.q_player.get_state_key(st_prec)
            key = self.q_player.get_state_key(st)
            print(self.q_player.states_value[key_prec])
            if key not in self.q_player.states_value:
                self.q_player.states_value[key] = np.zeros([9,1])
                
            self.q_player.states_value[key][actions[i]] += self.q_player.alpha*(r + self.q_player.gamma*max(self.q_player.states_value[key_prec])-self.q_player.states_value[key][actions[i]])  
            r = 0
            i -= 1
            
            
        
        

In [206]:
# Seed both RNGs: QPlayer draws from numpy (np.random.binomial) and from the
# stdlib `random` module (random.choice), so seeding only numpy leaves runs
# irreproducible.
SEED = 2022
np.random.seed(SEED)
random.seed(SEED)
epsilon = np.random.uniform(0, 0.5)  # exploration rate of the Q-learning player
game = QLearning(epsilon, 0)  # the second argument is forwarded to OptimalPlayer


In [207]:
# Play one game: QLearning.game() loops until env.end is set and then calls
# update_q for both players. The run recorded below ended in a KeyError.
game.game()

KeyError: '[ 1. -1. -1.  0.  1.  0.  0.  0.  0.]'