In [1]:
import gymnasium as gym
import numpy as np
import random

In [2]:
# Discretize the state: flatten the observation tuple into one table index
def map_state(s):
    """Collapse a three-component observation into a single integer index.

    The training loop in this notebook passes the Blackjack observation, whose
    entries it labels as the player's sum (``s[0]``), the dealer's face card
    (``s[1]``) and the usable-ace flag (``s[2]``). The first two entries are
    shifted down by one and the three are combined with strides 1, 32 and
    32 * 11 respectively.

    Args:
        s: Indexable observation with at least three numeric entries.

    Returns:
        ``(s[0] - 1) + 32 * (s[1] - 1) + 32 * 11 * s[2]``.
    """
    sum_term = s[0] - 1
    dealer_term = 32 * (s[1] - 1)
    ace_term = 32 * 11 * s[2]
    return sum_term + dealer_term + ace_term

In [3]:
# Build the game environment (sab=True is passed through to Blackjack-v1)
env = gym.make("Blackjack-v1", sab=True)

# Seed every source of randomness this cell uses so that a fresh
# "Restart & Run All" reproduces the same training run.
SEED = 42
random.seed(SEED)
env.action_space.seed(SEED)

# Create the q-table: one row per combination of the three observation
# components, one column per action.
observation_space = env.observation_space[0].n * env.observation_space[1].n * env.observation_space[2].n
q_table = np.zeros([observation_space, env.action_space.n])

# Hyperparameters.
# epsilon is the probability of taking a random action, so it must stay within
# [0, 1]. It starts fully exploratory, is decayed once per episode, and is
# clamped to epsilon_min so exploration never vanishes completely.
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.999
alpha = 0.01   # learning rate
gamma = 0.95   # discount factor

# Metrics
total_wins = 0
total_loss = 0
total_draw = 0
num_episodes = 100000


for episode in range(num_episodes):
    # Start a new hand; only the first reset is seeded, later resets continue
    # from the environment's already-seeded random generator.
    observation, _ = env.reset(seed=SEED if episode == 0 else None)

    # Trace the last three episodes in detail
    show_trace = episode >= num_episodes - 3
    if show_trace:
        print(" ")
        print(f"Episode: {episode+1}")
        print(f"My sum: {observation[0]}")
        print(f"Dealer face card: {observation[1]}")
        print(f"Have ace: {bool(observation[2])}")

    # Map the observation tuple to its q-table row
    state = map_state(observation)

    done = False
    while not done:
        # Epsilon-greedy policy: explore with probability epsilon, else exploit
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(q_table[state]))

        # Take the action and observe the outcome
        next_observation, reward, terminated, truncated, _ = env.step(action)
        next_player_sum = next_observation[0]
        done = terminated or truncated

        # Map the new observation to its q-table row
        next_state = map_state(next_observation)

        # Q-learning update; once the episode has ended there is no future
        # value to bootstrap from.
        old_q = q_table[state, action]
        next_max_q = 0.0 if done else np.max(q_table[next_state])
        q_table[state, action] = old_q + alpha * (reward + gamma * next_max_q - old_q)

        # Move on to the next state
        state = next_state

        # Show what happened on this step
        if show_trace:
            print(f"Action: {action}")
            print(f"Reward: {reward}")
            print(f"Next Sum: {next_player_sum}")
            print(done)

    # Update metrics from the final reward of the episode. This happens before
    # the summary below is printed so the last episode is included in it.
    if reward > 0:
        total_wins += 1
    elif reward < 0:
        total_loss += 1
    else:
        total_draw += 1

    # Decay exploration once per episode, never below the floor
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

# Report the share of games won over the whole run
total_games = total_wins + total_loss + total_draw
if total_games > 0:
    print(f"Win percentage after {num_episodes} episodes: {round((total_wins / total_games) * 100, 1)}%")

env.close()

 
Episode: 99998
My sum: 14
Dealer face card: 4
Have ace: False
Action: 0
Reward: 1.0
Next Sum: 14
True
 
Episode: 99999
My sum: 13
Dealer face card: 1
Have ace: False
Action: 1
Reward: -1.0
Next Sum: 23
True
 
Episode: 100000
My sum: 7
Dealer face card: 3
Have ace: False
Action: 1
Reward: 0.0
Next Sum: 15
False
Action: 0
Reward: -1.0
Next Sum: 15
True
Win percentage after 100000 episodes: 42.0%
