In [4]:
import numpy as np
import random
from easy21 import Easy21

In [5]:
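# The dealer's showing card ranges over 1-10 and the player's sum over 1-21.
# The (22, 11, 2) tables below are indexed directly by those values, so row/column 0 go unused.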
class MCControl:
    """Monte Carlo control agent with a count-based epsilon-greedy policy.

    Value estimates and visit counts live in arrays indexed as
    ``[player_sum, dealer_showing, action_idx]``, where ``action_idx`` is a
    position in ``action_names`` (0 = 'hit', 1 = 'stick').

    Attributes:
        Q: Action-value estimates, shape (22, 11, 2).
        N: Visit counts per (state, action); incremented by ``get_episode``.
        no: Constant controlling how quickly exploration decays.
        policy: Array initialised to 0.5 everywhere; no method of this class
            reads or updates it.
        action_names: Action strings passed to the environment.
        wins: Number of episodes passed to ``update_q_values`` whose final
            reward was exactly 1.
    """

    def __init__(self, no: int = 100) -> None:
        """Create an agent with zeroed tables.

        Args:
            no: Exploration constant; epsilon for a state is
                ``no / (no + total visits to that state)``.
        """
        self.Q = np.zeros((22, 11, 2))
        self.N = np.zeros((22, 11, 2))
        self.no = no
        self.policy = np.ones((22, 11, 2)) * 0.5
        self.action_names = ['hit', 'stick']
        self.wins = 0

    def epsilon_greedy_argmax(self, player_sum, dealer_showing):
        """Pick an action index for a state using epsilon-greedy selection.

        Args:
            player_sum: First index into the Q/N tables.
            dealer_showing: Second index into the Q/N tables.

        Returns:
            0 or 1. With probability epsilon a uniformly random index,
            otherwise the index of the largest Q value for the state
            (``argmax`` yields the first index on ties).
        """
        q_values = self.Q[player_sum, dealer_showing]
        # Epsilon shrinks as the state accumulates visits (summed over both actions).
        epsilon = self.no / (self.no + self.N[player_sum, dealer_showing].sum())
        # choose random with prob epsilon
        if random.random() < epsilon:
            return random.choice([0, 1])

        return q_values.argmax()

    def get_episode(self, env: "Easy21", log=False):
        """Play one episode in ``env`` under the current epsilon-greedy policy.

        ``env`` must provide ``get_starting_state()`` returning a
        ``(player_sum, dealer_showing)`` pair and ``step(state, action)``
        returning ``(has_terminated, next_state, reward)``.

        Side effect: ``self.N`` is incremented for every (state, action)
        taken, so the epsilon used later in the same episode already reflects
        earlier steps.

        Args:
            env: The environment to interact with.
            log: When true, print the starting state.

        Returns:
            A list of ``[state, action_idx, reward]`` entries in time order.
        """
        state = env.get_starting_state()
        if log:
            print("Starting_with_state:", state)
        has_terminated = False
        episode = []
        while not has_terminated:
            player_sum, dealer_showing = state
            action_idx = self.epsilon_greedy_argmax(player_sum, dealer_showing)
            action = self.action_names[action_idx]
            self.N[player_sum, dealer_showing, action_idx] += 1
            has_terminated, next_state, reward = env.step(state, action)
            episode.append([state, action_idx, reward])
            state = next_state
        return episode

    def update_q_values(self, episode: list):
        """Move Q towards the observed returns of one episode.

        Walks the episode backwards, accumulating the undiscounted return that
        follows each step, and nudges ``Q[s, a]`` towards it with step size
        ``1 / N[s, a]``. Also increments ``self.wins`` when the final reward
        is exactly 1.

        Args:
            episode: List of ``[(player_sum, dealer_showing), action_idx,
                reward]`` entries in time order, as produced by
                ``get_episode``. An empty list is a no-op.
        """
        if not episode:
            # Nothing was played: no win to record and nothing to learn from.
            return

        if episode[-1][-1] == 1:
            self.wins += 1

        run_sum = 0

        for (player_sum, dealer_showing), action_idx, reward in reversed(episode):
            run_sum += reward
            # N is incremented by get_episode. If this pair was never counted
            # (an episode built by hand), use a step size of 1 rather than
            # dividing by zero and writing inf/NaN into Q.
            # NOTE: when a pair occurs more than once in an episode, every
            # occurrence uses the same post-episode count, so the result is a
            # decaying-step-size average rather than the exact sample mean.
            visits = max(self.N[player_sum, dealer_showing, action_idx], 1.0)
            # move Q[s, a] towards run_sum
            self.Q[player_sum, dealer_showing, action_idx] += (
                (run_sum - self.Q[player_sum, dealer_showing, action_idx])
                / visits
            )

In [6]:
# Train the agent by alternating episode generation and Q updates.
model = MCControl()
env = Easy21()

N = int(1e6)
# Report ten times over the run. Clamp to 1 so that N < 10 cannot produce a
# zero interval (which would make the modulo below raise ZeroDivisionError).
report_every = max(1, N // 10)
for idx in range(N):
    episode = model.get_episode(env)
    model.update_q_values(episode)
    done = idx + 1
    if done % report_every == 0:
        # Cumulative win rate over all training episodes so far, including
        # the exploratory ones; it is not a separate evaluation of the policy.
        print(f"Iteration {done:7d}:  win% = {100 * model.wins / done : .3f}")

Iteration  100000:  win% =  50.611
Iteration  200000:  win% =  51.321
Iteration  300000:  win% =  51.567
Iteration  400000:  win% =  51.758
Iteration  500000:  win% =  51.884
Iteration  600000:  win% =  51.962
Iteration  700000:  win% =  52.015
Iteration  800000:  win% =  52.070
Iteration  900000:  win% =  52.099
Iteration 1000000:  win% =  52.134
