# BlackJack

Using Monte Carlo methods to solve the Blackjack environment.

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Create the Blackjack environment that the later cells interact with.
env = gym.make('Blackjack-v0')

# Show the action and observation spaces the agent has to work with.
print(f"Action space is {env.action_space!s}")
print(f"Observation space is {env.observation_space!s}")

Action space is Discrete(2)
Observation space is Tuple(Discrete(32), Discrete(11), Discrete(2))


In Monte Carlo methods we start with a random policy and interact with the environment to populate a Q-table. The Q-table has states as rows and actions as columns, so the entry (s, a) holds the expected return if the agent starts in state s and takes action a. Each occurrence of an (s, a) pair in an episode is called a visit.

Let us start with a random policy: if the player's sum is greater than 18, the player sticks with 80% probability (and hits with 20% probability); otherwise the player hits with 80% probability (and sticks with 20% probability).

In [3]:
def random_policy(env):
    """Play one episode with the stochastic threshold policy.

    At every step the policy looks at the player's current sum (the first
    element of the state): above 18 it sticks (action 0) with probability
    0.8 and hits (action 1) with probability 0.2; otherwise the two
    probabilities are swapped.

    Args:
        env: Environment exposing ``reset()`` returning a state whose first
            element is the player's sum, and ``step(action)`` returning
            ``(next_state, reward, done, info)``.

    Returns:
        A list with one ``(state, action, reward)`` tuple per step, where
        ``state`` is the state in which ``action`` was taken and ``reward``
        is the reward received for it.
    """
    episode = []
    state = env.reset()

    while True:
        # Re-derive the action distribution from the *current* hand; the
        # sum changes after every hit, so it must not be frozen at reset.
        player_hand = state[0]
        if player_hand > 18:
            action_probs = [0.8, 0.2]
        else:
            action_probs = [0.2, 0.8]

        action = np.random.choice(np.arange(2), p=action_probs)
        next_state, reward, done, _ = env.step(action)

        # Pair the action with the state it was taken in, so that a Q-table
        # built from the episode is indexed by (state, action).
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

In [4]:
# Implementing every-visit MC prediction, stick=0, hit=1

def every_visit(env, num_episodes, gamma=0.9):
    """Estimate action values with every-visit Monte Carlo prediction.

    Episodes are generated with ``random_policy``. For every step of every
    episode, the discounted sum of the rewards from that step to the end of
    the episode is added to a running total for the (key, action) pair,
    where the key is the first element of the episode tuple. The estimate
    is the running total divided by the number of visits.

    Args:
        env: Environment passed to ``random_policy``; its
            ``action_space.n`` sets the length of each table row.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied per step.

    Returns:
        A ``defaultdict`` mapping each recorded key to an array with one
        averaged return per action.
    """

    def blank_row():
        # One slot per action, created lazily the first time a key is seen.
        return np.zeros(env.action_space.n)

    Q = defaultdict(blank_row)
    visit_counts = defaultdict(blank_row)
    return_totals = defaultdict(blank_row)

    for _ in range(num_episodes):
        trajectory = random_policy(env)
        # Transpose the list of steps into three parallel tuples.
        visited, taken, rewards = zip(*trajectory)
        # gamma**0 .. gamma**len(rewards); trimmed from the right per step.
        powers = np.array([gamma**k for k in range(len(rewards) + 1)])

        for t, (key, act) in enumerate(zip(visited, taken)):
            # The remaining rewards line up with the leading discount
            # powers, so the reward at step t is weighted by gamma**0.
            discounted = rewards[t:] * powers[:-(1 + t)]
            return_totals[key][act] += sum(discounted)
            visit_counts[key][act] += 1
            Q[key][act] = return_totals[key][act] / visit_counts[key][act]
    return Q

# Build the Q-table from 10,000 episodes sampled by every_visit
# (which plays them with random_policy), using the default gamma of 0.9.
Q = every_visit(env, 10000)
    

In [5]:
def greedy_policy(env, Q):
    """Play one episode acting greedily with respect to a Q-table.

    For a state present in ``Q`` the action with the largest value is
    taken (``np.argmax`` picks the first index on ties). For a state
    missing from ``Q`` the function falls back to the stochastic threshold
    policy: with a player sum above 18 it sticks (action 0) with
    probability 0.8, otherwise it hits (action 1) with probability 0.8.

    Args:
        env: Environment exposing ``reset()`` returning a state whose first
            element is the player's sum, and ``step(action)`` returning
            ``(next_state, reward, done, info)``.
        Q: Mapping from state to an array of per-action values.

    Returns:
        A list with one ``(state, action, reward)`` tuple per step, where
        ``state`` is the state in which ``action`` was taken.
    """
    episode = []
    state = env.reset()

    while True:
        if state in Q:
            # Membership test first, so a defaultdict Q is not grown by
            # looking up states it has never seen.
            action = np.argmax(Q[state])
        else:
            # Fallback distribution is derived from the *current* hand, not
            # the hand dealt at reset, because the sum changes after a hit.
            player_hand = state[0]
            if player_hand > 18:
                fallback_probs = [0.8, 0.2]
            else:
                fallback_probs = [0.2, 0.8]
            action = np.random.choice(np.arange(2), p=fallback_probs)

        next_state, reward, done, _ = env.step(action)
        # Record the state the action was chosen in, together with the
        # reward that followed it.
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode