In [110]:
import gym
import numpy as np
from collections import defaultdict
import sys

In [2]:
env = gym.make("Blackjack-v0")

In [3]:
env

<gym.envs.toy_text.blackjack.BlackjackEnv at 0x103d67208>

Each state is a 3-tuple of:
- the player's current sum $\in \{0, 1, \ldots, 31\}$,
- the dealer's face up card $\in \{1, \ldots, 10\}$, and
- whether or not the player has a usable ace (`no` $=0$, `yes` $=1$).

The agent has two potential actions:

```
    STICK = 0
    HIT = 1
```

The rewards go like 0 for all non terminal steps and +1,-1 depending on the result of the game

In [3]:
#observation is a tuple of the current sum of the player, the dealer's face up card, and a bool whether the ace if present is useable or not
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [4]:
env.action_space

Discrete(2)

In [5]:
print(env.observation_space.sample())

(12, 5, 0)


In [6]:
print(env.action_space.sample())

1


In [14]:
#equiproabable policy

for episode_i in range(3):
    print("episode: ",episode_i)
    state = env.reset()
    print(state)
    while True:
        action = env.action_space.sample()
        print(action)
        state, reward, done, info = env.step(action)
        print(state,reward)
        if done:
            print("End game")
            if reward>0:
                print("User won the game")
            else:
                print("Dealer won the game")
            print()
            break
            
    

episode:  0
(15, 10, False)
0
(15, 10, False) -1.0
End game
Dealer won the game

episode:  1
(13, 10, False)
0
(13, 10, False) -1.0
End game
Dealer won the game

episode:  2
(14, 1, False)
1
(24, 1, False) -1
End game
Dealer won the game



In [75]:
def generate_episode_from_limit_stochastic(bj_env):
    state = bj_env.reset()
    episode = []
    probs = [0,0]
    while True:
        if state[0]>=18:
            probs = [0.8,0.2]
        elif state[1]<18:
            probs = [0.2,0.8]
        action = np.random.choice(np.arange(2),p=probs)
        new_state,reward,done,info = bj_env.step(action)
        episode.append((state,action,reward))
        state = new_state
        if done:
            return episode
            
        

In [76]:
generate_episode_from_limit_stochastic(env)

[((16, 7, False), 0, -1.0)]

In [8]:
def gen_episode_stochastic(bj_env):
    
    episode = []
    state = bj_env.reset()
    probs = [0,0]
    while True:
        if state[0]>18:
            probs = [0.8,0.2]
        else:
            probs = [0.2,0.8]
        action = np.random.choice(np.arange(2),p=probs)
        state_n, reward, done, info = bj_env.step(action)
        episode.append((state,action,reward))
        state = state_n
        if done:
            return episode

In [9]:
for episode_i in range(3):
    print(gen_episode_stochastic(env))

[((11, 10, False), 1, 0), ((21, 10, False), 0, 1.0)]
[((10, 1, False), 0, -1.0)]
[((12, 1, False), 0, -1.0)]


In [10]:
def gen_episode_det(bj_env):
    
    episode = []
    state = bj_env.reset()
    
    while True:
        if state[0]>18:
            action = 0
        else:
            action = 1
        next_state,reward,done,info = bj_env.step(action)
        episode.append((state,action,reward))
        state = next_state
        if done:
            return episode

In [11]:
for episode_i in range(3):
    print(gen_episode_det(env))

[((15, 10, False), 1, 0), ((21, 10, False), 0, 1.0)]
[((6, 7, False), 1, 0), ((11, 7, False), 1, 0), ((14, 7, False), 1, -1)]
[((18, 5, True), 1, 0), ((18, 5, False), 1, 0), ((19, 5, False), 0, 1.0)]


In [57]:
X = defaultdict(lambda:np.zeros(env.action_space.n))

In [27]:
N = defaultdict(lambda:np.zeros(env.action_space.n))

In [28]:
print(N)

defaultdict(<function <lambda> at 0x1047e7598>, {})


In [29]:
returns_sum = defaultdict(lambda: np.zeros(env.action_space.n))

In [30]:
print(returns_sum)

defaultdict(<function <lambda> at 0x1a13811840>, {})


In [120]:
#For black jack, evmc and fvmc are equivalent since a state generally won't repeat in an episode.

def evmc_q_table(num_episodes,env,gamma=1):
    returns_sum = defaultdict(lambda:np.zeros(env.action_space.n))
    N = defaultdict(lambda:np.zeros(env.action_space.n))
    Q = defaultdict(lambda:np.zeros(env.action_space.n))

    for episode_i in range(num_episodes):
        episode = gen_episode_stochastic(env)
        states,actions,rewards = zip(*episode)
        for index,item in enumerate(episode):
            rewards = np.array(rewards)
            discounting = np.array([gamma**i for i in range(len(rewards))])
#             print(discounting)
#             print(rewards[index:])
            returns_sum[item[0]][item[1]]+=np.sum(np.array(discounting*rewards))
            N[item[0]][item[1]]+=1
            Q[item[0]][item[1]] = returns_sum[item[0]][item[1]]/N[item[0]][item[1]]
#     print(returns_sum.keys())
#     print(returns_sum.values())
#     print()
#     for k in returns_sum.keys():
#         print(returns_sum[k],end=" ")
#         print(N[k])
#         print()
    
#     for k in returns_sum.keys():
#             Q[k] = returns_sum[k]/N[k]
    return Q

In [95]:
def discounting_factor(index,n,gamma=1):
    return np.array([gamma**i for i in range(n-index)])

In [97]:
discounting_factor(3,10,0.2)

array([1.0e+00, 2.0e-01, 4.0e-02, 8.0e-03, 1.6e-03, 3.2e-04, 6.4e-05])

In [113]:
def ev_mc(num_episodes,env,gen=generate_episode_from_limit_stochastic,gamma=1):
    
    n = {}
    q = {}
    r = {}
    for episode_i in range(num_episodes):
        episode = generate_episode_from_limit_stochastic(env)
        states,actions,rewards = zip(*episode)
        for index,item in enumerate(episode):
            if item[0] not in n:
                n[item[0]] = np.zeros(env.action_space.n)
                r[item[0]] = np.zeros(env.action_space.n)
                q[item[0]] = np.zeros(env.action_space.n)
            n[item[0]][item[1]]+=1
            r[item[0]][item[1]]+=(discounting_factor(index,len(episode),gamma)*np.array(rewards[index:])).sum()
        q[item[0]][item[1]] = r[item[0]][item[1]]/n[item[0]][item[1]]
    return q

In [115]:
ev_mc(50000,env)

{(12, 10, False): array([-0.59006211, -0.52494577]),
 (17, 8, False): array([-0.53947368, -0.62834225]),
 (18, 8, False): array([ 0.08430233, -0.64444444]),
 (11, 5, False): array([-0.26530612,  0.        ]),
 (21, 5, False): array([ 0.90419162, -1.        ]),
 (14, 4, False): array([-0.22580645, -0.57910448]),
 (18, 4, False): array([ 0.08625337, -0.75      ]),
 (20, 8, False): array([ 0.79423868, -0.968     ]),
 (17, 3, False): array([-0.08823529, -0.55313351]),
 (14, 3, True): array([0., 0.]),
 (14, 3, False): array([-0.13580247, -0.43989071]),
 (16, 3, False): array([-0.39473684, -0.49874055]),
 (13, 9, False): array([-0.64705882, -0.46827795]),
 (18, 9, False): array([-0.11684783, -0.71134021]),
 (10, 9, False): array([-0.88571429,  0.        ]),
 (15, 8, False): array([-0.62790698, -0.50609756]),
 (12, 6, False): array([-0.14583333, -0.44897959]),
 (14, 1, False): array([-0.75280899, -0.6426513 ]),
 (21, 1, False): array([ 0.6, -1. ]),
 (20, 10, True): array([0.48927039, 0.      

In [123]:
Q_table = evmc_q_table(500000,env,1)
Q_table

defaultdict(<function __main__.evmc_q_table.<locals>.<lambda>()>,
            {(11, 3, False): array([-0.19907407, -0.05413271]),
             (16, 3, False): array([-0.29032258, -0.61918438]),
             (12, 1, False): array([-0.75615213, -0.65523307]),
             (17, 1, False): array([-0.62921348, -0.74083264]),
             (15, 6, False): array([-0.14835787, -0.55508475]),
             (17, 3, True): array([-0.01840491, -0.30417495]),
             (14, 3, False): array([-0.225     , -0.54915536]),
             (12, 9, False): array([-0.5159516, -0.4886231]),
             (13, 9, False): array([-0.53970081, -0.51422135]),
             (14, 9, False): array([-0.50845547, -0.56087824]),
             (20, 9, False): array([ 0.74863495, -0.88342585]),
             (18, 9, False): array([-0.17272727, -0.69059406]),
             (10, 10, False): array([-0.54724677, -0.23942677]),
             (20, 10, False): array([ 0.43353175, -0.8837535 ]),
             (14, 10, False): array([-0

In [124]:
len(list(Q_table.keys()))

280

In [13]:
a1 = np.array([[20,10],[30,25]])
a2 = np.array([[5,2],[6,5]])
a3 = a1/a2

a3

array([[4., 5.],
       [5., 5.]])

In [118]:
def gen_episode(env,policy):
    state = env.reset()
    action = 0
    episode = []
    while True:
        if state not in policy:
            if state[0]>18:
                probs = [0.8,0.2]
            else:
                probs = [0.2,0.8]
            action = np.random.choice(np.arange(2),p = probs)
        else:
            action = policy[state]
        new_state, reward, done, info = env.step(action)
        episode.append((state,action,reward))
        state = new_state
        if done:
            return episode
        
def update_q(episode,alpha,q,gamma):
    
    states,actions,rewards = zip(*episode)
    for index,item in enumerate(episode):
        reward = ((discounting_factor(index,len(episode),gamma))*rewards[index:]).sum()
        if item[0] not in q:
            q[item[0]] = np.zeros(env.action_space.n)
        q[item[0]][item[1]] = q[item[0]][item[1]]+alpha*(reward-q[item[0]][item[1]])
    return q

In [138]:
def update_p(env,Q_table,epsilon,probs):
    nA = env.action_space.n
    greedy = (1-epsilon)+(epsilon)/nA
    non_greedy = (epsilon)/nA
    
    for state in Q_table:
        if state not in probs:
            probs[state] = np.zeros(env.action_space.n)
        max_index = np.where(Q_table[state]==max(Q_table[state]))
        probs[state][max_index] = greedy
        for j in range(len(Q_table[state])):
            if max_index==j:
                continue
            else:
                probs[state][j] = non_greedy
    return probs

In [140]:
def update_policy(probs):
    policy = {}
    
    for state in probs:
        policy[state] = np.random.choice(np.arange(2),p=list(probs[state]))
    return policy

In [142]:
policy = {}
q = {}
episode = gen_episode(env,policy)
episode

Q_table = update_q(episode,0.2,q,1)
Q_table

probs = {}

probs = update_p(env,Q_table,1,probs)
probs

policy = update_policy(probs)

policy

{(15, 10, False): 0}

In [147]:
def mc_control(env,num_episodes,alpha,gamma=1):
    
    Q_table = {}
    policy = {}
    probs = {}
    epsilon = 1
    for episode_i in range(num_episodes):
        episode = gen_episode(env,policy)
        Q_table = update_q(episode,alpha,Q_table,gamma)
        probs = update_p(env,Q_table,epsilon,probs)
        policy = update_policy(probs)
        
    return policy, Q_table

In [None]:
policy, Q_table = mc_control(env,5000,0.6)

In [150]:
policy, Q_table

({(20, 10, False): 1,
  (8, 6, False): 0,
  (16, 6, False): 0,
  (21, 6, False): 1,
  (14, 7, False): 0,
  (21, 7, False): 0,
  (11, 10, False): 1,
  (21, 10, False): 1,
  (15, 10, False): 1,
  (16, 10, False): 1,
  (20, 7, False): 1,
  (8, 10, False): 0,
  (20, 2, False): 0,
  (13, 10, False): 0,
  (20, 8, False): 1,
  (15, 2, False): 1,
  (12, 5, False): 1,
  (16, 2, False): 0,
  (13, 7, False): 0,
  (16, 7, False): 1,
  (12, 10, False): 1,
  (9, 7, False): 0,
  (11, 6, False): 1,
  (12, 6, False): 1,
  (9, 4, False): 0,
  (19, 4, False): 1,
  (19, 10, False): 0,
  (17, 3, True): 1,
  (13, 3, False): 0,
  (16, 3, False): 0,
  (13, 2, False): 1,
  (21, 2, False): 1,
  (11, 2, False): 1,
  (14, 4, False): 0,
  (8, 2, False): 1,
  (18, 2, False): 1,
  (17, 10, False): 0,
  (15, 3, False): 1,
  (20, 3, False): 1,
  (10, 5, False): 0,
  (17, 10, True): 0,
  (18, 7, False): 1,
  (7, 10, False): 0,
  (18, 10, True): 0,
  (18, 10, False): 1,
  (15, 5, False): 0,
  (16, 5, False): 0,
  (17, 5