In [43]:
import gym
import numpy as np
from collections import defaultdict
import sys

In [17]:
# Create the Blackjack environment that every later cell in this notebook uses
env = gym.make("Blackjack-v0")

In [18]:
# An observation is a tuple: (player's current sum, dealer's face-up card,
# whether the player holds a usable ace)
env.observation_space

Tuple(Discrete(32), Discrete(11), Discrete(2))

In [19]:
# Two discrete actions; per the gym Blackjack docs, 0 = stick and 1 = hit
env.action_space

Discrete(2)

In [20]:
# Draw one random observation to see what a state tuple looks like
print(env.observation_space.sample())

(12, 5, 0)


In [21]:
# Draw one random action from the action space
print(env.action_space.sample())

1


In [22]:
# Roll out a few episodes under the equiprobable (uniformly random) policy,
# printing every transition and the final outcome of each hand.

for episode_i in range(3):
    print("episode:", episode_i)
    state = env.reset()
    print(state)

    done = False
    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        print(state, reward, done, info)

    # The reward of the terminal step decides the outcome of the hand
    if reward==1:
        print("You won")
    elif reward==-1:
        print("You lost")
    else:
        print("You drew")
    print()

episode: 0
(15, 3, False)
(25, 3, False) -1 True {}
You lost

episode: 1
(7, 2, False)
(18, 2, True) 0 False {}
(21, 2, True) 0 False {}
(21, 2, False) 0 False {}
(28, 2, False) -1 True {}
You lost

episode: 2
(12, 10, False)
(19, 10, False) 0 False {}
(19, 10, False) 1.0 True {}
You won



In [23]:
def gen_episode_stochastic(bj_env):
    """Play one episode with a fixed stochastic policy.

    When the first component of the state (the player's current sum) is
    above 18, action 0 is drawn with probability 0.8; otherwise action 1 is
    drawn with probability 0.8. Per the gym Blackjack docs, 0 = stick and
    1 = hit.

    Parameters
    ----------
    bj_env : environment exposing ``reset()`` and ``step(action)``, where
        ``step`` returns ``(next_state, reward, done, info)``.

    Returns
    -------
    list of (state, action, reward) tuples, one per step, where ``state`` is
    the state the action was taken in.
    """
    trajectory = []
    current = bj_env.reset()
    finished = False

    while not finished:
        action_probs = [0.8,0.2] if current[0]>18 else [0.2,0.8]
        chosen = np.random.choice(np.arange(2),p=action_probs)
        following, payoff, finished, _ = bj_env.step(chosen)
        # Record the state the action was taken in, not the one it led to
        trajectory.append((current,chosen,payoff))
        current = following

    return trajectory

In [24]:
# Sample a few episodes under the stochastic policy; each one prints as a
# list of (state, action, reward) steps
for episode_i in range(3):
    print(gen_episode_stochastic(env))

[((19, 6, False), 0, 1.0)]
[((15, 7, False), 1, 0), ((20, 7, False), 0, 1.0)]
[((19, 10, False), 0, 1.0)]


In [25]:
def gen_episode_det(bj_env):
    """Play one episode with a fixed deterministic threshold policy.

    Action 0 is taken whenever the first component of the state (the
    player's current sum) is above 18, and action 1 otherwise. Per the gym
    Blackjack docs, 0 = stick and 1 = hit.

    Parameters
    ----------
    bj_env : environment exposing ``reset()`` and ``step(action)``, where
        ``step`` returns ``(next_state, reward, done, info)``.

    Returns
    -------
    list of (state, action, reward) tuples, one per step, where ``state`` is
    the state the action was taken in.
    """
    trajectory = []
    current = bj_env.reset()
    finished = False

    while not finished:
        chosen = 0 if current[0]>18 else 1
        following, payoff, finished, _ = bj_env.step(chosen)
        # Record the state the action was taken in, not the one it led to
        trajectory.append((current,chosen,payoff))
        current = following

    return trajectory

In [26]:
# Sample a few episodes under the deterministic policy; each one prints as a
# list of (state, action, reward) steps
for episode_i in range(3):
    print(gen_episode_det(env))

[((18, 3, True), 1, 0), ((21, 3, True), 0, 0.0)]
[((7, 8, False), 1, 0), ((17, 8, False), 1, -1)]
[((20, 7, False), 0, 1.0)]


In [27]:
# Scratch check: maps a state to an array with one zero-initialised slot per
# action; the array is created lazily the first time a state is looked up
N = defaultdict(lambda:np.zeros(env.action_space.n))

In [28]:
# Nothing has been looked up yet, so the mapping is still empty
print(N)

defaultdict(<function <lambda> at 0x1047e7598>, {})


In [29]:
# Scratch check: same lazily-filled layout (state -> one slot per action),
# intended for accumulating returns
returns_sum = defaultdict(lambda: np.zeros(env.action_space.n))

In [30]:
# Also empty until a state is looked up
print(returns_sum)

defaultdict(<function <lambda> at 0x1a13811840>, {})


In [120]:
# For Blackjack, every-visit MC (evmc) and first-visit MC (fvmc) give the same estimates, since a state generally won't repeat within an episode.

def evmc_q_table(num_episodes,env,gamma=1,gen_episode=None):
    """Estimate the action-value table Q with every-visit Monte Carlo prediction.

    For every (state, action) pair visited in an episode, the discounted
    return that FOLLOWS that step,
    ``G_t = r_t + gamma*r_{t+1} + gamma**2*r_{t+2} + ...``,
    is added to a running sum, and Q is the mean of those returns.

    Parameters
    ----------
    num_episodes : int
        Number of episodes to sample.
    env : environment
        Passed to the episode generator; ``env.action_space.n`` sets the
        number of action slots stored per state.
    gamma : float, default 1
        Discount factor.
    gen_episode : callable, optional
        ``gen_episode(env)`` must return a list of (state, action, reward)
        steps. Defaults to ``gen_episode_stochastic``.

    Returns
    -------
    defaultdict mapping each visited state to an array of per-action mean
    returns (actions never taken in a state stay at 0).
    """
    if gen_episode is None:
        # Looked up at call time so the notebook-level policy is the default
        gen_episode = gen_episode_stochastic

    n_actions = env.action_space.n
    returns_sum = defaultdict(lambda:np.zeros(n_actions))
    N = defaultdict(lambda:np.zeros(n_actions))
    Q = defaultdict(lambda:np.zeros(n_actions))

    for _ in range(num_episodes):
        episode = gen_episode(env)

        # Walk the episode backwards so that G always holds the discounted
        # return from the current step to the end of the episode.
        G = 0.0
        for state,action,reward in reversed(episode):
            G = reward + gamma*G
            returns_sum[state][action] += G
            N[state][action] += 1
            Q[state][action] = returns_sum[state][action]/N[state][action]

    return Q

In [123]:
# Estimate Q from 500,000 sampled episodes with no discounting (gamma = 1)
Q_table = evmc_q_table(500000,env,1)
Q_table

defaultdict(<function __main__.evmc_q_table.<locals>.<lambda>()>,
            {(11, 3, False): array([-0.19907407, -0.05413271]),
             (16, 3, False): array([-0.29032258, -0.61918438]),
             (12, 1, False): array([-0.75615213, -0.65523307]),
             (17, 1, False): array([-0.62921348, -0.74083264]),
             (15, 6, False): array([-0.14835787, -0.55508475]),
             (17, 3, True): array([-0.01840491, -0.30417495]),
             (14, 3, False): array([-0.225     , -0.54915536]),
             (12, 9, False): array([-0.5159516, -0.4886231]),
             (13, 9, False): array([-0.53970081, -0.51422135]),
             (14, 9, False): array([-0.50845547, -0.56087824]),
             (20, 9, False): array([ 0.74863495, -0.88342585]),
             (18, 9, False): array([-0.17272727, -0.69059406]),
             (10, 10, False): array([-0.54724677, -0.23942677]),
             (20, 10, False): array([ 0.43353175, -0.8837535 ]),
             (14, 10, False): array([-0

In [124]:
# Number of distinct states that ended up with an entry in the Q table
len(Q_table)

280

In [13]:
# Sanity check: dividing two equally-shaped arrays works element by element
a1 = np.array([[20, 10],
               [30, 25]])
a2 = np.array([[5, 2],
               [6, 5]])
a3 = np.divide(a1, a2)

a3

array([[4., 5.],
       [5., 5.]])