In [1]:
import gym

#We will be using the game of blackjack (gym's 'Blackjack-v0' environment)
env = gym.make('Blackjack-v0')
#Let's check out what the environment looks like:
#reset() starts a new game and returns the initial observation
print(env.reset())

(9, 10, False)


In [2]:
#The observation is a tuple; using the example output above, (9, 10, False):
#The 9 means that the sum of the cards that we hold is 9
#The 10 means that the value of the dealer's face-up card is 10
#The False means that we don't have a usable ace
#(the numbers change from run to run because the cards are dealt at random)
#Let's look at our action space
print(env.action_space)

Discrete(2)


In [3]:
#A 0 means that we stand(reveal cards and end game)
#A 1 means that we hit(draw a new card)

import pandas as pd
from collections import defaultdict

In [4]:
#This is a semi-optimal policy
#For now, we are only doing the monte carlo prediction task
def policy(state):
    """Fixed policy to evaluate: stand (0) when the first state entry exceeds 19, else hit (1).

    Only state[0] (the player's current sum) is looked at; the remaining
    entries of the state are ignored.
    """
    player_sum = state[0]
    if player_sum > 19:
        return 0
    return 1

In [5]:
#Start a new game and keep its initial state so we can try the policy on it
state = env.reset()
print(state)

(13, 10, False)


In [6]:
#Ask the policy what to do in that state (0 = stand, 1 = hit)
print(policy(state))

1


In [7]:
#Upper bound on the number of steps recorded in a single episode
num_timesteps = 10

def generate_episode(policy):
    """Play one episode in the global `env`, choosing actions with `policy`.

    policy: callable that maps a state to an action.

    Returns a list of (state, action, reward) tuples, one per step taken.
    The episode stops as soon as the environment reports `done`, or after
    `num_timesteps` steps, whichever comes first.
    """
    trajectory = []
    current_state = env.reset()

    for _ in range(num_timesteps):
        chosen_action = policy(current_state)
        following_state, reward, done, info = env.step(chosen_action)

        #Record the state the action was taken in, not the state it led to
        trajectory.append((current_state, chosen_action, reward))

        if done:
            return trajectory
        current_state = following_state

    return trajectory

#Sample one episode with the fixed policy; each entry is a (state, action, reward) tuple
print(generate_episode(policy))

[((21, 2, True), 0, 1.0)]


In [8]:
#Now let's compute the value function with every-visit Monte Carlo prediction:
#every occurrence of a state in an episode contributes its return to that state's average

total_return = defaultdict(float)   #state -> sum of the returns observed from that state
N = defaultdict(int)                #state -> number of times that state was visited

num_iterations = 5000

for i in range(num_iterations):
    episode = generate_episode(policy)

    #Split the (state, action, reward) steps into three parallel tuples
    states, actions, rewards = zip(*episode)

    for t in range(len(states)):
        state = states[t]
        #Undiscounted return: all rewards from step t to the end of the episode
        R = sum(rewards[t:])
        total_return[state] += R
        N[state] += 1

In [9]:
#Visualize data
#The tables are built under new names so that `total_return` and `N` stay the
#defaultdicts filled in by the prediction loop. Rebinding those names to DataFrames
#made this cell produce a meaningless table when it was run a second time
#(DataFrame.items() iterates over columns, not over the original dict entries).
total_return_df = pd.DataFrame(list(total_return.items()), columns = ['state', 'total_return'])
visit_count_df = pd.DataFrame(list(N.items()), columns=['state', 'N'])
#Join on the state so each row carries both the summed return and the visit count
df = pd.merge(total_return_df, visit_count_df, on="state")
df.head(10)

Unnamed: 0,state,total_return,N
0,"(19, 3, False)",-49.0,58
1,"(14, 9, False)",-27.0,45
2,"(16, 10, False)",-161.0,219
3,"(8, 2, False)",-7.0,9
4,"(16, 2, False)",-46.0,56
5,"(10, 1, False)",-2.0,20
6,"(13, 1, False)",-33.0,50
7,"(15, 1, False)",-40.0,49
8,"(19, 1, False)",-37.0,63
9,"(11, 10, False)",-26.0,88


In [10]:
#Estimated value of a state = summed return from that state / number of visits
df['value'] = df['total_return'].div(df['N'])
df.head(10)

Unnamed: 0,state,total_return,N,value
0,"(19, 3, False)",-49.0,58,-0.844828
1,"(14, 9, False)",-27.0,45,-0.6
2,"(16, 10, False)",-161.0,219,-0.73516
3,"(8, 2, False)",-7.0,9,-0.777778
4,"(16, 2, False)",-46.0,56,-0.821429
5,"(10, 1, False)",-2.0,20,-0.1
6,"(13, 1, False)",-33.0,50,-0.66
7,"(15, 1, False)",-40.0,49,-0.816327
8,"(19, 1, False)",-37.0,63,-0.587302
9,"(11, 10, False)",-26.0,88,-0.295455


In [11]:
#The prediction loop above used every-visit MC
#Let's try first-visit MC: only the first occurrence of a state within an episode
#contributes its return, so the two methods differ only when a state repeats in an episode
total_return = defaultdict(float)
N = defaultdict(int)

num_iterations = 5000

for i in range(num_iterations):
    episode = generate_episode(policy)
    states, actions, rewards = zip(*episode)

    #States already counted in this episode
    seen_states = set()
    for t, state in enumerate(states):
        if state in seen_states:
            continue
        seen_states.add(state)

        #Undiscounted return from the first visit onwards
        R = sum(rewards[t:])
        total_return[state] += R
        N[state] += 1

In [39]:
#Now let's implement on-policy MC control with an epsilon-greedy policy
import random

Q = defaultdict(float)             #(state, action) -> current action-value estimate
total_return = defaultdict(float)  #(state, action) -> sum of the returns recorded for that pair
N = defaultdict(int)               #(state, action) -> number of returns recorded for that pair

#Probability of taking a random (exploratory) action instead of the greedy one
epsilon = 0.5

def epsilon_greedy_policy(state):
    """Choose an action for `state` with an epsilon-greedy rule over the global `Q`.

    With probability `epsilon` an action is sampled at random from
    `env.action_space`; otherwise the action with the largest
    Q[(state, action)] is returned. Pairs that are not in Q yet count as 0.0,
    and ties go to the lowest action index.
    """
    if random.uniform(0, 1) < epsilon:
        return env.action_space.sample()

    #Argmax over the actions. Q.get is used instead of Q[...] because indexing a
    #defaultdict inserts a 0.0 entry for every pair that is merely looked up, which
    #would fill the Q table with state-action pairs that were never actually played
    return max(range(env.action_space.n), key=lambda action: Q.get((state, action), 0.0))

In [40]:
#Upper bound on the number of steps recorded in a single episode
num_timesteps = 10

def generate_episode(policy=None):
    """Play one episode in the global `env` and return its steps.

    policy: optional callable mapping a state to an action. When omitted,
        `epsilon_greedy_policy` is used, so `generate_episode()` behaves as
        before. Accepting a policy keeps this definition compatible with the
        earlier `generate_episode(policy)` that it replaces, so the prediction
        cells still work if they are re-run after this cell.

    Returns a list of (state, action, reward) tuples, one per step taken.
    The episode stops as soon as the environment reports `done`, or after
    `num_timesteps` steps, whichever comes first.
    """
    if policy is None:
        #Looked up at call time so a redefined epsilon_greedy_policy is picked up
        policy = epsilon_greedy_policy

    episode = []
    state = env.reset()

    for t in range(num_timesteps):
        action = policy(state)

        next_state, reward, done, info = env.step(action)

        #Record the state the action was taken in, not the state it led to
        episode.append((state, action, reward))

        if done:
            break
        state = next_state

    return episode

In [41]:
#On-policy first-visit MC control: after every episode, refresh Q for each
#(state, action) pair using the return that followed its first occurrence
num_iterations = 5000
for i in range(num_iterations):
    episode = generate_episode()

    all_state_action_pairs = [(s, a) for (s, a, r) in episode]
    rewards = [r for (s, a, r) in episode]

    #Pairs already counted in this episode
    seen_pairs = set()
    for t, (state, action, _) in enumerate(episode):
        pair = (state, action)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)

        #Undiscounted return from step t to the end of the episode
        R = sum(rewards[t:])
        total_return[pair] += R
        N[pair] += 1
        #Q is the running average of the returns recorded for this pair
        Q[pair] = total_return[pair]/N[pair]

In [42]:
#Collect the learned action values into a table with one row per (state, action) pair
df = pd.DataFrame(Q.items(), columns=['state_action pair', 'value'])
df.head(10)

Unnamed: 0,state_action pair,value
0,"((20, 10, False), 0)",0.376623
1,"((20, 10, False), 1)",-0.884615
2,"((19, 10, False), 0)",-0.065934
3,"((19, 10, False), 1)",-0.777778
4,"((15, 10, False), 1)",-0.755102
5,"((20, 1, False), 0)",0.4
6,"((20, 1, False), 1)",-0.75
7,"((14, 5, False), 0)",-0.214286
8,"((14, 5, False), 1)",-0.222222
9,"((10, 2, False), 0)",-0.333333


In [None]:
#Let's now try off-policy Monte Carlo
#There are two policies, b and pi: b is the behavior policy, and pi is the policy we are trying to optimize
#Q value updates are weighted by the ratio pi(a | s)/b(a | s)
#If for some action pi(a | s) > b(a | s),
#i.e. pi is more likely to take that action than b is, the Q value should be updated more
#If the probability of pi taking an action is less than that of b taking it, then the Q value should be updated less

Q = defaultdict(float)
N = defaultdict(int)
#NOTE(review): C presumably accumulates the importance-sampling weights per (state, action) pair - confirm once the update loop is written
C = defaultdict(float)

def behavior_policy()