In [1]:
import sagemaker
import boto3
import sys
import os
import glob
import re
import subprocess
from IPython.display import HTML
import time
from time import gmtime, strftime

sys.path.append("common")
from sagemaker import get_execution_role#, wait_for_s3_object
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

# install gym environments if needed
!pip install gym
!pip install env_utils
#from env_utils import VectoredGymEnvironment

# S3 bucket
sage_session = sagemaker.session.Session()
s3_bucket = sage_session.default_bucket()
region_name = sage_session.boto_region_name
s3_output_path = "s3://{}/".format(s3_bucket)  # SDK appends the job name and output folder
print("S3 bucket path: {}".format(s3_output_path))

# create unique job name
job_name_prefix = "rl-blackjack"





ModuleNotFoundError: No module named 'sagemaker'

In [None]:
%%time

# run in local mode?
local_mode = False

try:
    role = sagemaker.get_execution_role()
except:
    role = get_execution_role()

print("Using IAM role arn: {}".format(role))

# only run from SageMaker notebook instance

In [None]:
if local_mode:
    !/bin/bash ./common/setup.sh

! pip install -U gym
! pip install -U torch
! pip install gym[toy_text]

import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from IPython import display



env = gym.make("Blackjack-v1")
env.observation_space

env.action_space

env.reset(seed=42)

def get_state_idxs(state):
    """Turn an observation into a tuple of Q-table indices.

    Args:
        state: 3-element observation.  The third element is cast to ``int``
            so that a boolean flag can be used as an array index.

    Returns:
        Tuple ``(idx1, idx2, idx3)`` with ``idx3`` converted to ``int``.
    """
    idx1, idx2, idx3 = state
    return idx1, idx2, int(idx3)


def update_qtable(qtable, state, action, reward, next_state, alpha, gamma, done=False):
    """Apply one Q-learning update to ``qtable`` in place and return it.

    Args:
        qtable: array indexed as ``[idx1, idx2, idx3, action]``.
        state: observation the action was taken from.
        action: index of the action that was taken.
        reward: reward received for the transition.
        next_state: observation returned by the environment.
        alpha: learning rate.
        gamma: discount factor.
        done: ``True`` when the transition ended the episode.  A terminal
            state has no future return, so the target is then just
            ``reward``.  Bootstrapping from ``next_state`` there would be
            wrong whenever the environment reports the unchanged observation
            on its final step, because the state's own Q-values would leak
            into the target.  Defaults to ``False``, which always
            bootstraps from ``next_state``.

    Returns:
        The same ``qtable`` object, updated.
    """
    entry = get_state_idxs(state) + (action,)
    if done:
        future_value = 0.0
    else:
        future_value = np.max(qtable[get_state_idxs(next_state)])
    td_target = reward + gamma * future_value
    qtable[entry] += alpha * (td_target - qtable[entry])
    return qtable


def get_action(qtable, state, epsilon, action_space=None):
    """Choose an action epsilon-greedily.

    Args:
        qtable: array indexed as ``[idx1, idx2, idx3, action]``.
        state: current observation.
        epsilon: probability of sampling a random action instead of the
            greedy one.
        action_space: object whose ``sample()`` provides the random action.
            When omitted, the notebook-level ``env.action_space`` is used.

    Returns:
        The selected action index.
    """
    if random.uniform(0, 1) < epsilon:
        if action_space is None:
            # Fall back to the notebook-level environment.
            action_space = env.action_space
        return action_space.sample()
    return np.argmax(qtable[get_state_idxs(state)])


def train_agent(env,
                qtable: np.ndarray,
                num_episodes: int,
                alpha: float,
                gamma: float,
                epsilon: float,
                epsilon_decay: float) -> np.ndarray:
    """Train ``qtable`` with tabular Q-learning for ``num_episodes`` episodes.

    Args:
        env: environment whose ``reset()`` returns ``(observation, info)``
            and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        qtable: Q-table to train; it is updated in place.
        num_episodes: number of episodes to play.
        alpha: learning rate.
        gamma: discount factor.
        epsilon: exploration probability used for the first episode.
        epsilon_decay: exponential decay rate; episode ``k`` explores with
            probability ``epsilon * exp(-epsilon_decay * k)``, so the
            ``epsilon`` argument scales the whole schedule instead of being
            discarded after the first episode.

    Returns:
        The trained ``qtable`` (same object that was passed in).
    """
    for episode in range(num_episodes):
        exploration_rate = epsilon * np.exp(-epsilon_decay * episode)
        state, _ = env.reset()
        while True:
            # Sample random actions from the environment being trained rather
            # than from whatever ``env`` happens to be bound at notebook level.
            action = get_action(qtable, state, exploration_rate, env.action_space)
            new_state, reward, terminated, truncated, _ = env.step(action)
            # Only a true termination removes the bootstrap term; a truncated
            # episode still has a future value.
            update_qtable(qtable, state, action, reward, new_state,
                          alpha, gamma, done=terminated)
            state = new_state
            if terminated or truncated:
                break
    return qtable

FIGSIZE = (8,4)

def watch_trained_agent(env, qtable, num_rounds, exploration_rate=None):
    """Play ``num_rounds`` episodes with the trained Q-table and collect rewards.

    Nothing is rendered; the function only returns the rewards.

    Args:
        env: environment whose ``reset()`` returns ``(observation, info)``
            and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        qtable: trained Q-table passed on to ``get_action``.
        num_rounds: number of episodes to play.
        exploration_rate: probability of a random action.  When omitted, the
            notebook-level ``epsilon`` variable is used, so that variable
            must be defined before the function is called.

    Returns:
        List with the total reward of each episode.
    """
    if exploration_rate is None:
        exploration_rate = epsilon  # notebook-level global
    rewards = []
    for _round in range(num_rounds):
        state, _info = env.reset()
        round_rewards = 0
        while True:
            action = get_action(qtable, state, exploration_rate)
            state, reward, terminated, truncated, _info = env.step(action)
            round_rewards += reward
            # Stop on truncation as well, otherwise a time-limited
            # environment would keep this loop running forever.
            if terminated or truncated:
                break
        rewards.append(round_rewards)
    return rewards

def watch_trained_agent_no_exploration(env, qtable, num_rounds):
    """Play ``num_rounds`` episodes greedily and collect the rewards.

    Every action is chosen with an exploration probability of 0, i.e. the
    action ``get_action`` selects without any random exploration.  Nothing is
    rendered; the function only returns the rewards.

    Args:
        env: environment whose ``reset()`` returns ``(observation, info)``
            and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        qtable: trained Q-table passed on to ``get_action``.
        num_rounds: number of episodes to play.

    Returns:
        List with the total reward of each episode.
    """
    rewards = []
    for _round in range(num_rounds):
        state, _info = env.reset()
        round_rewards = 0
        while True:
            action = get_action(qtable, state, 0)                # epsilon set to 0
            state, reward, terminated, truncated, _info = env.step(action)
            round_rewards += reward
            # Stop on truncation as well, otherwise a time-limited
            # environment would keep this loop running forever.
            if terminated or truncated:
                break
        rewards.append(round_rewards)
    return rewards



In [None]:
def print_policy(qtable):
    """Print the greedy action for player counts 11-21 against each dealer card.

    The Q-table is indexed directly by the observation values (player count,
    dealer card, soft flag), so the printed labels are the indices
    themselves; adding 1 to them would attribute every row to the wrong
    hand.  Dealer index 0 is skipped because dealer cards start at 1.

    Args:
        qtable: 4-D array indexed as
            ``[player_count, dealer_card, soft, action]`` where action 0 is
            "Stay" and action 1 is "Hit".

    Output columns: player count (PC), dealer card (DC), soft flag, policy.
    Ties between the two actions are reported as "Stay".
    """
    print('PC DC Soft Pol')
    num_counts, num_dealer_cards, num_soft, _num_actions = qtable.shape
    # Clamp the upper bound so a smaller table cannot be indexed out of range.
    for player_count in range(11, min(num_counts, 22)):
        for dealer_card in range(1, num_dealer_cards):
            for soft in range(num_soft):
                q_stay = qtable[player_count, dealer_card, soft, 0]
                q_hit = qtable[player_count, dealer_card, soft, 1]
                pol = "Stay" if q_stay >= q_hit else "Hit"
                print(player_count, dealer_card, soft, pol)

In [None]:
SEED = 42

env = gym.make("Blackjack-v1")
env.action_space.seed(SEED)
# get_action draws its exploration coin flips from Python's `random` module,
# so that generator has to be seeded too for a reproducible run.
random.seed(SEED)

# get initial state (reset returns an (observation, info) pair)
state, _ = env.reset(seed=SEED)

# One table dimension per observation component, plus one for the actions.
state_size = [x.n for x in env.observation_space]
action_size = env.action_space.n

qtable = np.zeros(state_size + [action_size]) #init with zeros


alpha = 0.3 # learning rate
gamma = 0.1 # discount rate
epsilon = 0.9     # probability that our agent will explore
decay_rate = 0.005

# training variables
num_hands = 500_000

qtable = train_agent(env,
                     qtable,
                     num_hands,
                     alpha,
                     gamma,
                     epsilon,
                     decay_rate)

print(f"Qtable Max: {np.max(qtable)}")
print(f"Qtable Mean: {np.mean(qtable)}")
print(f"Qtable Num Unique Vals: {len(np.unique(qtable))}")


In [None]:
# Show the trained Q-table as this cell's output.
qtable


In [None]:
import pandas as pd

# pd.DataFrame only accepts 1-D or 2-D data, while the Q-table has one axis
# per observation component plus an action axis.  Flatten the state axes into
# a MultiIndex (one row per state) and keep one column per action.  The column
# names follow print_policy (action 0 = stay, action 1 = hit).
state_index = pd.MultiIndex.from_product(
    [range(size) for size in qtable.shape[:-1]],
    names=["player_count", "dealer_card", "soft"],
)
qt = pd.DataFrame(
    qtable.reshape(-1, qtable.shape[-1]),
    index=state_index,
    columns=["q_stay", "q_hit"],
)