In [12]:
import numpy as np

from evaluation.mc import *
from utils.misc import *
from policies import *
import gym
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import defaultdict
from tqdm import tqdm
from evaluation.td import *

In [9]:
def sample_episode_lim(env, policy, limit=100):
    """
    Sample one episode from `env` under `policy`, taking at most `limit` steps.

    Args:
        env: Environment exposing reset() -> state and
            step(action) -> (state, reward, done, info).
        policy: Object whose sample_action(state) method returns an action.
        limit: Maximum number of steps to take. The episode is cut off after
            this many steps even if the environment never reports done.

    Returns:
        Tuple of lists (states, actions, rewards, dones), all of equal length.
        states[t] is the state in which actions[t] was chosen; the state
        reached after the final step is not included. If the episode was cut
        off by `limit`, the last entry of dones is falsy. With limit <= 0 all
        four lists are empty.
    """
    states, actions, rewards, dones = [], [], [], []

    state = env.reset()

    for _ in range(limit):
        action = policy.sample_action(state)
        next_state, reward, done, _info = env.step(action)

        # Record the state the action was taken in, not the one it led to.
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)

        if done:
            break
        state = next_state

    return states, actions, rewards, dones

In [10]:
import gym
from gym import spaces
from gym.utils import seeding

class NChainEnv(gym.Env):
    """Episodic n-Chain environment.

    The agent moves along a linear chain of n states, observed as the current
    index (0 to n-1), using two actions:
     0) forward: advance one state for no reward; in the last state the agent
        stays where it is and receives the `large` reward instead
     1) backward: return to state 0 and receive the `small` reward
    With probability `slip` the opposite of the requested action is executed.

    Termination rule implemented in step(): the episode is reported as done
    when the executed action is 'backward' in state 0, or 'forward' in the
    last state of the chain.

    The underlying chain task is described in section 6.1 of:
    A Bayesian Framework for Reinforcement Learning by Malcolm Strens (2000)
    http://ceit.aut.ac.ir/~shiry/lecture/machine-learning/papers/BRL-2000.pdf
    """

    def __init__(self, n=5, slip=0.2, small=2, large=10):
        # Chain length and reward/noise parameters.
        self.n = n
        self.slip = slip    # probability that the opposite action is executed
        self.small = small  # reward for the 'backward' action
        self.large = large  # reward for 'forward' in the last state

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Discrete(self.n)

        self.state = 0  # every episode starts at the head of the chain
        self.seed()

    def seed(self, seed=None):
        """Create the environment's random generator and return [seed used]."""
        rng, used_seed = seeding.np_random(seed)
        self.np_random = rng
        return [used_seed]

    def step(self, action):
        """Apply `action` (possibly flipped by a slip); return (state, reward, done, {})."""
        assert self.action_space.contains(action)

        # Exactly one random draw per step decides whether the action is flipped.
        if self.np_random.rand() < self.slip:
            action = not action

        backward = bool(action)
        last = self.n - 1

        # Termination is decided from the state *before* the move is applied.
        done = (self.state == 0 and backward) or (self.state == last and not backward)

        if backward:
            # Jump back to the start and collect the small reward.
            reward, self.state = self.small, 0
        elif self.state < last:
            # Move one step up the chain without reward.
            reward = 0
            self.state += 1
        else:
            # Already at the end: stay there and collect the large reward.
            reward = self.large

        return self.state, reward, done, {}

    def reset(self):
        """Put the agent back at state 0 and return that state."""
        self.state = 0
        return self.state

# NChain

In [None]:
import numpy as np
# from environments.Nchain import NChainEnv
from evaluation.mc import *
from utils.misc import *
from policies import *
import gym
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# On-policy Monte Carlo prediction on NChain: 10 seeded runs per chain length.
# NOTE(review): this sweep includes n=10 while the other NChain cells in this
# notebook use [5, 15, 20] -- confirm which set of chain lengths is intended.
for j in [5,10,15,20]:
    for i in range(10):
        n = j

        # Q-table whose greedy action is 0 in every state.
        Q = np.zeros((n, 2))
        Q[:, 0] = 1

        env = NChainEnv(n=n, slip = 0.0)
        actions = [0, 1]
        policy = EpsilonGreedyPolicy(actions, Q, 0.001)

        # mc_prediction only receives `policy`, so no separate behavior policy
        # is constructed in this cell.
        np.random.seed(i)
        _, hist = mc_prediction(env, policy, 500001, sample_episode, save_every=1000, name="mc_nchain_{}_{}".format(j,i))



100%|██████████| 500001/500001 [02:17<00:00, 3640.29it/s]
100%|██████████| 500001/500001 [02:15<00:00, 3700.04it/s]
100%|██████████| 500001/500001 [02:18<00:00, 3621.54it/s]
100%|██████████| 500001/500001 [02:16<00:00, 3673.47it/s]
100%|██████████| 500001/500001 [02:19<00:00, 3588.08it/s]
100%|██████████| 500001/500001 [02:14<00:00, 3722.82it/s]
100%|██████████| 500001/500001 [02:15<00:00, 3697.87it/s]
100%|██████████| 500001/500001 [02:17<00:00, 3631.98it/s]
 25%|██▌       | 125430/500001 [00:33<01:40, 3714.22it/s]

In [40]:
# Ordinary importance sampling on NChain: 10 seeded runs per chain length.
for j in [5,15,20]:
    for i in range(10):
        n = j
        actions = [0, 1]
        env = NChainEnv(n=n, slip = 0.0)

        # Target policy: epsilon-greedy around a table that scores action 0
        # highest in every state.
        Q = np.zeros((n, 2))
        Q[:, 0] = 1
        policy = EpsilonGreedyPolicy(actions, Q, 0.001)

        # Passed in the slot the FrozenLake cells name `behavior_policy`.
        random_policy = RandomPolicy(actions)

        np.random.seed(i)
        last, hist = mc_ordinary_importance_sampling(
            env, random_policy, policy, 500001, sample_episode,
            save_every=1000, name="mc_ordinary_nchain_{}_{}".format(j,i),
        )

100%|██████████| 500001/500001 [01:28<00:00, 5623.70it/s]
100%|██████████| 500001/500001 [01:29<00:00, 5614.74it/s]
100%|██████████| 500001/500001 [01:29<00:00, 5610.28it/s]
100%|██████████| 500001/500001 [01:28<00:00, 5640.20it/s]
100%|██████████| 500001/500001 [01:29<00:00, 5599.55it/s]
100%|██████████| 500001/500001 [01:28<00:00, 5636.29it/s]
100%|██████████| 500001/500001 [01:29<00:00, 5599.56it/s]
100%|██████████| 500001/500001 [01:28<00:00, 5630.55it/s]
100%|██████████| 500001/500001 [01:28<00:00, 5624.15it/s]
100%|██████████| 500001/500001 [01:29<00:00, 5615.93it/s]


In [41]:
from evaluation.td import *
# Off-policy n-step TD on NChain: 10 seeded runs per chain length.
for j in [5,15,20]:
    for i in range(10):
        n = j
        actions = [0, 1]
        env = NChainEnv(n=n, slip = 0.0)

        # Target policy: epsilon-greedy around a table that scores action 0
        # highest in every state.
        Q = np.zeros((n, 2))
        Q[:, 0] = 1
        policy = EpsilonGreedyPolicy(actions, Q, 0.001)

        # Passed in the slot the FrozenLake cells name `behavior_policy`.
        random_policy = RandomPolicy(actions)

        np.random.seed(i)
        # The keyword n=5 below is an argument of n_step_td_off_policy and is
        # unrelated to the chain length stored in the variable `n`.
        _, V_hist = n_step_td_off_policy(
            env, random_policy, policy, 500001, sample_step,
            n=5, save_every=1000, name="td_nchain_{}_{}".format(j,i),
        )

100%|██████████| 500001/500001 [02:59<00:00, 2787.99it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2855.56it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2848.26it/s]
100%|██████████| 500001/500001 [02:54<00:00, 2865.86it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2845.48it/s]
100%|██████████| 500001/500001 [02:54<00:00, 2862.43it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2847.17it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2853.32it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2856.23it/s]
100%|██████████| 500001/500001 [02:55<00:00, 2855.57it/s]


In [45]:
# Weighted importance sampling on NChain: 10 seeded runs per chain length.
for j in [5,15,20]:
    for i in range(10):
        n = j
        actions = [0, 1]
        env = NChainEnv(n=n, slip = 0.0)

        # Target policy: epsilon-greedy around a table that scores action 0
        # highest in every state.
        Q = np.zeros((n, 2))
        Q[:, 0] = 1
        policy = EpsilonGreedyPolicy(actions, Q, 0.001)

        # Passed in the slot the FrozenLake cells name `behavior_policy`.
        random_policy = RandomPolicy(actions)

        np.random.seed(i)
        last, hist = mc_weighted_importance_sampling(
            env, random_policy, policy, 500001, sample_episode,
            save_every=1000, name="mc_weighted_nchain_{}_{}".format(j,i),
        )


100%|██████████| 500001/500001 [01:37<00:00, 5103.24it/s]
100%|██████████| 500001/500001 [01:40<00:00, 4970.40it/s]
100%|██████████| 500001/500001 [01:38<00:00, 5061.29it/s]
100%|██████████| 500001/500001 [01:40<00:00, 4990.72it/s]
100%|██████████| 500001/500001 [01:43<00:00, 4847.98it/s]
100%|██████████| 500001/500001 [01:36<00:00, 5191.44it/s]
100%|██████████| 500001/500001 [01:33<00:00, 5350.47it/s]
100%|██████████| 500001/500001 [01:33<00:00, 5368.02it/s]
100%|██████████| 500001/500001 [01:33<00:00, 5353.61it/s]
100%|██████████| 500001/500001 [01:33<00:00, 5356.39it/s]


# BlackJack

In [42]:
# Ordinary importance sampling on Blackjack: 10 seeded runs.
for i in range(10):
    # Not used in this cell; presumably value ranges kept for plotting
    # elsewhere -- TODO confirm. Built with list(range(...)) so no
    # comprehension variable shadows the seed index `i`.
    player_values = list(range(12, 22))
    dealer_values = list(range(1, 11))

    env = gym.make('Blackjack-v0')

    policy = SimpleBlackjackPolicy()          # target policy (third argument below)
    actions = [0,1]
    blackjack_policy = RandomPolicy(actions)  # behavior policy (second argument below)

    # np.random.seed only seeds numpy's global generator. Gym environments keep
    # their own generator (see NChainEnv.seed in this notebook), so the
    # environment is seeded too to make run i repeatable.
    np.random.seed(i)
    env.seed(i)

    last, hist = mc_ordinary_importance_sampling(env, blackjack_policy, policy, 500001, sample_episode, save_every=1000, name= "mc_ordinary_bj_"+str(i))

100%|██████████| 500001/500001 [01:18<00:00, 6356.09it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6342.03it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6372.53it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6350.70it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6347.75it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6333.92it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6333.32it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6356.48it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6354.90it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6339.37it/s]


In [43]:
# On-policy Monte Carlo prediction on Blackjack: 10 seeded runs.
for i in range(10):
    # Not used in this cell; presumably value ranges kept for plotting
    # elsewhere -- TODO confirm. Built with list(range(...)) so no
    # comprehension variable shadows the seed index `i`.
    player_values = list(range(12, 22))
    dealer_values = list(range(1, 11))

    env = gym.make('Blackjack-v0')

    # mc_prediction only receives the evaluated policy, so no behavior policy
    # is constructed in this cell.
    policy = SimpleBlackjackPolicy()

    # np.random.seed only seeds numpy's global generator. Gym environments keep
    # their own generator (see NChainEnv.seed in this notebook), so the
    # environment is seeded too to make run i repeatable.
    np.random.seed(i)
    env.seed(i)

    _, hist = mc_prediction(env, policy, 500001, sample_episode, save_every=1000, name="mc_bj_"+str(i))

100%|██████████| 500001/500001 [01:18<00:00, 6408.24it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6390.67it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6386.17it/s]
100%|██████████| 500001/500001 [01:17<00:00, 6436.85it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6368.14it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6392.21it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6392.73it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6388.43it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6390.52it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6368.10it/s]


In [44]:
# Off-policy n-step TD on Blackjack: 10 seeded runs.
for i in range(10):
    # Not used in this cell; presumably value ranges kept for plotting
    # elsewhere -- TODO confirm. Built with list(range(...)) so no
    # comprehension variable shadows the seed index `i`.
    player_values = list(range(12, 22))
    dealer_values = list(range(1, 11))

    env = gym.make('Blackjack-v0')

    policy = SimpleBlackjackPolicy()          # target policy (third argument below)
    actions = [0,1]
    blackjack_policy = RandomPolicy(actions)  # behavior policy (second argument below)

    # np.random.seed only seeds numpy's global generator. Gym environments keep
    # their own generator (see NChainEnv.seed in this notebook), so the
    # environment is seeded too to make run i repeatable.
    np.random.seed(i)
    env.seed(i)

    _, V_hist = n_step_td_off_policy(env, blackjack_policy, policy, 500001, sample_step, n=5, save_every=1000, name="td_bj_"+str(i))

100%|██████████| 500001/500001 [01:35<00:00, 5243.97it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5239.34it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5258.21it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5240.71it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5214.70it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5248.96it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5236.03it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5237.13it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5238.65it/s]
100%|██████████| 500001/500001 [01:35<00:00, 5246.30it/s]


In [46]:
# Weighted importance sampling on Blackjack: 10 seeded runs.
for i in range(10):
    # Not used in this cell; presumably value ranges kept for plotting
    # elsewhere -- TODO confirm. Built with list(range(...)) so no
    # comprehension variable shadows the seed index `i`.
    player_values = list(range(12, 22))
    dealer_values = list(range(1, 11))

    env = gym.make('Blackjack-v0')

    policy = SimpleBlackjackPolicy()          # target policy (third argument below)
    actions = [0,1]
    blackjack_policy = RandomPolicy(actions)  # behavior policy (second argument below)

    # np.random.seed only seeds numpy's global generator. Gym environments keep
    # their own generator (see NChainEnv.seed in this notebook), so the
    # environment is seeded too to make run i repeatable.
    np.random.seed(i)
    env.seed(i)

    _, hist = mc_weighted_importance_sampling(env, blackjack_policy, policy, 500001, sample_episode, save_every=1000, name= "mc_weighted_bj_"+str(i))


100%|██████████| 500001/500001 [01:18<00:00, 6403.95it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6401.21it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6401.84it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6392.57it/s]
100%|██████████| 500001/500001 [01:17<00:00, 6440.97it/s]
100%|██████████| 500001/500001 [01:17<00:00, 6417.36it/s]
100%|██████████| 500001/500001 [01:17<00:00, 6421.85it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6409.23it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6378.54it/s]
100%|██████████| 500001/500001 [01:18<00:00, 6409.93it/s]


# FrozenLake

In [47]:

from gym.envs.registration import register
# Register a 4x4 FrozenLake variant with is_slippery=False under its own id.
try:
    register(
        id='FrozenLakeNotSlippery-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78, # optimum = .8196
    )
except gym.error.Error:
    # Legacy gym refuses to register an id twice, which makes this cell fail
    # when it is re-run in a live kernel. The id and arguments are fixed, so an
    # existing registration is equivalent and the error is safe to ignore.
    pass

In [49]:
# Weighted importance sampling on non-slippery FrozenLake: 10 seeded runs.
for i in range(10):
    env = gym.make('FrozenLakeNotSlippery-v0')
    # NOTE(review): the registration kwargs already pass is_slippery=False;
    # this assignment looks redundant -- confirm before removing.
    env.is_slippery = False

    # Preferred move per state: character k is the move for state k
    # (written four per line for readability).
    action_map = dict(enumerate(
        "DRDL"
        "DUDU"
        "RDDU"
        "URRD"
    ))
    index = {"L": 0, "D": 1, "R": 2, "U": 3}

    # One-hot Q-table: the preferred move of each state gets value 1.
    Q = np.zeros((16, 4))
    Q[list(action_map), [index[move] for move in action_map.values()]] = 1

    actions = [0,1,2,3]
    target_policy =  EpsilonGreedyPolicy(actions, Q, 0.001)
    behavior_policy = RandomPolicy(actions)

    np.random.seed(i)
    _, hist = mc_weighted_importance_sampling(
        env, behavior_policy, target_policy, 500001, sample_episode,
        save_every=1000, name= "mc_weighted_fl_"+str(i),
    )



100%|██████████| 500001/500001 [03:43<00:00, 2240.14it/s]
100%|██████████| 500001/500001 [03:41<00:00, 2253.99it/s]
100%|██████████| 500001/500001 [03:41<00:00, 2257.46it/s]
100%|██████████| 500001/500001 [03:41<00:00, 2256.15it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2272.27it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2267.76it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2268.23it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2268.42it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2272.18it/s]
100%|██████████| 500001/500001 [03:40<00:00, 2268.90it/s]


In [50]:
# Ordinary importance sampling on non-slippery FrozenLake: 10 seeded runs.
for i in range(10):
    env = gym.make('FrozenLakeNotSlippery-v0')
    # NOTE(review): the registration kwargs already pass is_slippery=False;
    # this assignment looks redundant -- confirm before removing.
    env.is_slippery = False

    # Preferred move per state: character k is the move for state k
    # (written four per line for readability).
    action_map = dict(enumerate(
        "DRDL"
        "DUDU"
        "RDDU"
        "URRD"
    ))
    index = {"L": 0, "D": 1, "R": 2, "U": 3}

    # One-hot Q-table: the preferred move of each state gets value 1.
    Q = np.zeros((16, 4))
    Q[list(action_map), [index[move] for move in action_map.values()]] = 1

    actions = [0,1,2,3]
    target_policy = EpsilonGreedyPolicy(actions, Q, 0.001)
    behavior_policy = RandomPolicy(actions)

    np.random.seed(i)
    _, hist = mc_ordinary_importance_sampling(
        env, behavior_policy, target_policy, 500001, sample_episode,
        save_every=1000, name= "mc_ordinary_fl_"+str(i),
    )




100%|██████████| 500001/500001 [03:37<00:00, 2295.76it/s]
100%|██████████| 500001/500001 [03:38<00:00, 2289.14it/s]
100%|██████████| 500001/500001 [03:37<00:00, 2299.82it/s]
100%|██████████| 500001/500001 [03:37<00:00, 2297.16it/s]
100%|██████████| 500001/500001 [03:42<00:00, 2250.94it/s]
100%|██████████| 500001/500001 [03:42<00:00, 2242.18it/s]
100%|██████████| 500001/500001 [03:46<00:00, 2206.44it/s]
100%|██████████| 500001/500001 [03:43<00:00, 2232.16it/s]
100%|██████████| 500001/500001 [03:36<00:00, 2307.42it/s]
100%|██████████| 500001/500001 [03:36<00:00, 2306.24it/s]


In [16]:
# Off-policy n-step TD on non-slippery FrozenLake: 10 seeded runs.
for i in range(10):
    env = gym.make('FrozenLakeNotSlippery-v0')
    # NOTE(review): the registration kwargs already pass is_slippery=False;
    # this assignment looks redundant -- confirm before removing.
    env.is_slippery = False

    # Preferred move per state: character k is the move for state k
    # (written four per line for readability).
    action_map = dict(enumerate(
        "DRDL"
        "DUDU"
        "RDDU"
        "URRD"
    ))
    index = {"L": 0, "D": 1, "R": 2, "U": 3}

    # One-hot Q-table: the preferred move of each state gets value 1.
    Q = np.zeros((16, 4))
    Q[list(action_map), [index[move] for move in action_map.values()]] = 1

    actions = [0,1,2,3]
    target_policy = EpsilonGreedyPolicy(actions, Q, 0.001)
    behavior_policy = RandomPolicy(actions)

    np.random.seed(i)
    # NOTE(review): the NChain and Blackjack TD cells pass n=5 explicitly; here
    # n is left at the function's default -- confirm the runs are comparable.
    _, hist = n_step_td_off_policy(
        env, behavior_policy, target_policy, 500001, sample_step,
        save_every=1000, name= "td_fl_"+str(i),
    )





100%|██████████| 500001/500001 [04:17<00:00, 1938.88it/s]
100%|██████████| 500001/500001 [04:17<00:00, 1939.26it/s]
100%|██████████| 500001/500001 [04:19<00:00, 1927.13it/s]
100%|██████████| 500001/500001 [04:28<00:00, 1860.75it/s]
100%|██████████| 500001/500001 [04:19<00:00, 1927.89it/s]
100%|██████████| 500001/500001 [04:18<00:00, 1932.34it/s]
100%|██████████| 500001/500001 [04:19<00:00, 1929.63it/s]
100%|██████████| 500001/500001 [04:19<00:00, 1929.31it/s]
100%|██████████| 500001/500001 [04:22<00:00, 1907.54it/s]
100%|██████████| 500001/500001 [04:30<00:00, 1847.27it/s]


In [53]:
# On-policy Monte Carlo prediction on non-slippery FrozenLake: 10 seeded runs.
for i in range(10):
    env = gym.make('FrozenLakeNotSlippery-v0')
    # NOTE(review): the registration kwargs already pass is_slippery=False;
    # this assignment looks redundant -- confirm before removing.
    env.is_slippery = False

    # Preferred move per state: character k is the move for state k
    # (written four per line for readability).
    action_map = dict(enumerate(
        "DRDL"
        "DUDU"
        "RDDU"
        "URRD"
    ))
    index = {"L": 0, "D": 1, "R": 2, "U": 3}

    # One-hot Q-table: the preferred move of each state gets value 1.
    Q = np.zeros((16, 4))
    Q[list(action_map), [index[move] for move in action_map.values()]] = 1

    actions = [0,1,2,3]
    target_policy = EpsilonGreedyPolicy(actions, Q, 0.001)

    # mc_prediction is given the target policy itself (not a random agent), so
    # no behavior policy is constructed in this cell.
    np.random.seed(i)
    _, hist = mc_prediction(env, target_policy, 500001, sample_episode, save_every=1000, name= "mc_fl_"+str(i))





100%|██████████| 500001/500001 [03:24<00:00, 2446.48it/s]
100%|██████████| 500001/500001 [03:22<00:00, 2472.02it/s]
100%|██████████| 500001/500001 [03:21<00:00, 2480.71it/s]
100%|██████████| 500001/500001 [03:22<00:00, 2473.04it/s]
100%|██████████| 500001/500001 [03:20<00:00, 2491.56it/s]
100%|██████████| 500001/500001 [03:22<00:00, 2464.86it/s]
100%|██████████| 500001/500001 [03:24<00:00, 2447.85it/s]
100%|██████████| 500001/500001 [03:25<00:00, 2430.37it/s]
100%|██████████| 500001/500001 [03:31<00:00, 2368.40it/s]
100%|██████████| 500001/500001 [03:25<00:00, 2429.73it/s]
