In [1]:
import sys
from contextlib import closing
import random
import numpy as np
from io import StringIO
from utils import *
from gym import utils, Env, spaces
from gym.utils import seeding
from gym.envs.toy_text import discrete
from gym.utils import seeding
from collections import deque
import csv
from SailingEnv import *

In [2]:
environment_config = dict(
    total_steps = 500,
    random_seed = 10,
    is_random_env = False,
    map_name = "8x8",  
    is_slippery = True
)


In [3]:
env = SailingEnv(environment_config)

In [4]:
print("Current observation space: {}".format(env.observation_space))
print("Current action space: {}".format(env.action_space))
print("0 in action space? {}".format(env.action_space.contains(0)))
print("5 in action space? {}".format(env.action_space.contains(5)))

Current observation space: Discrete(64)
Current action space: Discrete(4)
0 in action space? True
5 in action space? False


In [5]:
# env.reset()

# while True:
    
#     # take random action
#     # [TODO] Uncomment next line
#     obs, reward, done, info = env.step(env.action_space.sample())

#     # render the environment
#     env.render()  # [TODO] Uncomment this line

#     print("Current step: {}\nCurrent observation: {}\nCurrent reward: {}\n"
#           "Whether we are done: {}\ninfo: {}".format(
#         env.current_step, obs, reward, done, info
#     ))
#     wait(sleep=0.4)
#     # [TODO] terminate the loop if done
#     if done:
#         break
# #     pass

In [6]:
# Solve the TODOs and remove `pass`

def _render_helper(env):
    env.render()
    wait(sleep=0.2)


def evaluate(policy, num_episodes, seed=0, env_name='SailingEnv', render=False):
    """[TODO] You need to implement this function by yourself. It
    evaluate the given policy and return the mean episode reward.
    We use `seed` argument for testing purpose.
    You should pass the tests in the next cell.

    :param policy: a function whose input is an interger (observation)
    :param num_episodes: number of episodes you wish to run
    :param seed: an interger, used for testing.
    :param env_name: the name of the environment
    :param render: a boolean flag. If true, please call _render_helper
    function.
    :return: the averaged episode reward of the given policy.
    """

    # Create environment (according to env_name, we will use env other than 'FrozenLake8x8-v0')
    env = SailingEnv(environment_config)

    # Seed the environment
    env.seed(seed)

    # Build inner loop to run.
    # For each episode, do not set the limit.
    # Only terminate episode (reset environment) when done = True.
    # The episode reward is the sum of all rewards happen within one episode.
    # Call the helper function `render(env)` to render
    rewards = []
    steps = []
    for i in range(num_episodes):
        # reset the environment
        obs = env.reset()
        act = policy(obs)
        
        ep_reward = 0
        ep_step = 0
        while True:
            # [TODO] run the environment and terminate it if done, collect the
            # reward at each step and sum them to the episode reward.
            obs, reward, done, info = env.step(act)
            act = policy(obs)
            ep_reward += reward
            ep_step += 1

            if done:

                break
        steps.append(ep_step)  
        rewards.append(ep_reward)
        
        
    return np.mean(rewards), np.mean(steps)

# [TODO] Run next cell to test your implementation!

In [7]:
# Run this cell without modification

class TabularRLTrainerAbstract:
    """This is the abstract class for tabular RL trainer. We will inherent the specify 
    algorithm's trainer from this abstract class, so that we can reuse the codes like
    getting the dynamic of the environment (self._get_transitions()) or rendering the
    learned policy (self.render())."""
    
    def __init__(self, env_name="SailingEnv", model_based=True):
        self.env_name = env_name
        self.env = SailingEnv(environment_config)
        self.action_dim = self.env.action_space.n
        self.obs_dim = self.env.observation_space.n
        
        self.model_based = model_based

    def _get_transitions(self, state, act):
        """Query the environment to get the transition probability,
        reward, the next state, and done given a pair of state and action.
        We implement this function for you. But you need to know the 
        return format of this function.
        """
        self._check_env_name()
        assert self.model_based, "You should not use _get_transitions in " \
            "model-free algorithm!"
        
        # call the internal attribute of the environments.
        # `transitions` is a list contain all possible next states and the 
        # probability, reward, and termination indicater corresponding to it
        transitions = self.env.P[state][act]
#         print(transitions)
        # Given a certain state and action pair, it is possible
        # to find there exist multiple transitions, since the 
        # environment is not deterministic.
        # You need to know the return format of this function: a list of dicts
        ret = []
        for prob, next_state, reward, done in transitions:
            ret.append({
                "prob": prob,
                "next_state": next_state,
                "reward": reward,
                "done": done
            })
        return ret
    
    def _check_env_name(self):
        assert self.env_name.startswith('SailingEnv')

    def print_table(self):
        """print beautiful table, only work for FrozenLake8X8-v0 env. We 
        write this function for you."""
        self._check_env_name()
        print_table(self.table)

    def train(self):
        """Conduct one iteration of learning."""
        raise NotImplementedError("You need to override the "
                                  "Trainer.train() function.")

    def evaluate(self):
        """Use the function you write to evaluate current policy.
        Return the mean episode reward of 1000 episodes when seed=0."""
        mean_reward, mean_step = evaluate(self.policy, 1000, env_name=self.env_name)
        return [mean_reward, mean_step]

    def render(self):
        """Reuse your evaluate function, render current policy 
        for one episode when seed=0"""
        evaluate(self.policy, 1, render=True, env_name=self.env_name)

In [8]:
# Solve the TODOs and remove `pass`

class PolicyItertaionTrainer(TabularRLTrainerAbstract):
    def random_policy(ops):
            return np.random.choice(self.action_dim, size=(self.env.observation_space.n))
    def __init__(self, gamma=1.0, eps=1e-10, env_name='SailingEnv'):
        super(PolicyItertaionTrainer, self).__init__(env_name)

        # discount factor
        self.gamma = gamma

        # value function convergence criterion
        self.eps = eps

        # build the value table for each possible observation
        self.table = np.zeros((self.obs_dim,))

        policy_array = np.random.randint(self.action_dim, size = (self.obs_dim))

        self.policy = lambda obs: policy_array[obs]

        test_random_policy(self.policy, self.env)
        
    
    
    def train(self):
        """Conduct one iteration of learning."""

        self.table = np.zeros((self.obs_dim,))
        self.update_value_function()
        self.update_policy()

    def update_value_function(self):
        count = 0  # count the steps of value updates
        while True:
            old_table = self.table.copy()

            for state in range(self.obs_dim):
                
                act = self.policy(state)
                transition_list = self._get_transitions(state, act)
                state_value = 0
                for transition in transition_list:
                    
                    prob = transition['prob']
                    reward = transition['reward']
                    next_state = transition['next_state']
                    done = transition['done']
                    

                    state_value += prob * (reward + self.gamma * old_table[next_state])
                # update the state value
                    
                self.table[state] = state_value
            

            # [TODO] Compare the old_table and current table to
            #  decide whether to break the value update process.
            # hint: you should use self.eps, old_table and self.table
            should_break = True if np.sum(np.abs(old_table - self.table)) < self.eps else False

            if should_break:
                break
            count += 1
            if count % 20000 == 0:
                # disable this part if you think debug message annoying.
                
                print("[DEBUG]\tUpdated values for {} steps. "
                      "Difference between new and old table is: {}".format(
                    count, np.sum(np.abs(old_table - self.table))
                ))
#             if count > 4000:
#                 print("[HINT] Are you sure your codes is OK? It shouldn't be "
#                       "so hard to update the value function. You already "
#                       "use {} steps to update value function within "
#                       "single iteration.".format(count))
#             if count > 6000:
#                 raise ValueError("Clearly your code has problem. Check it!")

    def update_policy(self):
        """You need to define a new policy function, given current
        value function. The best action for a given state is the one that
        has greatest expected return.

        To optimize computing efficiency, we introduce a policy table,
        which take state as index and return the action given a state.
        """
        policy_table = np.zeros([self.obs_dim, ], dtype=np.int)

        for state in range(self.obs_dim):
            state_action_values = [0] * self.action_dim
            
            # [TODO] assign the action with greatest "value"
            # to policy_table[state]
            # hint: what is the proper "value" here?
            #  you should use table, gamma, reward, prob,
            #  next_state and self._get_transitions() function
            #  as what we done at self.update_value_function()
            #  Bellman equation may help.
            for action in range(self.action_dim):
                transition_list = self._get_transitions(state, action)
                for transition in transition_list:
                    prob = transition['prob']
                    reward = transition['reward']
                    next_state = transition['next_state']
                    done = transition['done']
                    state_action_values[action] += prob * (reward + self.gamma * self.table[next_state])
            best_action = np.argmax(state_action_values)
            
            
            policy_table[state] = best_action
        self.policy = lambda obs: policy_table[obs]


In [9]:
# Solve the TODOs and remove `pass`


class ValueIterationTrainer(PolicyItertaionTrainer):
    """Note that we inherate Policy Iteration Trainer, to resue the
    code of update_policy(). It's same since it get optimal policy from
    current state-value table (self.table).
    """

    def __init__(self, gamma=1.0, env_name='SailingEnv'):
        super(ValueIterationTrainer, self).__init__(gamma, None, env_name)

    def train(self):
        """Conduct one iteration of learning."""

        self.update_value_function()

    def update_value_function(self):
        old_table = self.table.copy()
        for state in range(self.obs_dim):
            state_value = 0
            state_action_values = [0] * self.action_dim
            for action in range(self.action_dim):
                transition_list = self._get_transitions(state, action)
                for transition in transition_list:
                    prob = transition['prob']
                    reward = transition['reward']
                    next_state = transition['next_state']
                    done = transition['done']
                    state_action_values[action] += prob * (reward + self.gamma * self.table[next_state])

                state_value = np.max(state_action_values)
                self.table[state] = state_value


    def evaluate(self):
        """Since in value itertaion we do not maintain a policy function,
        so we need to retrieve it when we need it."""
        self.update_policy()
        return super().evaluate()

    def render(self):
        """Since in value itertaion we do not maintain a policy function,
        so we need to retrieve it when we need it."""
        self.update_policy()
        return super().render()


In [10]:
# Solve the TODOs and remove `pass`

# Managing configurations of your experiments is important for your research.
default_pi_config = dict(
    max_iteration=20,
    evaluate_interval=1,
    gamma=0.99,
    eps=1e-10
)


def policy_iteration(train_config=None):
    config = default_pi_config.copy()
    if train_config is not None:
        config.update(train_config)
        
    trainer = PolicyItertaionTrainer(gamma=config['gamma'], eps=config['eps'])

    old_policy_result = {
        obs: -1 for obs in range(trainer.obs_dim)
    }
    benchmark = []
    for i in range(config['max_iteration']):

        # train the agent
        trainer.train()  
        new_policy_result = {
             obs: trainer.table[obs] for obs in range(trainer.obs_dim)
        }
        
#         should_stop = True if new_policy_result == old_policy_result else False
#         if should_stop:
#             print("We found policy is not changed anymore at "
#                   "itertaion {}. Current mean episode reward "
#                   "is {}. Current mean episode step is {}. Stop training.".format(i, trainer.evaluate()[0], trainer.evaluate()[1]))
#             break
        old_policy_result = new_policy_result
#         print(old_policy_result)
        # evaluate the result
        if i % config['evaluate_interval'] == 0:
            result = dict(iteration=i, mean_reward=trainer.evaluate()[0],mean_step=trainer.evaluate()[1])
            benchmark.append(result)
            print(
                "[INFO]\tIn {} iteration, current mean episode reward is {}, current mean episode step is {}."
                "".format(i, trainer.evaluate()[0], trainer.evaluate()[1]))

#             if i > 20:
#                 print("You sure your codes is OK? It shouldn't take so many "
#                       "({}) iterations to train a policy iteration "
#                       "agent.".format(i))

#     assert trainer.evaluate() > 0.8, \
#         "We expect to get the mean episode reward greater than 0.8. " \
#         "But you get: {}. Please check your codes.".format(trainer.evaluate())

    return trainer, benchmark


In [11]:
# Run this cell without modification

# It may be confusing to call a trainer agent. But that's what we normally do.
pi_agent, benchmark = policy_iteration()

[INFO]	In 0 iteration, current mean episode reward is 16.398, current mean episode step is 9.602.
[INFO]	In 1 iteration, current mean episode reward is 348.332, current mean episode step is 56.068.
[INFO]	In 2 iteration, current mean episode reward is 479.937, current mean episode step is 55.663.
[INFO]	In 3 iteration, current mean episode reward is 710.843, current mean episode step is 80.957.
[INFO]	In 4 iteration, current mean episode reward is 903.417, current mean episode step is 96.583.
[INFO]	In 5 iteration, current mean episode reward is 904.794, current mean episode step is 95.206.
[INFO]	In 6 iteration, current mean episode reward is 906.714, current mean episode step is 93.286.
[INFO]	In 7 iteration, current mean episode reward is 906.714, current mean episode step is 93.286.
[INFO]	In 8 iteration, current mean episode reward is 906.714, current mean episode step is 93.286.
[INFO]	In 9 iteration, current mean episode reward is 906.714, current mean episode step is 93.286.
[I

In [12]:
benchmark_dir = "benchmark/"
benchmark_col = ["iteration", "mean_reward", "mean_step"]
benchmark_name = "%sPI_%s_%s.csv" % (benchmark_dir, environment_config["map_name"], environment_config["total_steps"])

In [13]:
try:
    with open(benchmark_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=benchmark_col)
        writer.writeheader()
        for data in benchmark:
            writer.writerow(data)
except IOError:
    print("I/O error")

In [12]:
# Solve the TODOs and remove `pass`

# Managing configurations of your experiments is important for your research.
default_vi_config = dict(
    max_iteration=20,
    evaluate_interval=1,  # don't need to update policy each iteration
    gamma=0.99,
    eps=1e-10
)


def value_iteration(train_config=None):
    config = default_vi_config.copy()
    if train_config is not None:
        config.update(train_config)

    # [TODO] initialize Value Iteration Trainer. Remember to pass
    #  config['gamma'] to it.
    trainer = ValueIterationTrainer(gamma=config['gamma'])

    old_state_value_table = trainer.table.copy()

    for i in range(config['max_iteration']):
        # train the agent
        trainer.train() 
        new_state_value_table = trainer.table
        # evaluate the result
        if i % config['evaluate_interval'] == 0:
            print(
                "[INFO]\tIn {} iteration, current mean episode reward is {}, current mean episode step is {}."
                "".format(i, trainer.evaluate()[0], trainer.evaluate()[1]))


#             should_stop = True if np.sum(np.abs(old_state_value_table - new_state_value_table)) < config["eps"] else False
            
            
#             if should_stop:
#                 print("We found policy is not changed anymore at "
#                   "itertaion {}. Current mean episode reward "
#                   "is {}. Current mean episode step is {}. Stop training.".format(i, trainer.evaluate()[0], trainer.evaluate()[1]))
#                 break
            old_state_value_table = new_state_value_table
            if i > 3000:
                print("You sure your codes is OK? It shouldn't take so many "
                      "({}) iterations to train a policy iteration "
                      "agent.".format(
                    i))

#     assert trainer.evaluate() > 0.8, \
#         "We expect to get the mean episode reward greater than 0.8. " \
#         "But you get: {}. Please check your codes.".format(trainer.evaluate())

    return trainer


In [13]:
# Run this cell without modification

vi_agent = value_iteration()

[INFO]	In 0 iteration, current mean episode reward is 44.756, current mean episode step is 67.244.
[INFO]	In 1 iteration, current mean episode reward is -64.627, current mean episode step is 64.627.
We found policy is not changed anymore at itertaion 1. Current mean episode reward is -64.627. Current mean episode step is 64.627. Stop training.
