In [1]:
from SailingEnvDQN import *
import sys
from contextlib import closing
import random
import numpy as np
from io import StringIO
from utils import *
from gym import utils, Env, spaces
from gym.utils import seeding
from gym.envs.toy_text import discrete
from gym.utils import seeding
from collections import deque

import torch
import torch
import torch.nn as nn
import csv

In [2]:
environment_config = dict(
    total_steps = 2000, # 2000
    random_seed = 10,
    is_random_env = False,
    map_name = "16x16", #16x16, 8x8 
    is_slippery = True
)


In [3]:
# Run this cell without modification

def evaluate(policy, num_episodes=1, seed=0, env_name='SailingEnvDQN',
             render=False):
    """This function evaluate the given policy and return the mean episode 
    reward.
    :param policy: a function whose input is the observation
    :param num_episodes: number of episodes you wish to run
    :param seed: the random seed
    :param env_name: the name of the environment
    :param render: a boolean flag indicating whether to render policy
    :return: the averaged episode reward of the given policy.
    """
    env = SailingEnvDQN(environment_config)
    env.seed(seed)
    rewards = []
    steps = []
    if render: num_episodes = 1
    for i in range(num_episodes):
        obs = env.reset()
        act = policy(obs)
        ep_reward = 0
        ep_step = 0
        while True:
            obs, reward, done = env.step(act)
            act = policy(obs)
            ep_reward += reward
            ep_step += 1
            if render:
                env.render()
                wait(sleep=0.05)
            if done:
                break
        rewards.append(ep_reward)
        steps.append(ep_step) 
    if render:
        env.close()
    return np.mean(rewards), np.mean(steps)

In [4]:
# Run this cell without modification

def run(trainer_cls, config=None, reward_threshold=None):
    """Run the trainer and report progress, agnostic to the class of trainer
    :param trainer_cls: A trainer class 
    :param config: A dict
    :param reward_threshold: the reward threshold to break the training
    :return: The trained trainer and a dataframe containing learning progress
    """
    assert inspect.isclass(trainer_cls)
    if config is None:
        config = {}
    trainer = trainer_cls(config)
    config = trainer.config
    start = now = time.time()
    stats = []
    benchmark = []
    for i in range(config['max_iteration'] + 1):
#         print("Current iteration: {}".format(i))
        stat = trainer.train()
        stats.append(stat or {})
        if i % config['evaluate_interval'] == 0 or \
                i == config["max_iteration"]:
            reward, step = trainer.evaluate(config.get("evaluate_num_episodes", 50))
            result = dict(iteration=i, mean_reward=reward,mean_step=step)
            benchmark.append(result)
            print("({:.1f}s,+{:.1f}s)\tIteration {}, current mean episode "
                  "reward is {} current mean step is {}. {}".format(
                time.time() - start, time.time() - now, i, reward, step,
                {k: round(np.mean(v), 4) for k, v in
                 stat.items()} if stat else ""))
            now = time.time()
        if reward_threshold is not None and reward > reward_threshold:
            print("In {} iteration, current mean episode reward {:.3f} is "
                  "greater than reward threshold {}. Congratulation! Now we "
                  "exit the training process.".format(
                i, reward, reward_threshold))
            break
    return trainer, stats, benchmark

In [5]:
from collections import deque
import random

class ExperienceReplayMemory:
    """Store and sample the transitions"""
    def __init__(self, capacity):
        # deque is a useful class which acts like a list but only contain
        # finite elements.When appending new element make deque exceeds the 
        # `maxlen`, the oldest element (the index 0 element) will be removed.
        
        # [TODO] uncomment next line. 
        self.memory = deque(maxlen=capacity)

    def push(self, transition):
        self.memory.append(transition)

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [6]:
# Solve the TODOs and remove `pass`

class PytorchModel(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super(PytorchModel, self).__init__()
        
        # [TODO] Build a sequential model with two layers.
        # The first hidden layer has 100 hidden nodes, followed by
        # a ReLU activation function.
        # The second output layer take the activation vector, who has
        # 100 elements, as input and return the action values.
        # So the return values is a vector with num_actions elements.

        self.action_value = nn.Sequential(
            nn.Linear(obs_dim, 128),
            nn.ReLU(),
            nn.Linear(128, act_dim),
        )
#         self.action_value = nn.Sequential(
#             nn.Linear(input_shape[0], 256),
#             nn.ReLU(),
#             nn.Linear(256, 128),
#             nn.ReLU(),
#             nn.Linear(128, 64),
#             nn.ReLU(),
#             nn.Linear(64, num_actions),
#         )

    
    def forward(self, obs):
        return self.action_value(obs)
    
# Test
assert isinstance(PytorchModel(3, 7).action_value, nn.Module)

In [7]:
pytorch_config = dict(
    env_name="SailingEnvDQN",
    max_iteration=1000,
    max_episode_length=1000,
    evaluate_interval=5,
    gamma=0.99,
    eps=0.3,
    seed=0,
    parameter_std=0.01,
    learning_rate=0.01,
    hidden_dim=100,
    clip_norm=1.0,
    clip_gradient=True,
    memory_size=50000,
    learn_start=1000,
    batch_size=32,
    target_update_freq=500,  # in steps
    learn_freq=1,  # in steps
    n=1
)



def to_tensor(x):
    """A helper function to transform a numpy array to a Pytorch Tensor"""
    if isinstance(x, np.ndarray):
        x = torch.from_numpy(x).type(torch.float32)
    assert isinstance(x, torch.Tensor)
    if x.dim() == 3 or x.dim() == 1:
        x = x.unsqueeze(0)
    assert x.dim() == 2 or x.dim() == 4, x.shape
    return x

In [8]:
class DQNTrainer():
    def __init__(self, config):
        self.config = config
        self.env_name='SailingEnvDQN'
        self.env = SailingEnvDQN(environment_config)
        self.action_dim = self.env.action_space.n
        self.obs_dim = self.env.observation_space.n
        self.eps = self.config['eps']
        self.hidden_dim = self.config["hidden_dim"]
        self.max_episode_length = self.config["max_episode_length"]
        self.learning_rate = self.config["learning_rate"]
        self.gamma = self.config["gamma"]
        self.n = self.config["n"]
        self.initialize_parameters()
        self.learning_rate = self.config["learning_rate"]
        self.learn_start = self.config["learn_start"]
        self.batch_size = self.config["batch_size"]
        self.target_update_freq = self.config["target_update_freq"]
        self.clip_norm = self.config["clip_norm"]
        self.step_since_update = 0
        self.total_step = 0
        self.memory = ExperienceReplayMemory(self.config["memory_size"])
        
    def initialize_parameters(self):
        input_shape = self.env.observation_space.shape

        # [TODO] Initialize two network using PytorchModel class
        self.network = PytorchModel(self.obs_dim,  self.action_dim)  # PytorchModel((3,), 7)

        self.network.eval()
        self.network.share_memory()

        self.target_network = PytorchModel(self.obs_dim, self.action_dim)
        self.target_network.load_state_dict(self.network.state_dict())

        self.target_network.eval()

        # Build Adam optimizer and MSE Loss.
        # [TODO] Uncomment next few lines
        self.optimizer = torch.optim.Adam(
            self.network.parameters(), lr=self.learning_rate
        )
        self.loss = nn.MSELoss()
        
        
        
    def compute_action(self, processed_state, eps=None):
        """Compute the action given the state. Note that the input
        is the processed state."""

        values = self.compute_values(processed_state)
        assert values.ndim == 1, values.shape

        if eps is None:
            eps = self.eps

        if np.random.uniform(0, 1)  <= eps:
            action = np.random.randint(self.env.action_space.n)
        else:
            action = np.argmax(values)

        
        return action

    def evaluate(self, num_episodes=50, *args, **kwargs):
        """Use the function you write to evaluate current policy.
        Return the mean episode reward of 50 episodes."""
        policy = lambda raw_state: self.compute_action(
            self.process_state(raw_state), eps=0.0)
        result = evaluate(policy, num_episodes, *args, **kwargs)
        return result
    def compute_values(self, processed_state):
        """Compute the value for each potential action. Note that you
        should NOT preprocess the state here."""
        # [TODO] Convert the output of neural network to numpy array
        values = self.network(processed_state).detach().numpy()

    
        return values

    def train(self):
        s = self.env.reset()
        processed_s = self.process_state(s)
        act = self.compute_action(processed_s)
        stat = {"loss": []}

        for t in range(self.max_episode_length):
            next_state, reward, done = self.env.step(act)
#             print(next_state)

            next_processed_s = self.process_state(next_state)

            # Push the transition into memory.
            self.memory.push(
                (processed_s, act, reward, next_processed_s, done)
            )

            processed_s = next_processed_s
            act = self.compute_action(next_processed_s)
            self.step_since_update += 1
            self.total_step += 1

            if done:
                break
                
            if t % self.config["learn_freq"] != 0:
                # It's not necessary to update in each step.
                continue

            if len(self.memory) < self.learn_start:
                continue
            elif len(self.memory) == self.learn_start:
                print("Current memory contains {} transitions, "
                      "start learning!".format(self.learn_start))
            batch = self.memory.sample(self.batch_size)

            # Transform a batch of state / action / .. into a tensor.
            state_batch = to_tensor(
                np.stack([transition[0] for transition in batch])
            )
            action_batch = to_tensor(
                np.stack([transition[1] for transition in batch])
            )
            reward_batch = to_tensor(
                np.stack([transition[2] for transition in batch])
            )
            next_state_batch = torch.stack(
                [transition[3] for transition in batch]
            )
            done_batch = to_tensor(
                np.stack([transition[4] for transition in batch])
            )

            with torch.no_grad():
                    
                Q_t_plus_one = torch.max(self.target_network(next_state_batch).detach(), 1)[0] # to 1-D and get the tensor
                
                assert isinstance(Q_t_plus_one, torch.Tensor)
                assert Q_t_plus_one.dim() == 1
                
                # [TODO] Compute the target value of Q in batch.
                
                
                #use (1.0 − done) to determine if the game is ended or not
                Q_target = (reward_batch + (1 - done_batch) * self.gamma * Q_t_plus_one).reshape(self.batch_size,) 
                assert Q_target.shape == (self.batch_size,)

            self.network.train()


            Q_t = self.network(state_batch).gather(dim = 1, index = action_batch.reshape(self.batch_size,1).long()).reshape(self.batch_size,)
            assert Q_t.shape == Q_target.shape

            self.optimizer.zero_grad()
            loss = self.loss(input=Q_t, target=Q_target)
            loss_value = loss.item()
            stat['loss'].append(loss_value)
            loss.backward()
            

            nn.utils.clip_grad_norm_(self.network.parameters(), self.clip_norm)

            
            self.optimizer.step()
            self.network.eval()

        if len(self.memory) >= self.learn_start and \
                self.step_since_update > self.target_update_freq:
#             print("{} steps has passed since last update. Now update the"
#                   " parameter of the behavior policy. Current step: {}".format(
#                 self.step_since_update, self.total_step
#             ))
            self.step_since_update = 0
            # [TODO] Copy the weights of self.network to self.target_network.
            self.target_network.load_state_dict(self.network.state_dict())
            
            self.target_network.eval()
            
        return {"loss": np.mean(stat["loss"]), "episode_len": t}
   
    def compute_gradient(self, processed_states, actions, rewards, tau, T):
        """Compute the gradient"""
        n = self.n
        G = 0.0
        for i in range(tau, min(T, tau + n)):
            G += rewards[i+1] * np.power(self.gamma, i - tau)
        

        
        if tau + n < T:

            G += self.gamma ** n * self.compute_values(processed_states[tau + n])[actions[tau + n]]

        loss_grad = np.zeros((self.act_dim, 1))


        loss_grad[[actions[tau]]] = -(G - self.compute_values(processed_states[tau])[actions[tau]])

        value_grad = processed_states[tau].reshape(self.obs_dim, 1)
        

        assert loss_grad.shape == (self.act_dim, 1), loss_grad.shape
        assert value_grad.shape == (self.obs_dim, 1), value_grad.shape

        gradient = np.dot(loss_grad, value_grad.T).T
    
        return gradient

    def apply_gradient(self, gradient):
        """Apply the gradient to the parameter."""
        assert gradient.shape == self.parameters.shape, (
            gradient.shape, self.parameters.shape)
        
        # [TODO] apply the gradient to self.parameters
        self.parameters -= self.learning_rate * gradient
    def process_state(self, state):
        return torch.from_numpy(state).type(torch.float32)

In [9]:
config = merge_config(environment_config,
                     pytorch_config)

In [None]:
pytorch_trainer, pytorch_stat, pytorch_benchmark = run(DQNTrainer, config)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


(0.2s,+0.2s)	Iteration 0, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 16.0}
(0.3s,+0.2s)	Iteration 5, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 18.0}
(0.5s,+0.2s)	Iteration 10, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 16.0}
(0.7s,+0.2s)	Iteration 15, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 23.0}
(0.9s,+0.2s)	Iteration 20, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 23.0}
(1.1s,+0.2s)	Iteration 25, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 60.0}
(1.2s,+0.2s)	Iteration 30, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': nan, 'episode_len': 14.0}
Current memory contains 1000 transitions, start learning!
(1.5s,+0.3s)	Iteration 35, current mean episode reward 

(124.9s,+0.4s)	Iteration 300, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 932502.7964, 'episode_len': 16.0}
(125.3s,+0.3s)	Iteration 305, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 723561.1942, 'episode_len': 23.0}
(125.6s,+0.4s)	Iteration 310, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 982203.0265, 'episode_len': 19.0}
(126.1s,+0.4s)	Iteration 315, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 862420.4083, 'episode_len': 23.0}
(126.4s,+0.3s)	Iteration 320, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 642311.2169, 'episode_len': 21.0}
(126.7s,+0.3s)	Iteration 325, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 1167939.5984, 'episode_len': 16.0}
(127.1s,+0.4s)	Iteration 330, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 814268.8341, 'episode_len': 33.0}
(127.5s,+0.4s)	Iteration 335, cur

(147.4s,+0.4s)	Iteration 600, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 127724.4725, 'episode_len': 16.0}
(147.8s,+0.4s)	Iteration 605, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 153008.6808, 'episode_len': 14.0}
(148.2s,+0.4s)	Iteration 610, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': 136378.2201, 'episode_len': 19.0}
(148.5s,+0.3s)	Iteration 615, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 135816.7661, 'episode_len': 21.0}
(148.9s,+0.4s)	Iteration 620, current mean episode reward is 7986.0 current mean step is 15.0. {'loss': 157283.7169, 'episode_len': 21.0}
(149.3s,+0.4s)	Iteration 625, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': 165751.9288, 'episode_len': 21.0}
(149.7s,+0.4s)	Iteration 630, current mean episode reward is 3986.0 current mean step is 15.0. {'loss': 145392.0577, 'episode_len': 22.0}
(150.1s,+0.4s)	Iteration 635, curr

In [11]:
benchmark_dir = "benchmark/"
benchmark_col = ["iteration", "mean_reward", "mean_step"]
benchmark_name = "%sDQN_%s_%s.csv" % (benchmark_dir, environment_config["map_name"], environment_config["total_steps"])

In [12]:
try:
    with open(benchmark_name, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=benchmark_col)
        writer.writeheader()
        for data in pytorch_benchmark:
            writer.writerow(data)
except IOError:
    print("I/O error")