In [1]:
# Environment setup: install gym with its Box2D extra (the log below shows it
# pulling in box2d-py and pygame); main() creates 'LunarLander-v2' from gym.
# NOTE(review): no version is pinned here — consider pinning for reproducibility.
!pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting box2d-py==2.3.5
  Downloading box2d_py-2.3.5-cp37-cp37m-manylinux1_x86_64.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 36.5 MB/s 
[?25hCollecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 66.5 MB/s 
Installing collected packages: pygame, box2d-py
Successfully installed box2d-py-2.3.5 pygame-2.1.0


In [2]:
'''DLP DQN Lab'''
__author__ = 'chengscott'
__copyright__ = 'Copyright 2020, NCTU CGI Lab'
import argparse
from collections import deque
import itertools
import random
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter 




class ReplayMemory:
    '''Bounded FIFO store of transitions used for experience replay.

    The storage is a ``deque(maxlen=capacity)``: once ``capacity`` entries
    are held, appending a new one drops the oldest.
    '''
    __slots__ = ['buffer']

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        '''number of transitions currently stored'''
        return len(self.buffer)

    def append(self, *transition):
        '''store one transition; every field must be iterable

        Callers in this file pass
        (state, action, reward, next_state, done); each field is frozen
        into a tuple before being stored.
        '''
        frozen = tuple(tuple(field) for field in transition)
        self.buffer.append(frozen)

    def sample(self, batch_size, device):
        '''draw ``batch_size`` distinct transitions and batch them per field

        Returns a generator that yields one float tensor per transition
        field (in storage order), each created on ``device``.
        '''
        batch = random.sample(self.buffer, batch_size)
        # Transpose: a list of transitions -> one column per field.
        columns = zip(*batch)
        return (torch.tensor(column, dtype=torch.float, device=device)
                for column in columns)


class Net(nn.Module):
    '''Three-layer fully connected network producing one Q-value per action.

    Args:
        state_dim: size of the input state vector.
        action_dim: number of outputs (one per discrete action).
        hidden_dim: width of the two hidden layers.

    The sub-module names (``fc1``/``fc2``/``fc3``, each an ``nn.Sequential``)
    determine the state-dict keys used by ``DQN.save``/``DQN.load``.
    '''

    def __init__(self, state_dim=8, action_dim=4, hidden_dim=32):
        super().__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(inplace=True)
        )
        self.fc2 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True)
        )
        self.fc3 = nn.Sequential(
            nn.Linear(hidden_dim, action_dim)
        )

    def forward(self, x):
        '''Compute Q-values for ``x``.

        ``x`` may be a tensor or anything ``torch.as_tensor`` accepts
        (list, tuple, numpy array). It is moved to the device and dtype of
        this network's own parameters, so the module works wherever it was
        placed with ``.to(device)`` instead of assuming CUDA.
        '''
        reference = self.fc1[0].weight
        # as_tensor avoids the extra copy (and UserWarning) that
        # torch.tensor() incurs when x is already a tensor.
        x = torch.as_tensor(x, dtype=reference.dtype, device=reference.device)
        x = self.fc1(x)
        x = self.fc2(x)
        return self.fc3(x)


class DQN:
    '''DQN agent: behavior network, target network, optimizer and replay memory.

    ``args`` must provide ``device``, ``lr``, ``capacity``, ``batch_size``,
    ``gamma``, ``freq`` and ``target_freq``.
    '''

    def __init__(self, args):
        self._behavior_net = Net().to(args.device)
        self._target_net = Net().to(args.device)
        # initialize target network
        self._target_net.load_state_dict(self._behavior_net.state_dict())
        self._optimizer = torch.optim.Adam(self._behavior_net.parameters(),
                                           lr=args.lr)

        # memory
        self._memory = ReplayMemory(capacity=args.capacity)

        ## config ##
        self.device = args.device
        self.batch_size = args.batch_size
        self.gamma = args.gamma
        self.freq = args.freq
        self.target_freq = args.target_freq

    def select_action(self, state, epsilon, action_space):
        '''epsilon-greedy based on behavior network

        With probability ``epsilon`` returns ``action_space.sample()``;
        otherwise returns the index (Python int) of the largest Q-value the
        behavior network predicts for ``state``.
        '''
        if random.random() < epsilon:
            return action_space.sample()
        with torch.no_grad():
            # Convert here so the network always receives a tensor on the
            # agent's device, the same way the update path feeds it.
            state_tensor = torch.as_tensor(state, dtype=torch.float,
                                           device=self.device)
            q_values = self._behavior_net(state_tensor)
            return torch.argmax(q_values).item()

    def append(self, state, action, reward, next_state, done):
        '''store a transition in replay memory

        Scalars are wrapped in one-element lists because the memory requires
        iterable fields. The reward is stored divided by 10.
        '''
        self._memory.append(state, [action], [reward / 10], next_state,
                            [int(done)])

    def update(self, args, total_steps):
        '''run the periodic updates due at ``total_steps``

        The behavior network is trained every ``self.freq`` steps and the
        target network is synchronised every ``self.target_freq`` steps.
        '''
        if total_steps % self.freq == 0:
            self._update_behavior_network(args, self.gamma)
        if total_steps % self.target_freq == 0:
            self._update_target_network()

    def _update_behavior_network(self, args, gamma):
        '''one gradient step on the TD error of a sampled minibatch

        ``args`` is unused and kept only so existing callers keep working.
        Does nothing while the memory holds fewer than ``batch_size``
        transitions (sampling would otherwise raise ``ValueError``).
        '''
        if len(self._memory) < self.batch_size:
            return

        # sample a minibatch of transitions
        state, action, reward, next_state, done = self._memory.sample(
            self.batch_size, self.device)

        # Q(s, a) for the actions that were actually taken.
        q_value = self._behavior_net(state).gather(1, action.long())
        with torch.no_grad():
            # Bootstrapped target; (1 - done) removes the bootstrap term for
            # transitions flagged as terminal.
            q_next = torch.max(self._target_net(next_state), 1)[0].view(-1, 1)
            q_target = reward + q_next * gamma * (1.0 - done)

        loss = F.mse_loss(q_value, q_target)

        # optimize
        self._optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self._behavior_net.parameters(), 5)
        self._optimizer.step()

    def _update_target_network(self):
        '''update target network by copying from behavior network'''
        self._target_net.load_state_dict(self._behavior_net.state_dict())

    def save(self, model_path, checkpoint=False):
        '''write the behavior network to ``model_path``

        With ``checkpoint=True`` the target network and optimizer state are
        saved as well.
        '''
        if checkpoint:
            torch.save(
                {
                    'behavior_net': self._behavior_net.state_dict(),
                    'target_net': self._target_net.state_dict(),
                    'optimizer': self._optimizer.state_dict(),
                }, model_path)
        else:
            torch.save({
                'behavior_net': self._behavior_net.state_dict(),
            }, model_path)

    def load(self, model_path, checkpoint=False):
        '''restore weights written by ``save``

        Tensors are mapped onto ``self.device`` so a file saved on one
        device can be loaded on another. With ``checkpoint=True`` the target
        network and optimizer state are restored too.
        '''
        model = torch.load(model_path, map_location=self.device)
        self._behavior_net.load_state_dict(model['behavior_net'])
        if checkpoint:
            self._target_net.load_state_dict(model['target_net'])
            self._optimizer.load_state_dict(model['optimizer'])


def train(args, env, agent, writer):
    '''Run ``args.episode`` training episodes of ``agent`` on ``env``.

    While fewer than ``args.warmup`` steps have been taken, actions are
    sampled from the action space and no updates run; afterwards the agent
    picks actions epsilon-greedily, epsilon is multiplied by
    ``args.eps_decay`` after every agent-chosen action (floored at
    ``args.eps_min``), and ``agent.update`` is called each step. Episode and
    EWMA rewards are logged to ``writer`` and printed. ``env`` is closed at
    the end.
    '''
    print('Start Training')
    sampler = env.action_space
    epsilon = 1.
    total_steps = 0
    ewma_reward = 0
    for episode in range(args.episode):
        state = env.reset()
        total_reward = 0
        t = 0
        done = False
        while not done:
            t += 1
            warming_up = total_steps < args.warmup

            # select action
            if warming_up:
                action = sampler.sample()
            else:
                action = agent.select_action(state, epsilon, sampler)
                epsilon = max(epsilon * args.eps_decay, args.eps_min)

            # execute action and store the resulting transition
            next_state, reward, done, _ = env.step(action)
            agent.append(state, action, reward, next_state, done)
            if not warming_up:
                agent.update(args, total_steps)

            state = next_state
            total_reward += reward
            total_steps += 1

        # episode finished: update the running average and report
        ewma_reward = 0.05 * total_reward + (1 - 0.05) * ewma_reward
        writer.add_scalar('Train/Episode Reward', total_reward, total_steps)
        writer.add_scalar('Train/Ewma Reward', ewma_reward, total_steps)
        summary = 'Step: {}\tEpisode: {}\tLength: {:3d}\tTotal reward: {:.2f}\tEwma reward: {:.2f}\tEpsilon: {:.3f}'
        print(summary.format(total_steps, episode, t, total_reward,
                             ewma_reward, epsilon))
    env.close()





def test(args, env, agent, writer):
    print('Start Testing')
    action_space = env.action_space
    epsilon = args.test_epsilon
    seeds = (args.seed + i for i in range(10))
    rewards = []
    for n_episode, seed in enumerate(seeds):
        total_reward = 0
        env.seed(seed)
        state = env.reset()
        ## TODO ##
        for t in itertools.count(start=1):
            if args.render:
                env.render()

            # select action
            action = agent.select_action(state, epsilon, action_space)
            # execute action
            next_state, reward, done, _ = env.step(action)

            state = next_state
            total_reward += reward

            if done:
                rewards.append(total_reward)

                writer.add_scalar('Test/Episode Reward', total_reward, n_episode)
                print(
                    'Episode: {}\tLength: {:3d}\tTotal reward: {:.2f}\t'
                    .format(n_episode, t, total_reward))
                break

    print('Average Reward', np.mean(rewards))
    env.close()


def main():
    '''Parse options, then train (unless ``--test_only``) and evaluate a DQN agent.

    After training, the model is saved to ``--model``; it is then reloaded
    from that path and evaluated with ``test``. The TensorBoard writer is
    closed on exit, including when training or testing raises.
    '''
    ## arguments ##
    parser = argparse.ArgumentParser(description=__doc__)
    # Fall back to CPU when CUDA is not available instead of failing later.
    default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    parser.add_argument('-d', '--device', default=default_device)
    parser.add_argument('-m', '--model', default='dqn.pth')
    parser.add_argument('--logdir', default='log/dqn')
    # train
    parser.add_argument('--warmup', default=10000, type=int)
    parser.add_argument('--episode', default=1200, type=int)
    parser.add_argument('--capacity', default=10000, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--lr', default=.0005, type=float)
    parser.add_argument('--eps_decay', default=.995, type=float)
    parser.add_argument('--eps_min', default=.01, type=float)
    parser.add_argument('--gamma', default=.5, type=float)
    parser.add_argument('--freq', default=4, type=int)
    parser.add_argument('--target_freq', default=1000, type=int)
    # test
    parser.add_argument('--test_only', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--seed', default=20200519, type=int)
    parser.add_argument('--test_epsilon', default=.001, type=float)
    # NOTE(review): presumably present to absorb the `-f <file>` argument a
    # notebook kernel is started with; the value is never read.
    parser.add_argument('-f')
    args = parser.parse_args()

    ## main ##
    env = gym.make('LunarLander-v2')
    agent = DQN(args)
    writer = SummaryWriter(args.logdir)
    try:
        if not args.test_only:
            train(args, env, agent, writer)
            agent.save(args.model)
        agent.load(args.model)
        # NOTE(review): train() calls env.close() and the same env object is
        # reused here — confirm this is safe for the installed gym version.
        test(args, env, agent, writer)
    finally:
        # Flush pending scalar events to disk.
        writer.close()

# Entry point: run the full train-then-test pipeline when this code is
# executed directly (as opposed to being imported as a module).
if __name__ == '__main__':
    main()


  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


Start Training
Step: 83	Episode: 0	Length:  83	Total reward: -108.76	Ewma reward: -5.44	Epsilon: 1.000
Step: 181	Episode: 1	Length:  98	Total reward: -150.09	Ewma reward: -12.67	Epsilon: 1.000
Step: 289	Episode: 2	Length: 108	Total reward: -573.98	Ewma reward: -40.74	Epsilon: 1.000
Step: 354	Episode: 3	Length:  65	Total reward: -142.27	Ewma reward: -45.81	Epsilon: 1.000
Step: 445	Episode: 4	Length:  91	Total reward: -390.69	Ewma reward: -63.06	Epsilon: 1.000
Step: 505	Episode: 5	Length:  60	Total reward: -230.21	Ewma reward: -71.41	Epsilon: 1.000
Step: 615	Episode: 6	Length: 110	Total reward: -221.75	Ewma reward: -78.93	Epsilon: 1.000
Step: 734	Episode: 7	Length: 119	Total reward: -196.56	Ewma reward: -84.81	Epsilon: 1.000
Step: 853	Episode: 8	Length: 119	Total reward: -146.19	Ewma reward: -87.88	Epsilon: 1.000
Step: 1007	Episode: 9	Length: 154	Total reward: -178.87	Ewma reward: -92.43	Epsilon: 1.000
Step: 1102	Episode: 10	Length:  95	Total reward: -347.01	Ewma reward: -105.16	Epsilon:



Step: 10018	Episode: 107	Length:  94	Total reward: -293.89	Ewma reward: -204.47	Epsilon: 0.914
Step: 10120	Episode: 108	Length: 102	Total reward: -345.65	Ewma reward: -211.53	Epsilon: 0.548
Step: 10203	Episode: 109	Length:  83	Total reward: -521.95	Ewma reward: -227.05	Epsilon: 0.361
Step: 10268	Episode: 110	Length:  65	Total reward: -524.45	Ewma reward: -241.92	Epsilon: 0.261
Step: 10328	Episode: 111	Length:  60	Total reward: -286.07	Ewma reward: -244.13	Epsilon: 0.193
Step: 10406	Episode: 112	Length:  78	Total reward: -558.09	Ewma reward: -259.83	Epsilon: 0.131
Step: 10494	Episode: 113	Length:  88	Total reward: -258.46	Ewma reward: -259.76	Epsilon: 0.084
Step: 10576	Episode: 114	Length:  82	Total reward: -474.92	Ewma reward: -270.52	Epsilon: 0.056
Step: 10657	Episode: 115	Length:  81	Total reward: -374.13	Ewma reward: -275.70	Epsilon: 0.037
Step: 10766	Episode: 116	Length: 109	Total reward: -659.14	Ewma reward: -294.87	Epsilon: 0.022
Step: 10866	Episode: 117	Length: 100	Total reward:

  "Function `env.seed(seed)` is marked as deprecated and will be removed in the future. "


Episode: 0	Length: 327	Total reward: 240.85	
Episode: 1	Length: 218	Total reward: -91.16	
Episode: 2	Length: 233	Total reward: -168.66	
Episode: 3	Length: 299	Total reward: 258.90	
Episode: 4	Length: 261	Total reward: 274.13	
Episode: 5	Length: 189	Total reward: -125.87	
Episode: 6	Length: 266	Total reward: 277.40	
Episode: 7	Length: 285	Total reward: 221.21	
Episode: 8	Length: 227	Total reward: -181.87	
Episode: 9	Length: 258	Total reward: -34.84	
Average Reward 67.00805509894684


In [None]:
# Open TensorBoard inline on the scalars written during training/testing.
# NOTE(review): this path is absolute, while main() defaults --logdir to the
# relative 'log/dqn'; they coincide only when the working directory is /content.
%load_ext tensorboard
%tensorboard --logdir=/content/log/dqn