[medium](https://medium.com/@jonathan_hui/rl-dqn-deep-q-network-e207751f7ae4) <br>
[github](https://github.com/udacity/deep-reinforcement-learning/tree/master/dqn) <br>
[Pytorch DQN Example](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)<br>
[paper dqn](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)<br>
[paper Double DQN](https://arxiv.org/pdf/1509.06461.pdf)<br>

TODO
- reward clipping ???? see [here](https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py) when end episode<br>
- error clipping <br>
- Prioritized Experience Replay

In [1]:
%reload_ext autoreload
%autoreload 2
import torch
import torch.nn.functional as F 
import random
import numpy as np
from EXITrl.approx_v_base import ApproxVBase
from EXITrl.approx_policy_base import ApproxPolicyBase
from EXITrl.base import Base
from EXITrl.helpers import update_params, ExperienceReplay, WeightDecay, device
from EXITrl.nn_wrapper import NNWrapper
import gym

In [2]:
class QNetwork(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.linear1 = torch.nn.Linear(input_size, hidden_size)
        self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
        self.linear3 = torch.nn.Linear(hidden_size, output_size)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        return self.linear3(x)

In [29]:
class DQN(Base):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.local_q_network = NNWrapper(
            QNetwork(self.num_state, 32, self.num_action),
            lr=self.alpha
        )
        self.target_q_network = NNWrapper(
            QNetwork(self.num_state, 32, self.num_action),
            lr=0 # use manual update
        )
        update_params(self.local_q_network.model, self.target_q_network.model, tau=0)
        self.num_step = 0
    
    def initialize(self, num_step_to_learn, eps_start, eps_end, eps_decay, num_experience, num_recall, skip_frame=1):
        self.num_step_to_learn= num_step_to_learn
        self.epsilon_decay = WeightDecay(eps_start, eps_end, eps_decay)
        self.epsilon = self.epsilon_decay.step()
        self.experience_replay = ExperienceReplay(num_experience=num_experience, num_recall=num_recall)
        self.skip_frame = skip_frame
        
    def policy(self, state):
        return self.local_q_network.epsilon_greedy(state, self.epsilon)
    
    def learn(self, state, action, reward, next_state, done):
        # detach because we only backprop local network and update target network weight manually
        targets_next_Q = self.target_q_network.forward(next_state).detach().max(1)[0]
        targets_Q = reward + (self.gamma * targets_next_Q * (1 - done))

        local_Q = self.local_q_network.forward(state)
        expected_Q = local_Q.gather(1, action.unsqueeze(1).long()).squeeze(1) # select Q from action

        # Huber loss
        loss = F.smooth_l1_loss(expected_Q, targets_Q) # loss = F.mse_loss(expected_Q, targets_Q)
        self.local_q_network.backprop(loss)

        update_params(self.local_q_network.model, self.target_q_network.model, self.tau)

    def _loop(self, episode) -> int:
        total_reward = 0
        state = self.env.reset()
        for i in range(1000):
            action = self.policy(state)
            for _ in range(self.skip_frame):
                _state, reward, done, _ = self.env.step(action)
                if done: break
            self.experience_replay.remember(state, action, reward, _state, done)
            
            self.num_step += 1
            if self.num_step%self.num_step_to_learn == 0:
                experiences = self.experience_replay.recall()
                self.learn(*experiences)
            state = _state
            
            total_reward += reward
            self.additional_log['num_step'] = self.num_step
            self.additional_log['epsilon'] = self.epsilon_decay.val
            if done: break
        self.epsilon = self.epsilon_decay.step()
        return total_reward
    
    def _save(self, reward):
        torch.save(self.local_q_network.model.state_dict(), self.save_name)
    def _load(self):
        self.epsilon = 0
        self.local_q_network.model.load_state_dict(torch.load(self.save_name, map_location=device))
        self.local_q_network.model.eval()


### LunarLander

In [37]:
try: env.close()
except: pass
env = gym.make('LunarLander-v2')
dqn = DQN(env, 
      num_mean_episode=100,
      num_episodes=2000,
      alpha=5e-4, 
      gamma=.99,
      tau=5e-4,
      save_name="checkpoint/LunarLander-v2-DQN.pth")
dqn.initialize(num_step_to_learn=4, 
               eps_start=1, 
               eps_end=.01, 
               eps_decay=.995, 
               num_experience=2048, 
               num_recall=512,
               skip_frame=1)
dqn.local_q_network = NNWrapper(
    QNetwork(dqn.num_state, 16, dqn.num_action),
    lr=dqn.alpha
)
dqn.target_q_network = NNWrapper(
    QNetwork(dqn.num_state, 16, dqn.num_action),
    lr=0 # use manual update
)
update_params(dqn.local_q_network.model, dqn.target_q_network.model, tau=0)
dqn.train(early_stop=lambda mean_reward: mean_reward>200)
# dqn.play()

Episode 100	Average Score: -140.11 	other{'num_step': 10639, 'epsilon': 0.6057704364907278}
Episode 200	Average Score: -73.93 	other{'num_step': 34613, 'epsilon': 0.3669578217261671}}}
Episode 300	Average Score: -43.96 	other{'num_step': 68979, 'epsilon': 0.22229219984074702}
Episode 400	Average Score: 16.58 	other{'num_step': 134759, 'epsilon': 0.1346580429260134}}
Episode 500	Average Score: 58.98 	other{'num_step': 185825, 'epsilon': 0.08157186144027828}
Episode 600	Average Score: 65.38 	other{'num_step': 226329, 'epsilon': 0.0494138221100385}}}
Episode 700	Average Score: 14.36 	other{'num_step': 256637, 'epsilon': 0.029933432588273214}
Episode 800	Average Score: 26.68 	other{'num_step': 297934, 'epsilon': 0.018132788524664028}}
Episode 900	Average Score: 85.89 	other{'num_step': 331827, 'epsilon': 0.01098430721937979}}
Episode 1000	Average Score: 157.66 	other{'num_step': 374399, 'epsilon': 0.01}6634861955105}
--- early stop ----e Score: 200.69 	other{'num_step': 393237, 'epsilon': 

### Skip frame

In [143]:
try: env.close()
except: pass
env = gym.make('LunarLander-v2')
dqn = DQN(env, 
      num_mean_episode=100,
      num_episodes=2000,
      alpha=5e-4, 
      gamma=.95,
      tau=1e-3,
      save_name="checkpoint/LunarLander-v2-DQN.pth")
dqn.initialize(num_step_to_learn=4, 
               eps_start=1, 
               eps_end=.01, 
               eps_decay=.995, 
               num_experience=2048, 
               num_recall=32,
               skip_frame=3)
dqn.train(early_stop=lambda mean_reward: mean_reward>200)
# dqn.play()

Episode 100	Average Score: -148.99
Episode 200	Average Score: -142.11
Episode 300	Average Score: -149.07
Episode 400	Average Score: -150.33
Episode 500	Average Score: -166.24
Episode 600	Average Score: -175.00
Episode 700	Average Score: -155.82
Episode 800	Average Score: -142.49
Episode 900	Average Score: -95.593
Episode 1000	Average Score: -10.25
Episode 1100	Average Score: 32.34
Episode 1200	Average Score: 36.93
Episode 1300	Average Score: 31.62
Episode 1400	Average Score: 53.91
Episode 1500	Average Score: 56.46
Episode 1600	Average Score: 70.57
Episode 1700	Average Score: 54.78
Episode 1800	Average Score: 103.50
Episode 1900	Average Score: 102.85
Episode 2000	Average Score: 85.092


### Breakout

In [19]:
try: env.close()
except: pass
env = gym.make('Breakout-ram-v0')
dqn = DQN(env, 
      num_mean_episode=100,
      num_episodes=int(1e6),
      alpha=5e-4, 
      gamma=.99,
      tau=1,
      save_name="checkpoint/Breakout-ram-v0-DQN.pth")
dqn.initialize(num_step_to_learn=3000, 
               eps_start=1, 
               eps_end=.01, 
               eps_decay=3e-4, 
               num_experience=20480, 
               num_recall=1024,
               skip_frame=1)
dqn.train(early_stop=lambda mean_reward: mean_reward>200)
# dqn.play()

Episode 100	Average Score: 1.39 	other{'num_step': 24992, 'epsilon': 0.9700000000000033}
Episode 200	Average Score: 1.23 	other{'num_step': 49233, 'epsilon': 0.9400000000000066}
Episode 300	Average Score: 1.44 	other{'num_step': 74584, 'epsilon': 0.9100000000000099}
Episode 400	Average Score: 1.35 	other{'num_step': 99487, 'epsilon': 0.8800000000000132}
Episode 500	Average Score: 1.46 	other{'num_step': 124849, 'epsilon': 0.8500000000000165}
Episode 600	Average Score: 1.64 	other{'num_step': 151291, 'epsilon': 0.8200000000000198}
Episode 700	Average Score: 1.25 	other{'num_step': 175878, 'epsilon': 0.7900000000000231}
Episode 800	Average Score: 1.51 	other{'num_step': 201760, 'epsilon': 0.7600000000000264}
Episode 900	Average Score: 1.49 	other{'num_step': 227716, 'epsilon': 0.7300000000000297}
Episode 1000	Average Score: 1.23 	other{'num_step': 251963, 'epsilon': 0.700000000000033}
Episode 1100	Average Score: 1.07 	other{'num_step': 275928, 'epsilon': 0.6700000000000363}
Episode 1200	

KeyboardInterrupt: 

In [None]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)
        
    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

In [46]:
try: env.close()
except: pass
env = gym.make('Breakout-v0')
dqn = DQN(env, 
      num_mean_episode=100,
      num_episodes=int(1e6),
      alpha=5e-4, 
      gamma=.99,
      tau=1,
      save_name="checkpoint/Breakout-v0-DQN.pth")
dqn.initialize(num_step_to_learn=3000, 
               eps_start=1, 
               eps_end=.01, 
               eps_decay=3e-4, 
               num_experience=20480, 
               num_recall=1024,
               skip_frame=1)
dqn.train(early_stop=lambda mean_reward: mean_reward>200)
# dqn.play()

input: torch.Size([210, 160, 3])


RuntimeError: size mismatch, m1: [33600 x 3], m2: [210 x 32] at /opt/conda/conda-bld/pytorch_1549628766161/work/aten/src/THC/generic/THCTensorMathBlas.cu:266

### Pong

In [21]:
try: env.close()
except: pass
env = gym.make('Pong-ram-v0')
dqn = DQN(env, 
      num_mean_episode=100,
      num_episodes=int(1e6),
      alpha=5e-3, 
      gamma=.99,
      tau=1,
      save_name="checkpoint/Pong-ram-v0-DQN.pth")
dqn.initialize(num_step_to_learn=3000, 
               eps_start=1, 
               eps_end=.01, 
               eps_decay=3e-4, 
               num_experience=204800, 
               num_recall=64,
               skip_frame=1)
dqn.train(early_stop=lambda mean_reward: mean_reward>200)
# dqn.play()

Episode 27	Average Score: -16.78 	other{'num_step': 27000, 'epsilon': 0.9919000000000009}

KeyboardInterrupt: 