#### Implementation of Dueling DQN paper for 1-dimensional games, such as Cartpole.
- https://arxiv.org/abs/1511.06581

<br>
    
    The detailed implementations of the Q-network, the state representation, and the ReplayBuffer differ from the original paper, because this notebook aims to solve a "simple 1-dimensional" game (one whose state is a 1-dimensional vector, such as CartPole) rather than an image-based Atari game.
    Please see the notebook named "Dueling_DQN_2dim" for a more rigorous implementation of the paper.

#### Please NOTE,
    The code lines that differ from Vanilla DQN are annotated with '*/*/*/'.
    So, by searching for '*/*/*/', you can find these lines.
    
    [What is the difference?]
    In Dueling DQN, the architecture of the Q-network is different from that of Vanilla DQN.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim 
import torch.nn.functional as F 

import gym
import numpy as np
import time
import os
import cv2
import matplotlib.pyplot as plt
from IPython.display import clear_output

#### Configurations
    NOTE: In this notebook, the Prioritized Replay Buffer and the rescaling of the gradient entering the last convolutional layer (multiplying by 1/√2) are not implemented.
<br>

![image](https://drive.google.com/uc?id=1-B0zM5w1I6Sfu5dn7ir_FAPKvoC0Y-z-)
![image](https://drive.google.com/uc?id=1pXP-xIA9QU1vWUnCl6wRivMBel1ZHwCq)

In [3]:
class QNetwork(nn.Module):
    """Dueling Q-network made only of fully connected layers.

    A shared linear layer feeds two separate streams: a state-value stream
    (``V1`` -> ``V2``, one output) and an advantage stream (``A1`` -> ``A2``,
    one output per action). Because it uses linear layers only, the
    architecture differs from the convolutional model of the DQN paper.

    Args:
        input_feature: Size of the last dimension of the input state.
        action_dim: Number of actions, i.e. size of the network output.
    """

    def __init__(self, input_feature: int, action_dim: int):
        super().__init__()
        self.action_dim = action_dim

        # Shared feature layer used by both streams.
        self.linear1 = nn.Linear(input_feature, 256)

        # */*/*/
        # Two streams on top of the shared features: value (V*) and advantage (A*).
        self.V1 = nn.Linear(256, 128)
        self.V2 = nn.Linear(128, 1)
        self.A1 = nn.Linear(256, 128)
        self.A2 = nn.Linear(128, action_dim)
        # */*/*/
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return Q-values with the same leading dimensions as ``x`` and
        ``action_dim`` entries along the last dimension."""
        features = self.relu(self.linear1(x))

        # */*/*/
        # Each stream processes the same shared features independently.
        value = self.V2(self.relu(self.V1(features)))
        advantage = self.A2(self.relu(self.A1(features)))
        # */*/*/
        # Combine the streams, subtracting the mean advantage over the action
        # dimension; the single value output broadcasts across all actions.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)
    
if __name__ == '__main__':
    # Smoke test: build a network and push one random batch through it to
    # confirm that the output has one Q-value per action for every sample.
    input_feature, action_dim = 4, 10
    net = QNetwork(input_feature, action_dim)
    test = torch.randn(64, input_feature)
    print(net)
    print("Network output: ", net(test).shape)

QNetwork(
  (linear1): Linear(in_features=4, out_features=256, bias=True)
  (V1): Linear(in_features=256, out_features=128, bias=True)
  (V2): Linear(in_features=128, out_features=1, bias=True)
  (A1): Linear(in_features=256, out_features=128, bias=True)
  (A2): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
)
Network output:  torch.Size([64, 10])


In [4]:
class ReplayBuffer:
    """Fixed-size circular store of transitions (experience replay as in DQN).

    Transitions are written into pre-allocated NumPy arrays. Once
    ``buffer_size`` transitions have been stored, new ones overwrite the
    oldest entries.

    Args:
        buffer_size: Maximum number of transitions kept in the buffer.
        input_dim: Length of a single state vector.
        batch_size: Number of transitions returned by ``batch_load``.
    """

    def __init__(self, buffer_size: int, input_dim: int, batch_size: int):
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.save_count = 0    # index of the slot the next transition is written to
        self.current_size = 0  # number of valid transitions currently stored

        # Either np.zeros or np.ones would work here. np.ones is used so that
        # the total memory occupancy of the buffer can be checked.
        state_shape = (buffer_size, input_dim)
        self.state_buffer = np.ones(state_shape, dtype=np.float32)
        self.next_state_buffer = np.ones(state_shape, dtype=np.float32)
        self.action_buffer = np.ones(buffer_size, dtype=np.uint8)
        self.reward_buffer = np.ones(buffer_size, dtype=np.float32)
        self.done_buffer = np.ones(buffer_size, dtype=np.uint8)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return self.current_size

    def store(self,
              state: np.ndarray,
              action: int,
              reward: float,
              next_state: np.ndarray,
              done: int):
        """Write one transition into the current slot and advance the cursor."""
        slot = self.save_count
        self.state_buffer[slot] = state
        self.action_buffer[slot] = action
        self.reward_buffer[slot] = reward
        self.next_state_buffer[slot] = next_state
        self.done_buffer[slot] = done

        # Move the write cursor forward, wrapping around at the end of the buffer.
        self.save_count = (slot + 1) % self.buffer_size
        # The fill level grows until the buffer is full and then stays constant.
        if self.current_size < self.buffer_size:
            self.current_size += 1

    def batch_load(self):
        """Return a dict with ``batch_size`` randomly chosen transitions.

        Indices are drawn uniformly from the filled part of the buffer with
        ``np.random.randint``, so the same transition may appear more than
        once in a batch.
        """
        indices = np.random.randint(self.current_size, size=self.batch_size)
        fields = (
            ('states', self.state_buffer),
            ('actions', self.action_buffer),
            ('rewards', self.reward_buffer),
            ('next_states', self.next_state_buffer),
            ('dones', self.done_buffer),
        )
        return {key: storage[indices] for key, storage in fields}


In [5]:
class Agent:
    """Dueling DQN agent for environments with 1-dimensional (vector) states.

    The agent owns a behavior network and a target network (both ``QNetwork``),
    an Adam optimizer for the behavior network and a ``ReplayBuffer``. It acts
    epsilon-greedily, stores every transition and, during training, performs
    one gradient step per environment step.

    The environment is used the way this class calls it: ``env.reset()``
    returns the initial state, ``env.step(action)`` returns
    ``(next_state, reward, done, info)``, and ``env.action_space`` provides
    ``n`` and ``sample()``.
    """

    def __init__(self,
                 env: 'Environment',
                 input_dim: int,
                 training_frames: int,
                 eps_decay: float,
                 gamma: float,
                 target_update_freq: int,
                 update_type: str = 'hard',
                 soft_update_tau: 'float | None' = None,
                 batch_size: int = 32,
                 buffer_size: int = 1000000,
                 update_start_buffer_size: int = 50000,
                 learning_rate: float = 0.0004,
                 eps_min: float = 0.1,
                 eps_max: float = 1.0,
                 device_num: int = 0,
                 rand_seed: 'int | None' = None,
                 plot_option: 'str | bool' = False,
                 model_path: str = './',
                 trained_model_path: str = ''):
        """Build the networks, the optimizer and the replay buffer.

        Args:
            env: Environment to interact with (see the class docstring).
            input_dim: Length of the state vector; passed to ``QNetwork`` as
                the input size and to ``ReplayBuffer`` as the state size.
            training_frames: Number of environment steps taken in the
                learning phase of ``train``.
            eps_decay: Amount subtracted from epsilon after every training step.
            gamma: Discount factor.
            target_update_freq: Number of training steps between two hard
                updates of the target network.
            update_type: ``'hard'`` or ``'soft'`` target update. Any other
                value means the target network is never updated by ``train``.
            soft_update_tau: Mixing ratio of the soft update; must be a number
                when ``update_type`` is ``'soft'``.
            batch_size: Number of transitions per gradient step.
            buffer_size: Capacity of the replay buffer.
            update_start_buffer_size: Number of transitions collected before
                learning starts.
            learning_rate: Learning rate of the Adam optimizer.
            eps_min: Lower bound of epsilon.
            eps_max: Initial value of epsilon.
            device_num: CUDA device index, used only if CUDA is available.
            rand_seed: Stored as ``self.seed``; not used elsewhere in this class.
            plot_option: ``'inline'`` plots training curves after every
                episode; any other value prints the episode score instead.
            model_path: Prefix (directory) for saved models and histories.
            trained_model_path: Path of a saved ``state_dict`` to start from;
                an empty string starts from a freshly initialised network.
        """
        self.action_dim = env.action_space.n
        self.device = torch.device(f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path

        self.env = env
        self.input_dim = input_dim
        self.training_frames = training_frames
        self.epsilon = eps_max
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.gamma = gamma
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option

        self.q_behave = QNetwork(self.input_dim, self.action_dim).to(self.device)
        self.q_target = QNetwork(self.input_dim, self.action_dim).to(self.device)
        if trained_model_path:  # load a trained model if one is given
            # map_location lets a checkpoint saved on a GPU be loaded on a
            # CPU-only machine (and vice versa).
            self.q_behave.load_state_dict(
                torch.load(trained_model_path, map_location=self.device))
            print("Trained model is loaded successfully.")

        # Initialize target network parameters with behavior network parameters
        self.q_target.load_state_dict(self.q_behave.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_behave.parameters(), lr=learning_rate)

        self.memory = ReplayBuffer(self.buffer_size, self.input_dim, self.batch_size)

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: Current state. It must be pre-processed in the same way as
                the states used to update the behavior network (see
                ``_compute_loss``).

        Returns:
            ``(q_values, action)``. For a random action ``q_values`` is an
            all-zero placeholder array of length ``action_dim``. The Q-values
            are returned only for inspection; the algorithm does not need them.
        """
        if np.random.random() < self.epsilon:
            return np.zeros(self.action_dim), self.env.action_space.sample()

        # No gradients are needed for acting, which makes the forward pass cheaper.
        with torch.no_grad():
            state = torch.FloatTensor(state).to(self.device)
            Qs = self.q_behave(state)
            # Greedy action: index of the maximum Q-value.
            action = Qs.argmax()

        return Qs.detach().cpu().numpy(), action.detach().item()

    def get_init_state(self):
        """Reset the environment and return the initial state."""
        init_state = self.env.reset()
        # Loop for a random initial starting point; range(0) means the episode
        # always starts from the state returned by reset().
        for _ in range(0):
            action = self.env.action_space.sample()
            init_state, _, _, _ = self.env.step(action)
        return init_state

    def get_state(self, state, action):
        """Apply ``action`` to the environment.

        ``state`` is accepted for interface symmetry but is not used.

        Returns:
            ``(reward, next_state, done)``.
        """
        next_state, reward, done, _ = self.env.step(action)
        return reward, next_state, done

    def store(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store(state, action, reward, next_state, done)

    def update_behavior_q_net(self):
        """Run one gradient step on a sampled batch and return the loss value."""
        batch = self.memory.batch_load()
        loss = self._compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def target_soft_update(self):
        """Soft update: move every target parameter towards the behavior
        parameter by the ratio ``tau``."""
        for target_param, current_param in zip(self.q_target.parameters(), self.q_behave.parameters()):
            target_param.data.copy_(self.tau*current_param.data + (1.0-self.tau)*target_param.data)

    def target_hard_update(self):
        """Hard update: copy the behavior network into the target network on
        every ``target_update_freq``-th call."""
        self.update_cnt = (self.update_cnt+1) % self.target_update_freq
        if self.update_cnt == 0:
            self.q_target.load_state_dict(self.q_behave.state_dict())

    def train(self):
        """Fill the replay buffer, then learn for ``training_frames`` steps.

        Whenever the mean of the (up to) 10 most recent episode scores exceeds
        the best mean seen so far, the behavior network's ``state_dict`` and
        the history of the episode that just ended are written under
        ``model_path``. The history is saved as an object array, so reading it
        back requires ``np.load(..., allow_pickle=True)``.
        """
        tic = time.time()
        losses = []
        scores = []
        epsilons = []
        # Arbitrary low initial score so that the first episode counts as an
        # improvement. Kept as a flat list so that max() compares numbers.
        avg_scores = [-10000]

        score = 0

        print("Storing initial buffer..")
        state = self.get_init_state()
        for frame_idx in range(1, self.update_start+1):
            # Collect transitions (without learning) until 'self.update_start'
            # transitions are stored.
            _, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action)
            self.store(state, action, reward, next_state, done)
            state = self.get_init_state() if done else next_state

        print("Done. Start learning..")
        history_store = []
        for frame_idx in range(1, self.training_frames+1):
            Qs, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action)
            self.store(state, action, reward, next_state, done)
            # history_store only serves to inspect an episode later.
            history_store.append([state, Qs, action, reward, next_state, done])
            loss = self.update_behavior_q_net()

            if self.update_type == 'hard':
                self.target_hard_update()
            elif self.update_type == 'soft':
                self.target_soft_update()

            score += reward
            losses.append(loss)

            if done:
                # Book-keeping, saving and plotting at the end of an episode.
                # The score is recorded exactly once per episode.
                scores.append(score)
                recent_mean = np.mean(scores[-10:])
                if recent_mean > max(avg_scores):
                    torch.save(self.q_behave.state_dict(), self.model_path+'{}_Score:{}.pt'.format(frame_idx, recent_mean))
                    training_time = round((time.time()-tic)/3600, 1)
                    # Rows mix arrays of different lengths with scalars, so an
                    # explicit object dtype is needed to build the array.
                    np.save(self.model_path+'{}_history_Score_{}_{}hrs.npy'.format(frame_idx, score, training_time),
                            np.array(history_store, dtype=object))
                    print("          | Model saved. Recent scores: {}, Training time: {}hrs".format(scores[-10:], training_time), ' /'.join(os.getcwd().split('/')[-3:]))
                avg_scores.append(recent_mean)

                if self.plot_option == 'inline':
                    epsilons.append(self.epsilon)
                    self._plot(frame_idx, scores, losses, epsilons)
                else:
                    print(score, end='\r')

                score = 0
                state = self.get_init_state()
                history_store = []
            else:
                state = next_state

            self._epsilon_step()

        print("Total training time: {}(hrs)".format((time.time()-tic)/3600))

    def _epsilon_step(self):
        """Decrease epsilon by ``eps_decay`` without going below ``eps_min``."""
        self.epsilon = max(self.epsilon-self.eps_decay, self.eps_min)

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        """Compute the smooth L1 loss between Q(s, a) and the one-step target.

        Args:
            batch: Dict with the keys ``states``, ``actions``, ``rewards``,
                ``next_states`` and ``dones`` as produced by
                ``ReplayBuffer.batch_load``.
        """
        states = torch.FloatTensor(batch['states']).to(self.device)
        next_states = torch.FloatTensor(batch['next_states']).to(self.device)
        actions = torch.LongTensor(batch['actions'].reshape(-1, 1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1, 1)).to(self.device)

        # Q-value of the action that was actually taken in each transition.
        current_q = self.q_behave(states).gather(1, actions)

        # Target value: reward plus the discounted maximum target-network
        # Q-value of the next state. The mask removes the bootstrap term for
        # transitions flagged as done.
        next_q = self.q_target(next_states).max(dim=1, keepdim=True)[0].detach()
        mask = 1 - dones
        target = rewards + (mask * self.gamma * next_q)

        # Use smooth l1 loss for clipping loss between -1 to 1 as in DQN paper.
        return F.smooth_l1_loss(current_q, target)

    def _plot(self, frame_idx, scores, losses, epsilons):
        """Redraw the score, loss and epsilon curves in the notebook output."""
        clear_output(True)
        plt.figure(figsize=(20, 5), facecolor='w')
        plt.subplot(131)
        plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
        plt.plot(scores)
        plt.subplot(132)
        plt.title('loss')
        plt.plot(losses)
        plt.subplot(133)
        plt.title('epsilons')
        plt.plot(epsilons)
        plt.show()

In [6]:
# Environments this notebook can be pointed at; pick one by its key below.
env_list = {
    0: "CartPole-v0",
    1: "CartPole-v1",  # CartPole is registered as v0 and v1 only
    2: "LunarLander-v2",
}

env_name = env_list[0]
env = gym.make(env_name)

# Length of the environment's observation vector; used as the network input size.
input_dim = env.observation_space.shape[0]
print("env_name", env_name)

# --- Training schedule ---
update_start_buffer_size = 200  # transitions collected before learning starts
training_frames = 20000         # environment steps taken while learning
eps_max = 1.0                   # initial epsilon
eps_min = 0.1                   # lower bound of epsilon
eps_decay = 1/2000              # amount subtracted from epsilon per training step
gamma = 0.99                    # discount factor

# --- Replay buffer and optimisation ---
buffer_size = int(2e3)
batch_size = 32
update_type = 'hard'            # 'hard' or 'soft' target-network update
soft_update_tau = 0.002         # only used when update_type == 'soft'
learning_rate = 0.001
target_update_freq = 100        # training steps between two hard target updates

device_num = 0
rand_seed = None
rand_name = ('').join(map(str, np.random.randint(10, size=(3,))))
folder_name = os.getcwd().split('/')[-1]

# --- Output location ---
model_name = 'Test'
model_save_path = f'./model_save/{model_name}/'
# Creates './model_save/' and the model sub-directory when they are missing.
os.makedirs(model_save_path, exist_ok=True)
print("model_save_path:", model_save_path)

# Path of a saved state_dict to continue from; empty means train from scratch.
trained_model_path = ''

plot_options = {1: 'inline', 2: False}
plot_option = plot_options[2]

env_name CartPole-v0
model_save_path: ./model_save/Test/


In [None]:
# Build the agent from the configuration cell. Every argument is passed by
# keyword so that each value is visibly bound to the parameter it configures.
agent = Agent(
    env=env,
    input_dim=input_dim,
    training_frames=training_frames,
    eps_decay=eps_decay,
    gamma=gamma,
    target_update_freq=target_update_freq,
    update_type=update_type,
    soft_update_tau=soft_update_tau,
    batch_size=batch_size,
    buffer_size=buffer_size,
    update_start_buffer_size=update_start_buffer_size,
    learning_rate=learning_rate,
    eps_min=eps_min,
    eps_max=eps_max,
    device_num=device_num,
    rand_seed=rand_seed,
    plot_option=plot_option,
    model_path=model_save_path,
    trained_model_path=trained_model_path,
)

# Fill the replay buffer and then run the learning loop.
agent.train()

#### An example of results

    Storing initial buffer..
    Done. Start learning..
              | Model saved. Recent scores: [26.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [26.0, 25.0, 16.0, 35.0, 12.0, 52.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [52.0, 18.0, 19.0, 35.0, 39.0, 19.0, 17.0, 12.0, 34.0, 145.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [18.0, 19.0, 35.0, 39.0, 19.0, 17.0, 12.0, 34.0, 145.0, 113.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [19.0, 35.0, 39.0, 19.0, 17.0, 12.0, 34.0, 145.0, 113.0, 35.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [35.0, 39.0, 19.0, 17.0, 12.0, 34.0, 145.0, 113.0, 35.0, 65.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [39.0, 19.0, 17.0, 12.0, 34.0, 145.0, 113.0, 35.0, 65.0, 45.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [19.0, 17.0, 12.0, 34.0, 145.0, 113.0, 35.0, 65.0, 45.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [17.0, 12.0, 34.0, 145.0, 113.0, 35.0, 65.0, 45.0, 200.0, 140.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [12.0, 34.0, 145.0, 113.0, 35.0, 65.0, 45.0, 200.0, 140.0, 193.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [34.0, 145.0, 113.0, 35.0, 65.0, 45.0, 200.0, 140.0, 193.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [145.0, 113.0, 35.0, 65.0, 45.0, 200.0, 140.0, 193.0, 200.0, 139.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [113.0, 35.0, 65.0, 45.0, 200.0, 140.0, 193.0, 200.0, 139.0, 188.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [35.0, 65.0, 45.0, 200.0, 140.0, 193.0, 200.0, 139.0, 188.0, 124.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [65.0, 45.0, 200.0, 140.0, 193.0, 200.0, 139.0, 188.0, 124.0, 125.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [45.0, 200.0, 140.0, 193.0, 200.0, 139.0, 188.0, 124.0, 125.0, 179.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 140.0, 193.0, 200.0, 139.0, 188.0, 124.0, 125.0, 179.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [193.0, 200.0, 139.0, 188.0, 124.0, 125.0, 179.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 139.0, 188.0, 124.0, 125.0, 179.0, 200.0, 200.0, 200.0, 195.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [188.0, 124.0, 125.0, 179.0, 200.0, 200.0, 200.0, 195.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [124.0, 125.0, 179.0, 200.0, 200.0, 200.0, 195.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [125.0, 179.0, 200.0, 200.0, 200.0, 195.0, 200.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [179.0, 200.0, 200.0, 200.0, 195.0, 200.0, 200.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [200.0, 200.0, 200.0, 195.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN