### Build Dueling Deep Q-Network

In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import gym

In [2]:
# Per-experiment training results; each key maps an experiment label to its
# list of episode scores (filled in once training has finished).
information = dict([("DuelingDQN-LunarLander", list())])

### Replay Buffer

In [3]:
class ReplayBuffer(object):
    """Fixed-size ring buffer of (state, action, reward, next state, done) transitions.

    Storage is pre-allocated as NumPy arrays. Once ``max_size`` transitions
    have been stored, new transitions overwrite the oldest slots.
    """

    def __init__(self, max_size, input_shape, n_actions):
        """Allocate storage for ``max_size`` transitions.

        Args:
            max_size: Number of transitions the buffer can hold.
            input_shape: Iterable with the shape of a single state.
            n_actions: Accepted for interface compatibility; not used here
                because actions are stored as integer indices.
        """
        self.mem_size = max_size
        # Total number of transitions ever stored (not capped at mem_size).
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # ``np.bool`` was a deprecated alias that some NumPy releases removed;
        # ``np.bool_`` is the boolean scalar type available in every release.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of NumPy
            arrays, each with ``batch_size`` rows.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``batch_size``
                exceeds the number of transitions currently stored.
        """
        # Only slots that have actually been written are eligible.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

### Build Dueling DQN

In [4]:
class DuelingDQN(nn.Module):
    """Two-hidden-layer MLP with separate state-value and advantage heads.

    The module owns its optimizer (Adam), its loss (MSE) and the device it
    lives on, so callers can reach them as attributes of the network.
    """

    def __init__(self, ALPHA, n_actions, input_dims):
        """Build the layers, the optimizer and move the module to its device.

        Args:
            ALPHA: Learning rate passed to Adam.
            n_actions: Width of the advantage head.
            input_dims: Sequence unpacked into the first layer's input size.
        """
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims, self.fc2_dims = 256, 256
        self.n_actions = n_actions

        # Shared trunk.
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        # Heads: a single state-value output and one advantage per action.
        self.V = nn.Linear(self.fc2_dims, 1)
        self.A = nn.Linear(self.fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=ALPHA)
        self.loss = nn.MSELoss()

        # Use the first CUDA device when one is available, otherwise the CPU.
        if T.cuda.is_available():
            device_name = 'cuda:0'
        else:
            device_name = 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return ``(V, A)``: the value-head and advantage-head outputs for ``state``."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        value = self.V(hidden)
        advantage = self.A(hidden)
        return value, advantage

### Build Dueling DQN Agent

In [5]:
class AgentDuelingDQN():
    def __init__(self, gamma, epsilon, alpha, input_dims, batch_size, n_actions,
                max_mem_size = 100000, eps_end = 0.01, eps_dec = 5e-7, replace=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.replace_target_cntr = replace
        self.batch_size = batch_size
        
        self.memory = ReplayBuffer(max_mem_size, input_dims, n_actions)
        
        self.Q_eval = DuelingDQN(alpha, n_actions=n_actions, input_dims=input_dims)
        self.Q_next = DuelingDQN(alpha, n_actions=n_actions, input_dims=input_dims)
        
    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)
        
    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            observation = observation[np.newaxis, :]
            state = T.tensor([observation]).to(self.Q_eval.device)
            _, advantage = self.Q_eval.forward(state)
            action = T.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)
            
        return action
    
    def replace_target_network(self):
        if self.replace_target_cntr is not None and self.learn_step_counter % self.replace_target_cntr == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())
            
    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
        
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        self.Q_eval.optimizer.zero_grad()
        self.replace_target_network()
        
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        
        state = T.tensor(state).to(self.Q_eval.device)
        new_state = T.tensor(new_state).to(self.Q_eval.device)
        action = T.tensor(action).to(self.Q_eval.device)
        reward = T.tensor(reward).to(self.Q_eval.device)
        dones = T.tensor(done).to(self.Q_eval.device)
        
        V_s, A_s = self.Q_eval.forward(state)
        V_s_, A_s_ = self.Q_eval.forward(new_state)
        
        q_pred = T.add(V_s, (A_s - A_s.mean(dim=1, keepdim=True))).gather(1,action.unsqueeze(-1)).squeeze(-1)
        q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True)))
        
        q_target = reward + self.gamma*T.max(q_next, dim=1)[0].detach()
        q_target[dones] = 0.0
        
        loss = self.Q_eval.loss(q_target, q_pred).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimizer.step()
        self.learn_step_counter += 1
        
        self.decrement_epsilon()

### Training Dueling DQN

In [6]:
# Train the dueling DQN agent on LunarLander and record per-episode scores.
env = gym.make('LunarLander-v2')
num_games = 300
load_checkpoint = False
# Rendering every step slows training considerably; set to False to train
# without a display window.
render_env = True

agent = AgentDuelingDQN(gamma=0.99, epsilon=1.0, alpha=5e-4,
                input_dims=[8], n_actions=4, max_mem_size=100000, eps_end=0.01,
                batch_size=64, eps_dec=1e-3, replace=100)

scores = []
eps_history = []
n_steps = 0

for i in range(num_games):
    done = False
    # NOTE(review): this loop uses the reset/step signatures where reset()
    # returns the observation and step() returns a 4-tuple — confirm against
    # the installed gym version.
    observation = env.reset()
    score = 0

    while not done:
        if render_env:
            env.render()
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        n_steps += 1
        score += reward
        agent.store_transition(observation, action,
                                reward, observation_, int(done))
        agent.learn()

        observation = observation_

    scores.append(score)
    # Moving average over the most recent 100 episodes (the previous slice
    # spanned 101 entries once more than 100 games had been played).
    avg_score = np.mean(scores[-100:])
    # "\r" keeps the progress report on a single, continuously updated line.
    print('episode: ', i,'score %.1f ' % score,
             ' average score %.1f' % avg_score,
            'epsilon %.2f' % agent.epsilon,
            end = "\r", flush=True)

    eps_history.append(agent.epsilon)

# Release the environment (and its render window) once training is finished.
env.close()
information["DuelingDQN-LunarLander"] = scores

episode:  299 score 201.6   average score 134.7 epsilon 0.011

### Save the training information

In [7]:
# Write the collected scores to disk: one column per experiment label,
# with a header row and without the DataFrame index.
pd.DataFrame(information).to_csv(
    "training_info.csv",
    index=False,
    header=True,
)