In [11]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import gym

In [4]:
# Nested results container: algorithm name -> environment name -> recorded data.
# The innermost dict starts empty and is filled in later in the notebook.
information = {
    "Deep Q-Network": {
        "Lunar Landing": {},
    },
}

# Deep Q-Network (DQN) for Lunar Lander

### Build the Deep Q-Network

In [5]:
class DQN(nn.Module):
    """Fully connected network that maps a state to one value per action.

    The model is two ReLU-activated hidden layers followed by a linear output
    layer. It also owns its Adam optimizer, its MSE loss module and the torch
    device it lives on, so callers can reach them as attributes.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Sequence unpacked into the first ``nn.Linear`` call
            (e.g. ``[8]`` for an 8-feature state vector).
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of outputs of the final layer.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()

        # Keep the sizes available as plain attributes.
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        # Layers are built input -> output.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Use the first CUDA device when one is available, otherwise the CPU.
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of the last layer for ``state``."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        return self.fc3(hidden)

### Build the Double Deep Q-Network

In [None]:
class DDQN(nn.Module):
    """Fully connected network that maps a state to one value per action.

    The layer layout is the same as the ``DQN`` class in this notebook: two
    ReLU-activated hidden layers followed by a linear output layer. The model
    owns its Adam optimizer, its MSE loss module and the torch device it
    lives on.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Sequence unpacked into the first ``nn.Linear`` call
            (e.g. ``[8]`` for an 8-feature state vector).
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of outputs of the final layer.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        # Zero-argument super() resolves to this class. The previous
        # ``super(DQN, self)`` named a different class and raised TypeError
        # because a DDQN instance is not an instance of DQN.
        super().__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        # Use the first CUDA device when one is available, otherwise the CPU.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) output of the last layer for ``state``."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)
        return actions

### Build Learning Agent

In [6]:
class AgentDQN():
    """Epsilon-greedy Q-learning agent with a fixed-size circular replay buffer.

    The agent owns a single ``DQN`` network (``Q_eval``, two hidden layers of
    256 units) that is used both to pick greedy actions and to build the
    bootstrap target during learning.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        lr: Learning rate forwarded to the network's optimizer.
        input_dims: Shape of one observation as a sequence (e.g. ``[8]``).
        batch_size: Number of transitions sampled per learning step.
        n_actions: Number of discrete actions.
        max_mem_size: Capacity of the replay buffer.
        eps_end: Lower bound for ``epsilon``.
        eps_dec: Amount subtracted from ``epsilon`` after each learning step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0  # total transitions ever stored (not capped at mem_size)

        self.Q_eval = DQN(self.lr, n_actions=n_actions, input_dims=input_dims,
                          fc1_dims=256, fc2_dims=256)

        # Pre-allocated replay buffer, one row per stored transition.
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # Builtin ``bool``: the ``np.bool`` alias was removed in NumPy 1.24.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

        self.loss_history = []

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the buffer, overwriting the oldest slot when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if np.random.random() > self.epsilon:
            # Cast to float32 so the input dtype always matches the network
            # weights, whatever dtype the environment returns.
            state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                                device=self.Q_eval.device).unsqueeze(0)
            # Action selection needs no autograd graph.
            with T.no_grad():
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one gradient step on a random minibatch, then decay ``epsilon``.

        Does nothing until at least ``batch_size`` transitions have been stored.
        """
        if self.mem_cntr < self.batch_size:
            return
        self.Q_eval.optimizer.zero_grad()

        # Only sample from the slots that have been written so far.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        device = self.Q_eval.device
        state_batch = T.tensor(self.state_memory[batch]).to(device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(device)
        action_batch = T.tensor(self.action_memory[batch], dtype=T.long).to(device)

        # Q-value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval.forward(state_batch).gather(
            1, action_batch.unsqueeze(1)).squeeze(1)

        # The bootstrap target is treated as a constant: without no_grad the
        # loss would also back-propagate through the next-state estimate.
        with T.no_grad():
            q_next = self.Q_eval.forward(new_state_batch)
            q_next[terminal_batch] = 0.0  # no future value after a terminal step
            q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never drops below eps_min.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)

        self.loss_history.append(loss.item())

    def get_loss_history(self):
        """Return the list of per-step loss values (the live list, not a copy)."""
        history = self.loss_history
        return history

In [None]:
class AgentDQN():
    """Epsilon-greedy Q-learning agent with a fixed-size circular replay buffer.

    NOTE(review): this cell re-declares ``AgentDQN``, which is already defined
    in an earlier cell of the notebook. When both cells are run, this later
    definition is the one bound to the name. Confirm whether this cell was
    meant to become a separate agent (e.g. for the DDQN network) or should be
    removed.

    The agent owns a single ``DQN`` network (``Q_eval``, two hidden layers of
    256 units) that is used both to pick greedy actions and to build the
    bootstrap target during learning.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of taking a random action.
        lr: Learning rate forwarded to the network's optimizer.
        input_dims: Shape of one observation as a sequence (e.g. ``[8]``).
        batch_size: Number of transitions sampled per learning step.
        n_actions: Number of discrete actions.
        max_mem_size: Capacity of the replay buffer.
        eps_end: Lower bound for ``epsilon``.
        eps_dec: Amount subtracted from ``epsilon`` after each learning step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                 max_mem_size=100000, eps_end=0.01, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0  # total transitions ever stored (not capped at mem_size)

        self.Q_eval = DQN(self.lr, n_actions=n_actions, input_dims=input_dims,
                          fc1_dims=256, fc2_dims=256)

        # Pre-allocated replay buffer, one row per stored transition.
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # Builtin ``bool``: the ``np.bool`` alias was removed in NumPy 1.24.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

        self.loss_history = []

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the buffer, overwriting the oldest slot when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Return a random action with probability ``epsilon``, else the greedy one."""
        if np.random.random() > self.epsilon:
            # Cast to float32 so the input dtype always matches the network
            # weights, whatever dtype the environment returns.
            state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                                device=self.Q_eval.device).unsqueeze(0)
            # Action selection needs no autograd graph.
            with T.no_grad():
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one gradient step on a random minibatch, then decay ``epsilon``.

        Does nothing until at least ``batch_size`` transitions have been stored.
        """
        if self.mem_cntr < self.batch_size:
            return
        self.Q_eval.optimizer.zero_grad()

        # Only sample from the slots that have been written so far.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        device = self.Q_eval.device
        state_batch = T.tensor(self.state_memory[batch]).to(device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(device)
        action_batch = T.tensor(self.action_memory[batch], dtype=T.long).to(device)

        # Q-value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval.forward(state_batch).gather(
            1, action_batch.unsqueeze(1)).squeeze(1)

        # The bootstrap target is treated as a constant: without no_grad the
        # loss would also back-propagate through the next-state estimate.
        with T.no_grad():
            q_next = self.Q_eval.forward(new_state_batch)
            q_next[terminal_batch] = 0.0  # no future value after a terminal step
            q_target = reward_batch + self.gamma * T.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never drops below eps_min.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)

        self.loss_history.append(loss.item())

    def get_loss_history(self):
        """Return the list of per-step loss values (the live list, not a copy)."""
        history = self.loss_history
        return history

# Training the Agent to play LunarLander

In [7]:
# Train the DQN agent on LunarLander-v2, logging per-episode score and epsilon.
env = gym.make('LunarLander-v2')
agent = AgentDQN(gamma=0.99, epsilon=1.0, batch_size=64, n_actions=4, eps_end=0.01,
                  input_dims=[8], lr=0.001)
scores, eps_history = [], []
n_games = 300

try:
    for i in range(n_games):
        score = 0
        done = False
        # gym >= 0.26 returns (observation, info) from reset(); older
        # releases return the observation alone. Accept both.
        reset_result = env.reset()
        observation = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        while not done:
            env.render()
            action = agent.choose_action(observation)
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: termination and time-limit truncation are separate flags.
                observation_, reward, terminated, truncated, info = step_result
                done = terminated or truncated
            else:
                # Older API: a single done flag.
                observation_, reward, done, info = step_result
                terminated = done
            score += reward
            # Store the termination flag so a time-limit truncation (newer
            # API only) does not zero out the bootstrapped next-state value.
            agent.store_transition(observation, action, reward,
                                   observation_, terminated)
            agent.learn()
            observation = observation_
        scores.append(score)
        eps_history.append(agent.epsilon)

        # Running mean over (at most) the last 100 episodes.
        avg_score = np.mean(scores[-100:])

        print('episode ', i, 'score %.2f' % score,
                    'average score %.2f' % avg_score,
                    'epsilon %.2f' % agent.epsilon)
finally:
    # Always release the environment (and its render window), even when the
    # run is interrupted or raises.
    env.close()

information["Deep Q-Network"]["Lunar Landing"]["Epsilon"] = eps_history
information["Deep Q-Network"]["Lunar Landing"]["Reward"] = scores

episode  0 score -149.04 average score -149.04 epsilon 0.99
episode  1 score -134.87 average score -141.95 epsilon 0.96
episode  2 score -72.94 average score -118.95 epsilon 0.92
episode  3 score -118.65 average score -118.87 epsilon 0.86
episode  4 score -213.05 average score -137.71 epsilon 0.80
episode  5 score -55.50 average score -124.01 epsilon 0.75
episode  6 score -89.06 average score -119.02 epsilon 0.72
episode  7 score -25.10 average score -107.28 epsilon 0.65
episode  8 score -186.01 average score -116.02 epsilon 0.52
episode  9 score -124.78 average score -116.90 epsilon 0.46
episode  10 score -81.23 average score -113.66 epsilon 0.34
episode  11 score -430.29 average score -140.04 epsilon 0.21
episode  12 score -120.47 average score -138.54 epsilon 0.09
episode  13 score -248.60 average score -146.40 epsilon 0.01
episode  14 score -126.89 average score -145.10 epsilon 0.01
episode  15 score 275.73 average score -118.80 epsilon 0.01
episode  16 score 120.60 average score -

episode  136 score -23.59 average score -95.53 epsilon 0.01
episode  137 score 110.08 average score -89.26 epsilon 0.01
episode  138 score 179.92 average score -83.09 epsilon 0.01
episode  139 score 12.76 average score -75.22 epsilon 0.01
episode  140 score -81.07 average score -70.00 epsilon 0.01
episode  141 score 99.43 average score -61.61 epsilon 0.01
episode  142 score 229.61 average score -54.82 epsilon 0.01
episode  143 score 231.63 average score -48.07 epsilon 0.01
episode  144 score 234.92 average score -41.04 epsilon 0.01
episode  145 score 187.48 average score -34.46 epsilon 0.01
episode  146 score 36.09 average score -28.13 epsilon 0.01
episode  147 score 224.63 average score -19.85 epsilon 0.01
episode  148 score -67.07 average score -17.89 epsilon 0.01
episode  149 score -64.65 average score -15.06 epsilon 0.01
episode  150 score 209.27 average score -12.50 epsilon 0.01
episode  151 score 193.39 average score -7.25 epsilon 0.01
episode  152 score 1.68 average score -9.29 

episode  274 score 267.46 average score 149.66 epsilon 0.01
episode  275 score 257.52 average score 149.17 epsilon 0.01
episode  276 score 248.50 average score 149.55 epsilon 0.01
episode  277 score 236.45 average score 149.96 epsilon 0.01
episode  278 score 255.34 average score 149.92 epsilon 0.01
episode  279 score 236.95 average score 150.26 epsilon 0.01
episode  280 score 220.68 average score 150.27 epsilon 0.01
episode  281 score -251.71 average score 145.60 epsilon 0.01
episode  282 score -121.40 average score 141.94 epsilon 0.01
episode  283 score -119.25 average score 138.83 epsilon 0.01
episode  284 score -113.98 average score 135.59 epsilon 0.01
episode  285 score 226.05 average score 135.13 epsilon 0.01
episode  286 score -143.35 average score 131.71 epsilon 0.01
episode  287 score 46.65 average score 129.68 epsilon 0.01
episode  288 score 73.97 average score 128.70 epsilon 0.01
episode  289 score 224.81 average score 128.46 epsilon 0.01
episode  290 score 234.64 average sco

In [12]:
# Dump the collected results to disk: header row written, index column omitted.
# NOTE(review): `information` is a dict of dicts of dicts, so the resulting
# frame presumably holds the innermost dict in a single cell. Confirm this is
# the CSV layout that downstream readers expect.
pd.DataFrame(information).to_csv(
    "DQN_LL_info.csv",
    index=False,
    header=True,
)