In [1]:
from collections import namedtuple, deque
import random
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import torchvision.transforms as transform
import gym
import warnings
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from models import DQN, Duel_DQN, NoisyLinear
from memory import ReplayMemory
from LunLand import LL

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("LunarLander-v2")
env.seed(0)
date = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")

In [4]:
def train(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.9995):
    
    #reporting vars
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    episode_length_window = deque(maxlen=100)
    avg_loss_window = deque(maxlen=100)
    avg_loss_list = []
    episode_length_list = []
    eps = eps_start
    start = time.time()
    
    for i_episode in range(1, n_episodes + 1):
        training_episode = i_episode
        state = env.reset()
        score = 0
        avg_loss = 0
        epis_length = 0
        for t in range(max_t):
            action = lunar_agent.decide_action(state, eps)
            next_state, reward, done, _ = env.step(action)
            lunar_agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            avg_loss += float(lunar_agent.get_Loss())
            epis_length += 1
            if done:
                break
                
        #append scores
        episode_length_list.append(epis_length)
        episode_length_window.append(epis_length)
        scores_window.append(score)
        scores.append(score)
        avg_loss_window.append(avg_loss)
        avg_loss_list.append(avg_loss)
        eps = max(eps_end, eps_decay * eps)
        
        #REMOVE FROM END
        print('\rEpisode {}\tAverage Score: {:.2f} \taverage Loss {:.2f} \tepisode length {:.2f}'.format(i_episode,np.mean(scores_window),
                                                                                                         np.mean(avg_loss_window), np.mean( episode_length_window)),end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f} \taverage Loss {:.2f} \tepisode length {:.2f}'.format(i_episode, np.mean( scores_window), np.mean( avg_loss_window), np.mean(episode_length_window)))
            torch.save(lunar_agent.policy_net.state_dict(), "./results/checkpoint_duel_dDQN_half_lunar_agent_"+date+str(i_episode)+".pth")
    
    torch.save(lunar_agent.policy_net.state_dict(), "./results/checkpoint_final_duel_dDQN_half_lunar_agent_"+date+".pth")
    return scores, avg_loss_list, episode_length_list

In [5]:
"""class LL:
    def __init__(self, state_size, action_size, seed, batch_size=64, gamma=0.99, learning_rate=1e-4,
                 buffer_size=int(1e5), n_every=4, tau=1e-3, device = DEVICE, noisy = False, dueling = False, dDQN = False):"""

tau = 0.0007161331306967163
learning_rate = 0.0020105330891440187
gamma = 0.9997133549438102
eps_start = 0.9221004582368262
eps_end = 0.023580916333788057
eps_decay = 0.9523200490821455
    
lunar_agent = LL(state_size=env.observation_space.shape[0], action_size=env.action_space.n, seed=0, gamma = gamma,
                 learning_rate=learning_rate, buffer_size=int(1e5), n_every = 4, tau=tau, device = DEVICE, 
                 noisy = False, dueling = True, dDQN = True)

#set training episode to 0
training_episode = 0

#train agent
scores, avg_loss_list, episode_length_list = train(n_episodes=800, max_t=500, eps_start=eps_start, eps_end=eps_end, eps_decay=eps_decay)

#save scores
scores = np.array(scores)
losslist = np.array(avg_loss_list)
lengthlist = np.array(episode_length_list)
df = pd.DataFrame(scores, columns=['Scores'])
df['Loss'] = losslist
df['Episode length'] = lengthlist
df.to_csv("./results/opt_duel_dDQN_half_res_"+date+'.csv')

Episode 100	Average Score: -160.96 	average Loss 6536.00 	episode length 193.26
Episode 200	Average Score: -32.75 	average Loss 2607.66 	episode length 239.250
Episode 300	Average Score: -3.79 	average Loss 2044.62 	episode length 248.109
Episode 400	Average Score: 7.66 	average Loss 1443.06 	episode length 240.246
Episode 500	Average Score: -1.19 	average Loss 1189.97 	episode length 236.09
Episode 600	Average Score: -11.71 	average Loss 883.34 	episode length 237.09
Episode 700	Average Score: 7.68 	average Loss 951.15 	episode length 240.6397
Episode 800	Average Score: 23.41 	average Loss 878.93 	episode length 243.72
