In [1]:
from collections import namedtuple, deque
import random
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import torchvision.transforms as transform
import gym
import warnings
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from models import DQN
from memory import ReplayMemory

In [10]:

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LL:
    def __init__(self, state_size, action_size, seed, batch_size=64, gamma=0.99, learning_rate=1e-4,
                 buffer_size=int(1e5), n_every=4, tau=1e-3, device = DEVICE, noisy = False, pr_xp = False):
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(0)
        self.batch_size = 64
        self.buffer_size = int(1e5)
        self.n_update = 4
        self.Loss = 0
        
        #hyperparameters
        self.tau = tau
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.policy_net = DQN(self.state_size, self.action_size).to(DEVICE)
        self.target_net = DQN(self.state_size, self.action_size).to(DEVICE)
        self.memory = ReplayMemory(self.action_size, self.buffer_size, self.batch_size, self.seed)

        #get dict for policy
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval() #set to evalution model
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)#use adam adaptive gradient descent optimizer
        self.n_step = 0  # initialize timestep var

    def step(self, state, action, reward, next_state, terminal):
        self.memory.add_memory(state, action, reward, next_state, terminal) #add to memory
        self.n_step = (self.n_step + 1) % self.n_update #update per time steps
        if self.n_step == 0: #get experience
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.train_model(experiences, self.gamma)

    def decide_action(self, state, epsilon=0):
        return random.choice(np.arange(self.action_size))

    def soft_update(self, policy_net, target_net, tau):
        for target_param, policy_param in zip(target_net.parameters(), policy_net.parameters()):
            target_param.data.copy_(tau * policy_param.data + (1.0 - tau) * target_param.data)

    def train_model(self, experiences, gamma):
        #prioritized replay training
        if len(experiences) > 5:
            states, actions, rewards, next_states, terminal_runs, indices, weights = experiences
            Q_net_targets = self.target_net(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + (gamma * Q_net_targets * (1 - terminal_runs)) #0 = not terminal
            Q_pred = self.policy_net(states).gather(1, (actions.type(torch.LongTensor).to(DEVICE)))
            loss = F.mse_loss(Q_pred, Q_targets)
            loss = (loss * weights)
            prios = loss + 1e-5
            loss = loss.mean()
        
        #replay memory training
        else:
            states, actions, rewards, next_states, terminal_runs = experiences
            Q_net_targets = self.target_net(next_states).detach().max(1)[0].unsqueeze(1)
            Q_targets = rewards + (gamma * Q_net_targets * (1 - terminal_runs))
            Q_pred = self.policy_net(states).gather(1, (actions.type(torch.LongTensor)).to(DEVICE))
            loss = F.mse_loss(Q_pred, Q_targets)
        
        #update loss and backprop
        self.Loss = loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(policy_net=self.policy_net, target_net=self.target_net, tau=self.tau)

    def get_Loss(self):
        return self.Loss

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = gym.make("LunarLander-v2")
env.seed(0)
date = datetime.now().strftime("%Y_%m_%d-%I_%M_%S_%p")

In [11]:
def train(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.9995):
    
    #reporting vars
    scores = []  # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    episode_length_window = deque(maxlen=100)
    avg_loss_window = deque(maxlen=100)
    avg_loss_list = []
    episode_length_list = []
    eps = eps_start
    start = time.time()
    
    for i_episode in range(1, n_episodes + 1):
        training_episode = i_episode
        state = env.reset()
        score = 0
        avg_loss = 0
        epis_length = 0
        for t in range(max_t):
            action = lunar_agent.decide_action(state, eps)
            next_state, reward, done, _ = env.step(action)
            lunar_agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            avg_loss += float(lunar_agent.get_Loss())
            epis_length += 1
            if done:
                break
                
        #append scores
        episode_length_list.append(epis_length)
        episode_length_window.append(epis_length)
        scores_window.append(score)
        scores.append(score)
        avg_loss_window.append(avg_loss)
        avg_loss_list.append(avg_loss)
        eps = max(eps_end, eps_decay * eps)
        
        #REMOVE FROM END
        print('\rEpisode {}\tAverage Score: {:.2f} \taverage Loss {:.2f} \tepisode length {:.2f}'.format(i_episode,np.mean(scores_window),
                                                                                                         np.mean(avg_loss_window), np.mean( episode_length_window)),end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f} \taverage Loss {:.2f} \tepisode length {:.2f}'.format(i_episode, np.mean( scores_window), np.mean( avg_loss_window), np.mean(episode_length_window)))
            torch.save(lunar_agent.policy_net.state_dict(), "./results/checkpoint_random_lunar_agent_"+date+".pth")
    
    torch.save(lunar_agent.policy_net.state_dict(), "./results/checkpoint_final_random_lunar_agent_"+date+".pth")
    return scores, avg_loss_list, episode_length_list

In [12]:
lunar_agent = LL(state_size=env.observation_space.shape[0], action_size=env.action_space.n, seed=0, noisy = False, pr_xp = True)

#set training episode to 0
training_episode = 0

#train agent
scores, avg_loss_list, episode_length_list = train(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.9995)

#save scores
scores = np.array(scores)
losslist = np.array(avg_loss_list)
lengthlist = np.array(episode_length_list)
df = pd.DataFrame(scores, columns=['Scores'])
df['Loss'] = losslist
df['Episode length'] = lengthlist
df.to_csv("./results/random_res_"+date+'.csv')

Episode 100	Average Score: -185.28 	average Loss 11406.55 	episode length 92.79
Episode 200	Average Score: -203.14 	average Loss 9034.70 	episode length 95.447
Episode 300	Average Score: -161.58 	average Loss 6798.02 	episode length 100.05
Episode 400	Average Score: -167.89 	average Loss 4438.86 	episode length 90.170
Episode 500	Average Score: -200.45 	average Loss 3794.30 	episode length 90.39
Episode 600	Average Score: -193.19 	average Loss 3198.68 	episode length 91.85
Episode 700	Average Score: -197.68 	average Loss 3277.45 	episode length 93.83
Episode 800	Average Score: -188.05 	average Loss 2983.83 	episode length 88.28
Episode 900	Average Score: -179.99 	average Loss 3182.87 	episode length 90.50
Episode 1000	Average Score: -175.79 	average Loss 3042.07 	episode length 91.60
Episode 1100	Average Score: -190.72 	average Loss 3134.78 	episode length 92.13
Episode 1200	Average Score: -172.40 	average Loss 3159.85 	episode length 90.64
Episode 1300	Average Score: -186.60 	average 