In [1]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.is_available()

True

In [2]:
import gymnasium as gym

env = gym.make("LunarLander-v2",render_mode='rgb_array')

In [3]:
from collections import namedtuple, deque
import random

Transition = namedtuple('Transition', ('state', 'action','next_state', 'reward'))

class ReplayMemory():

  def __init__(self, capacity):
    self.memory = deque([], maxlen=capacity)
  
  def push(self, *args):
    self.memory.append(Transition(*args))
  
  def sample(self, batch_size):
    return random.sample(self.memory, batch_size)

  def __len__(self):
    return len(self.memory)

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
  def __init__(self, n_observations, n_actions):
    super(DQN, self).__init__()
    self.layer_input = nn.Linear(n_observations, 512)
    self.layer_h_1 = nn.Linear(512, 512)
    self.layer_h_2 = nn.Linear(512, 512)
    self.layer_v = nn.Linear(512, 1)
    self.layer_a = nn.Linear(512, n_actions)

  def forward(self, x):
    x = F.relu(self.layer_input(x))
    # x = self.dropout0(x)
    x = F.relu(self.layer_h_1(x))
    # x = self.dropout1(x)
    x = F.relu(self.layer_h_2(x))
    # x = self.dropout2(x)
    
    v = self.layer_v(x)
    a = self.layer_a(x)
    
    q = v + a - a.mean()
    
    return q

In [5]:
import torch.optim as optim
import math

REPLAY_SIZE = 100_000
BATCH_SIZE = 1024
GAMMA = 0.99
TAU = 0.005
LR = 0.00005

n_actions = env.action_space.n

state, info = env.reset()
n_observations = len(state)

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

optimizer = optim.Adam(policy_net.parameters(), lr=LR) # amsgrad? r:
memory = ReplayMemory(REPLAY_SIZE)
global_step = 0

def select_action(state, eps_threshold):
  #global steps_done
  sample = random.random()

  if sample > eps_threshold:
    with torch.no_grad():
      return policy_net(state).max(1)[1].view(1,1) # .view(1,1)? r:
  else:
    return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

episode_durations = []

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return

    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions)) # print after

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device)

    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    state_action_values = policy_net(state_batch).gather(1, action_batch) # print after


    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]
    expected_next_action_values = reward_batch + GAMMA * next_state_values

    criterion = nn.MSELoss()
    loss = criterion(state_action_values, expected_next_action_values.unsqueeze(1))
    
    optimizer.zero_grad()
    loss.backward()

    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
    
    return loss.item()

In [6]:
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 80 #6/1

if torch.cuda.is_available():
  num_episodes = 200
else:
  num_episodes = 50

h_params = {
    'EPS_START': EPS_START,
    'EPS_END': EPS_END,
    'EPS_DECAY': EPS_DECAY,
    'REPLAY_SIZE': REPLAY_SIZE,
    'BATCH_SIZE': BATCH_SIZE,
    'GAMMA': GAMMA,
    'TAU': TAU,
    'LR': LR,
    
}

In [7]:
from tensorboardX import SummaryWriter
import time
import numpy as np

timestr = time.strftime("%Y_%m_%d_%H_%M_%S")

In [8]:
from itertools import count
import numpy as np

while len(memory) < BATCH_SIZE:
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    
    while True:
        action = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device, dtype=torch.float32)
        done = terminated or truncated
        
        if terminated:
          next_state = None
        else:
          next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        
        memory.push(state, action, next_state, reward)
        
        state = next_state
        if done:
            break
env.close()

env = gym.make("LunarLander-v2",render_mode='human')

ep_losses = []

with SummaryWriter(log_dir=f'duel_runs/{timestr}') as writer:
    
    while global_step < num_episodes:
        state, info = env.reset(seed=global_step)
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * global_step / EPS_DECAY)

        ep_reward = 0
        ep_qvalues = 0
        # ep_gif_frames = []

        for t in count():

            action = select_action(state, eps_threshold)
            observation, reward, terminated, truncated, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            done = terminated or truncated

            # ep_gif_frames.append(env.render())
            ep_reward += reward.cpu().numpy().item()
            ep_qvalues += policy_net(state).max(1)[0].item()

            if terminated:
              next_state = None
            else:
              next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

            memory.push(state, action, next_state, reward)
            state = next_state

            loss_scalar = optimize_model()
            ep_losses.append(loss_scalar)

            target_net_state_dict = target_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()

            # Soft update of the target network's weights
            # θ′ ← τ θ + (1 −τ )θ′
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
            target_net.load_state_dict(target_net_state_dict)

            if done:
                mean_rewards = ep_reward # / (t + 1)
                print(f'[{global_step}/{num_episodes}]', f'[loss_mean: {np.mean(ep_losses)}]', f'[rewards_mean: {mean_rewards}]', f'[iterations: {t}]', f'[last reward: {reward.item()}]')
                break
            

        loss_mean = np.mean(ep_losses)
        
        writer.add_hparams(

            h_params,
            {
                'i_episode': global_step,
                't': t + 1,
                'Memory_len': len(memory),
                'Threshold': eps_threshold,
                'Loss': loss_mean, # loss_scalar,
                'Qvalues': ep_qvalues / (t + 1),
                'Reward': ep_reward / (t + 1),
            }, name='.', global_step=global_step,
        )
        ep_losses = []
        
        global_step += 1 
        writer.flush()

[0/200] [loss_mean: 126.54937744140625] [rewards_mean: -51.52707693836419] [iterations: 70] [last reward: -100.0]
[1/200] [loss_mean: 99.88041684054589] [rewards_mean: -470.0474033802748] [iterations: 128] [last reward: -100.0]
[2/200] [loss_mean: 60.81026342405495] [rewards_mean: -252.43054363131523] [iterations: 140] [last reward: -100.0]
[3/200] [loss_mean: 42.26033146527349] [rewards_mean: -101.64349865505937] [iterations: 97] [last reward: -100.0]
[4/200] [loss_mean: 33.196225560944654] [rewards_mean: -120.84520431328565] [iterations: 144] [last reward: -100.0]
[5/200] [loss_mean: 32.71149365764019] [rewards_mean: -351.92865830659866] [iterations: 120] [last reward: -100.0]
[6/200] [loss_mean: 33.23636688104197] [rewards_mean: -272.23162192478776] [iterations: 118] [last reward: -100.0]
[7/200] [loss_mean: 32.598240768516455] [rewards_mean: -164.82983389496803] [iterations: 90] [last reward: -100.0]
[8/200] [loss_mean: 29.66788767646341] [rewards_mean: -119.6488358611241] [iterati

In [9]:
import os

save_model_dir = './duel_saved_models'

if not os.path.exists(f'{save_model_dir}'):
    os.mkdir(f'{save_model_dir}')
if not os.path.exists(f'{save_model_dir}/{timestr}'):
    os.mkdir(f'{save_model_dir}/{timestr}')

torch.save(policy_net.state_dict(), f'{save_model_dir}/{timestr}/policy_net')
torch.save(target_net.state_dict(), f'{save_model_dir}/{timestr}/target_net')

In [None]:
#import matplotlib.pyplot as plt
#from IPython import display

#_, ax = plt.subplots(1, 1)

#img = ax.imshow(env.render())
env.close()

env = gym.make("LunarLander-v2",render_mode='human')

while True:
  state, info = env.reset()
  state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

  for t in count():
    with torch.no_grad():
      action = policy_net(state).max(1)[1].view(1,1)
    
    observation, reward, terminated, truncated, _ = env.step(action.item())
    done = terminated or truncated

    if terminated:
      next_state = None
    else:
      next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
    
    state = next_state
    
    env.render()
    #img.set_data(env.render()) 
    #ax.axis('off')
    #display.display(plt.gcf())
    #display.clear_output(wait=True)
    if done:
      break