In [1]:
import torch

# Use the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Last expression of the cell: its value (whether CUDA was detected) is the cell output.
torch.cuda.is_available()

True

In [2]:
import gymnasium as gym
# Build the LunarLander-v2 environment with the 'rgb_array' render mode.
env = gym.make("LunarLander-v2",render_mode='rgb_array')

In [3]:
from collections import namedtuple, deque
import random

# One stored environment step: the state, the action taken in it, the state
# that followed, and the reward received.
Transition = namedtuple('Transition', ('state', 'action','next_state', 'reward'))


class ReplayMemory:
    """Fixed-capacity buffer of ``Transition`` records.

    Backed by a ``deque`` with ``maxlen=capacity``, so once the buffer is full
    each new record pushes out the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` records."""
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Pack the positional arguments into a ``Transition`` and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` stored records drawn at random without replacement.

        ``random.sample`` raises ``ValueError`` when ``batch_size`` exceeds the
        number of stored records.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Return how many records are currently stored."""
        return len(self.memory)

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Fully connected network producing one output value per action.

    Three 512-unit linear layers, each followed by ReLU, feed a final linear
    layer of width ``n_actions``. The dropout modules are constructed (p=0.2)
    but are not applied in ``forward``.
    """

    def __init__(self, n_observations, n_actions):
        """Build the layers.

        Args:
            n_observations: size of the input feature vector.
            n_actions: number of outputs of the final layer.
        """
        super().__init__()
        hidden_width = 512
        drop_prob = 0.2
        # Attribute names and creation order are kept as-is so state_dict keys
        # and parameter initialization order stay the same.
        self.layer0 = nn.Linear(n_observations, hidden_width)
        self.dropout0 = nn.Dropout(drop_prob)
        self.layer1 = nn.Linear(hidden_width, hidden_width)
        self.dropout1 = nn.Dropout(drop_prob)
        self.layer2 = nn.Linear(hidden_width, hidden_width)
        self.dropout2 = nn.Dropout(drop_prob)
        self.layer3 = nn.Linear(hidden_width, n_actions)

    def forward(self, x):
        """Apply the three hidden layers with ReLU, then the linear output layer."""
        for hidden_layer in (self.layer0, self.layer1, self.layer2):
            x = F.relu(hidden_layer(x))
        return self.layer3(x)

In [10]:
import torch.optim as optim
import math

# Hyperparameters used by the replay memory, the loss target and the optimizer.
REPLAY_SIZE = 100_000  # capacity passed to ReplayMemory below
BATCH_SIZE = 1024      # number of transitions sampled per optimize_model() call
GAMMA = 0.98           # multiplies the next-state value in the training target
TAU = 0.01             # weight of the policy network in the soft target-network update
LR = 0.00001           # learning rate passed to Adam below

n_actions = env.action_space.n

# Reset once to obtain an observation; its length is the network's input size.
state, info = env.reset()
n_observations = len(state)

# Two networks with the same architecture; the target network starts as an
# exact copy of the policy network's weights.
policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.Adam(policy_net.parameters(), lr=LR) # open question left by the author: enable amsgrad?
memory = ReplayMemory(REPLAY_SIZE)
global_step = 0  # the training cell uses this as its episode counter


#steps_done = 0

def select_action(state, eps_threshold):
    """Pick an action epsilon-greedily.

    Draws one number from ``random.random()``. If it is greater than
    ``eps_threshold`` the action with the largest ``policy_net`` output for
    ``state`` is chosen; otherwise an action is sampled from
    ``env.action_space``.

    Args:
        state: input tensor passed directly to ``policy_net``.
        eps_threshold: probability-like threshold for taking a random action.

    Returns:
        A ``torch.long`` tensor of shape (1, 1) holding the action index.
    """
    draw = random.random()
    exploit = draw > eps_threshold

    if not exploit:
        random_action = env.action_space.sample()
        return torch.tensor([[random_action]], device=device, dtype=torch.long)

    with torch.no_grad():
        q_values = policy_net(state)
        # max over dim 1 yields (values, indices); the index is the greedy action.
        return q_values.max(1).indices.view(1, 1)

# Bookkeeping list; none of the visible cells append to it or read it.
episode_durations = []

def optimize_model():
    """Run one optimization step of ``policy_net`` on a sampled replay batch.

    Samples ``BATCH_SIZE`` transitions from ``memory``, evaluates the policy
    network's output for the actions that were taken, and regresses it (mean
    squared error) onto ``reward + GAMMA * max(target_net(next_state))``.
    Transitions stored with ``next_state=None`` use a next-state value of 0,
    so their target is the reward alone.

    Returns:
        The loss as a Python float, or ``None`` when ``memory`` holds fewer
        than ``BATCH_SIZE`` transitions (no update is performed in that case).
    """
    if len(memory) < BATCH_SIZE:
        return None

    transitions = memory.sample(BATCH_SIZE)
    # Transpose: list of Transition(state, action, ...) -> one Transition whose
    # fields are tuples of all states, all actions, and so on.
    batch = Transition(*zip(*transitions))

    # True where the transition has a stored next state.
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state], device=device, dtype=torch.bool
    )
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Output of the policy network for the action actually taken in each state.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Entries without a next state keep the value 0. torch.cat raises on an
    # empty list, so the target network is skipped when every sampled
    # transition lacks a next state.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(torch.cat(non_final_next_states)).max(1)[0]
    expected_next_action_values = reward_batch + GAMMA * next_state_values

    criterion = nn.MSELoss()
    # unsqueeze(1) matches the (batch, 1) shape produced by gather above.
    loss = criterion(state_action_values, expected_next_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()

    # Clamp each gradient element to [-100, 100] before the optimizer step.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

    return loss.item()

[ 0.00460558  1.3994718   0.46647897 -0.508822   -0.00532992 -0.10566436
  0.          0.        ]


In [6]:
from matplotlib import animation
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib.animation import FuncAnimation, PillowWriter

def save_frames_as_gif(frames, path='./', filename='gym_animation.gif'):
    """Write a sequence of image frames to an animated GIF.

    Args:
        frames: non-empty sequence of image arrays; ``frames[0].shape`` gives
            the height and width that size the figure.
        path: directory in which to write the file.
        filename: name of the GIF file inside ``path``.

    Raises:
        IndexError: if ``frames`` is empty.

    The animation is saved with matplotlib's 'imagemagick' writer
    (NOTE(review): presumably requires ImageMagick to be installed; confirm).
    """
    import os

    first_frame = frames[0]

    # Mess with this to change frame size: at dpi=72 the figure is one pixel
    # per frame pixel.
    fig = plt.figure(figsize=(first_frame.shape[1] / 72.0, first_frame.shape[0] / 72.0), dpi=72)
    try:
        patch = plt.imshow(first_frame)
        plt.axis('off')

        def animate(i):
            patch.set_data(frames[i])

        anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
        # os.path.join inserts the separator when `path` lacks a trailing one;
        # plain concatenation would otherwise glue the directory and file names together.
        anim.save(os.path.join(path, filename), writer='imagemagick', fps=30)
    finally:
        # Release the figure so repeated calls (e.g. once per episode) do not
        # accumulate open figures.
        plt.close(fig)

In [7]:
# Exploration schedule. The training cell computes the threshold as
#   EPS_END + (EPS_START - EPS_END) * exp(-global_step / EPS_DECAY)
# so it starts at EPS_START and decays towards EPS_END.
EPS_START = 0.9
EPS_END = 0.01
EPS_DECAY = 30 # decay constant of the schedule (author's note: 6/1)

# Number of training episodes: more when CUDA is available, fewer on CPU.
if torch.cuda.is_available():
  num_episodes = 300
else:
  num_episodes = 50

# Hyperparameters gathered in one dict; the training cell passes it to
# writer.add_hparams.
h_params = {
    'EPS_START': EPS_START,
    'EPS_END': EPS_END,
    'EPS_DECAY': EPS_DECAY,
    'REPLAY_SIZE': REPLAY_SIZE,
    'BATCH_SIZE': BATCH_SIZE,
    'GAMMA': GAMMA,
    'TAU': TAU,
    'LR': LR,
    
}

In [8]:
from tensorboardX import SummaryWriter
import time
import numpy as np

# Timestamp of when this cell ran; later cells use it to name the log
# directory (runs/<timestr>) and the saved-model directory.
timestr = time.strftime("%Y_%m_%d_%H_%M_%S")

In [9]:
from itertools import count
import numpy as np

# Warm-up: fill the replay memory with random-action episodes until it holds
# at least BATCH_SIZE transitions, so optimize_model() can sample a full batch.
env.close()
env = gym.make("LunarLander-v2",render_mode='rgb_array')

# The size check runs only between episodes, so the memory may end up holding
# more than BATCH_SIZE transitions.
while len(memory) < BATCH_SIZE:
    state, info = env.reset()
    # Add a leading dimension of size 1 to the observation.
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    
    while True:
        # Uniformly random action, wrapped as a (1, 1) long tensor like select_action returns.
        action = torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device, dtype=torch.float32)
        done = terminated or truncated
        
        # next_state is None only when the step reported `terminated`; an
        # episode that was merely truncated keeps its next state.
        if terminated:
          next_state = None
        else:
          next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        
        memory.push(state, action, next_state, reward)
        
        state = next_state
        if done:
            # Progress print: memory size expressed in units of 100 batches.
            if int(len(memory) / (BATCH_SIZE * 100)) % 10 == 0:
                print(len(memory) / (BATCH_SIZE * 100))
            break
env.close()

# Training: one pass of the outer loop is one episode. The environment is
# re-created with the 'human' render mode for this phase.
env = gym.make("LunarLander-v2",render_mode='human')

# Losses of the current episode; emptied at the end of every episode.
ep_losses = []

with SummaryWriter(log_dir=f'runs/{timestr}') as writer:
    
    # global_step counts episodes here, not environment steps.
    while global_step < num_episodes:
        # Each episode is reset with its own index as the seed.
        state, info = env.reset(seed=global_step)
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

        # Exploration threshold, fixed for the whole episode; decays
        # exponentially with the episode index from EPS_START towards EPS_END.
        eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * global_step / EPS_DECAY)

        # Per-episode accumulators for the logged averages.
        ep_reward = 0
        ep_qvalues = 0
        # ep_gif_frames = []

        for t in count():

            action = select_action(state, eps_threshold)
            observation, reward, terminated, truncated, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device, dtype=torch.float32)
            done = terminated or truncated

            # ep_gif_frames.append(env.render())
            ep_reward += reward.cpu().numpy().item()
            # Largest policy-network output for the state the action was taken
            # in (`state` has not been advanced yet at this point).
            ep_qvalues += policy_net(state).max(1)[0].item()

            # next_state is None only when the step reported `terminated`; an
            # episode that was merely truncated keeps its next state.
            if terminated:
              next_state = None
            else:
              next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

            memory.push(state, action, next_state, reward)
            state = next_state

            # One optimization step per environment step. optimize_model()
            # returns None while the memory is smaller than BATCH_SIZE; the
            # warm-up loop earlier in this cell fills the memory beforehand.
            loss_scalar = optimize_model()
            ep_losses.append(loss_scalar)

            target_net_state_dict = target_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()

            # Soft update of the target network's weights
            # θ′ ← τ θ + (1 −τ )θ′
            for key in policy_net_state_dict:
                target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
            target_net.load_state_dict(target_net_state_dict)

            if done:
                # t is zero-based, so the episode lasted t + 1 steps.
                mean_rewards = ep_reward / (t + 1)
                print(f'[{global_step}/{num_episodes}]', f'[loss_mean: {np.mean(ep_losses)}]', f'[rewards_mean: {mean_rewards}]', f'[iterations: {t}]', f'[last reward: {reward.item()}]')

                # Disabled GIF export; it refers to `i_episode`, which is not
                # defined in the visible cells.
                #if i_episode % 1 == 0:
                #    save_frames_as_gif(ep_gif_frames, path='./ep_gifs/', filename=f'{i_episode}.gif')
                break
            

        loss_mean = np.mean(ep_losses)
        # The bare string below is disabled per-scalar logging kept for
        # reference; as a string expression it has no effect at runtime.
        """
        writer.add_scalar('Train/Loss', loss_scalar, global_step)
        writer.add_scalar('Train/Threshold', eps_threshold, global_step)
        writer.add_scalar('Train/Reward', ep_reward / (t + 1), global_step)
        writer.add_scalar('Train/Q_values', ep_qvalues / (t + 1), global_step)
        writer.add_scalar('Train/Memorylen', len(memory), global_step)
        writer.add_scalar('Train/t', t + 1, global_step)
        writer.add_scalar('Train/global_step', global_step, global_step)
        """
        # Log the hyperparameters together with this episode's metrics
        # (per-step averages for loss, Q-value and reward).
        writer.add_hparams(

            h_params,
            {
                'i_episode': global_step,
                't': t + 1,
                'Memory_len': len(memory),
                'Threshold': eps_threshold,
                'Loss': loss_mean, # loss_scalar,
                'Qvalues': ep_qvalues / (t + 1),
                'Reward': ep_reward / (t + 1),
            }, name='.', global_step=global_step,
        )
        # Start the next episode with an empty loss list.
        ep_losses = []
        
        global_step += 1 
        writer.flush()

tensor([[ 0.0016,  1.4103,  0.1649, -0.0261, -0.0019, -0.0373,  0.0000,  0.0000]],
       device='cuda:0')
tensor([[ 0.0032,  1.4092,  0.1563, -0.0514, -0.0020, -0.0033,  0.0000,  0.0000]],
       device='cuda:0')
tensor([[ 0.0047,  1.4074,  0.1563, -0.0781, -0.0022, -0.0033,  0.0000,  0.0000]],
       device='cuda:0')
tensor([[ 6.2251e-03,  1.4051e+00,  1.4556e-01, -1.0489e-01, -2.2928e-04,
          3.9604e-02,  0.0000e+00,  0.0000e+00]], device='cuda:0')
tensor([[ 7.7765e-03,  1.4021e+00,  1.5508e-01, -1.3196e-01, -1.6010e-04,
          1.3841e-03,  0.0000e+00,  0.0000e+00]], device='cuda:0')
tensor([[ 9.3287e-03,  1.3999e+00,  1.5514e-01, -9.6825e-02, -8.6834e-05,
          1.4654e-03,  0.0000e+00,  0.0000e+00]], device='cuda:0')
tensor([[ 1.0825e-02,  1.3978e+00,  1.4987e-01, -9.2165e-02, -2.7851e-04,
         -3.8342e-03,  0.0000e+00,  0.0000e+00]], device='cuda:0')
tensor([[ 1.2322e-02,  1.3952e+00,  1.4987e-01, -1.1883e-01, -4.6996e-04,
         -3.8293e-03,  0.0000e+00,  0.000

KeyboardInterrupt: 

In [None]:
import os

# Directory for this run's weights. makedirs creates the missing parent
# './saved_models' as well, and exist_ok=True makes re-running the cell safe
# without a separate existence check.
model_dir = f'./saved_models/{timestr}'
os.makedirs(model_dir, exist_ok=True)

# Save the weights (state_dict) of both networks.
torch.save(policy_net.state_dict(), f'{model_dir}/policy_net')
torch.save(target_net.state_dict(), f'{model_dir}/target_net')

In [None]:
#import matplotlib.pyplot as plt
#from IPython import display

#_, ax = plt.subplots(1, 1)

#img = ax.imshow(env.render())
# Close whatever environment is currently bound to `env`, then open a fresh
# one with the 'human' render mode for watching the trained policy.
env.close()

env = gym.make("LunarLander-v2",render_mode='human')

# Play greedy episodes (always the action with the largest policy_net output)
# until the kernel is interrupted. The loop has no other exit, so the
# try/finally guarantees the environment is closed when the interrupt arrives.
try:
  while True:
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    done = False
    while not done:
      with torch.no_grad():
        action = policy_net(state).max(1)[1].view(1,1)

      observation, reward, terminated, truncated, _ = env.step(action.item())
      done = terminated or truncated

      # After termination there is no next state; the loop ends right after,
      # and the next episode starts from env.reset().
      if terminated:
        next_state = None
      else:
        next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

      state = next_state

      # NOTE(review): with render_mode='human' this explicit call may be
      # redundant; kept to preserve the original behavior. Confirm.
      env.render()
      #img.set_data(env.render()) 
      #ax.axis('off')
      #display.display(plt.gcf())
      #display.clear_output(wait=True)
finally:
  env.close()

In [None]:
import math

# Scratch check: value of exp(-step / a) at step 100 and step 200 for a = 150.
a = 150
math.exp(-1. * 100 / a), math.exp(-1. * 200 / a)