In [5]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
import random

class NoisyLinear(nn.Module):
    """Linear layer whose weights and biases are perturbed by learnable noise.

    The effective parameters in training mode are ``mu + sigma * epsilon``,
    where ``mu`` and ``sigma`` are learned and ``epsilon`` is a noise buffer
    refreshed by :meth:`reset_noise`. The weight noise is the outer product of
    one noise vector per output unit and one per input unit. In evaluation
    mode only ``mu`` is used, so the layer behaves like a plain linear layer.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        sigma_init: Constant used to initialise every ``sigma`` entry.
    """

    def __init__(self, in_features, out_features, sigma_init=0.017):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.sigma_init = sigma_init

        # Registration order (weight_mu, weight_sigma, bias_mu, bias_sigma) is
        # kept stable: code that zips ``parameters()`` of two networks and
        # saved checkpoints both rely on it.
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.register_buffer('weight_epsilon', torch.empty(out_features, in_features))

        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))
        self.register_buffer('bias_epsilon', torch.empty(out_features))

        self.reset_parameters()
        self.reset_noise()

    def reset_parameters(self):
        """Initialise ``mu`` uniformly in ``+-1/sqrt(in_features)`` and ``sigma`` to ``sigma_init``."""
        mu_range = 1. / np.sqrt(self.in_features)
        # NOTE(review): sigma is a constant here rather than scaled by
        # 1/sqrt(in_features) -- confirm this is the intended initialisation
        # for outer-product (factorised) noise.
        with torch.no_grad():
            self.weight_mu.uniform_(-mu_range, mu_range)
            self.weight_sigma.fill_(self.sigma_init)
            self.bias_mu.uniform_(-mu_range, mu_range)
            self.bias_sigma.fill_(self.sigma_init)

    def reset_noise(self):
        """Draw fresh noise for the weight and bias epsilon buffers."""
        epsilon_in = self._scale_noise(self.in_features)
        epsilon_out = self._scale_noise(self.out_features)
        # torch.outer is the supported spelling of the deprecated Tensor.ger.
        self.weight_epsilon.copy_(torch.outer(epsilon_out, epsilon_in))
        self.bias_epsilon.copy_(epsilon_out)

    def forward(self, input):
        """Apply the layer; noise is only added while ``self.training`` is true."""
        if self.training:
            weight = self.weight_mu + self.weight_sigma * self.weight_epsilon
            bias = self.bias_mu + self.bias_sigma * self.bias_epsilon
        else:
            weight = self.weight_mu
            bias = self.bias_mu
        return F.linear(input, weight, bias)

    def _scale_noise(self, size):
        """Return ``size`` samples of ``sign(x) * sqrt(|x|)`` with ``x ~ N(0, 1)``."""
        # Sample directly on the buffers' device so no host-to-device copy is
        # needed once the module has been moved to an accelerator.
        x = torch.randn(size, device=self.weight_epsilon.device)
        return x.sign() * x.abs().sqrt()

class NoisyDQN(nn.Module):
    """Q-value network: two plain linear layers followed by two NoisyLinear layers.

    Args:
        state_dim: Number of input features per state.
        action_dim: Number of outputs (one value per action).
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        # Attribute names and creation order are part of the checkpoint format.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.noisy1 = NoisyLinear(hidden_units, hidden_units)
        self.noisy2 = NoisyLinear(hidden_units, action_dim)

    def forward(self, x):
        """Map a batch of states to one value per action."""
        hidden = x
        # ReLU follows every layer except the final output layer.
        for layer in (self.fc1, self.fc2, self.noisy1):
            hidden = F.relu(layer(hidden))
        return self.noisy2(hidden)

class NoisyDQNAgent:
    """DQN agent built on two NoisyDQN networks and a uniform replay buffer.

    An online network (``qnetwork_local``) is trained with Adam against
    bootstrapped targets from a slowly tracking target network
    (``qnetwork_target``), which is soft-updated after every learning step.

    Args:
        state_dim: Number of features in a state.
        action_dim: Number of discrete actions.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per learning step.
        gamma: Discount factor.
        lr: Adam learning rate.
        tau: Interpolation factor for the soft target update.
        update_every: Learn once every this many calls to :meth:`step`.
    """

    def __init__(self, state_dim, action_dim, buffer_size=int(1e5), batch_size=64, gamma=0.99, lr=1e-4, tau=1e-3, update_every=4):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.update_every = update_every

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.qnetwork_local = NoisyDQN(state_dim, action_dim).to(self.device)
        self.qnetwork_target = NoisyDQN(state_dim, action_dim).to(self.device)
        # Start the target as a copy of the online network so the first
        # bootstrapped targets are not produced by unrelated random weights.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        # The copy also duplicated the noise buffers; draw fresh noise for both.
        self._resample_noise()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr)

        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``update_every`` calls.

        Learning only happens once the buffer holds more than ``batch_size``
        transitions.
        """
        self.memory.append(self.experience(state, action, reward, next_state, done))

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0 and len(self.memory) > self.batch_size:
            self.learn(self.sample())

    def act(self, state):
        """Return the index of the highest-valued action for ``state``.

        Args:
            state: A NumPy array holding a single state.

        Returns:
            The arg-max over the online network's action values.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        # NOTE(review): eval mode makes the NoisyLinear layers use their mean
        # weights only, so this choice is deterministic. That suits evaluation;
        # confirm exploration during training is provided elsewhere.
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        return np.argmax(action_values.cpu().data.numpy())

    def learn(self, experiences):
        """Run one gradient step on a batch, then update target and noise.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors as produced by :meth:`sample`.
        """
        states, actions, rewards, next_states, dones = experiences

        # Targets are constants with respect to the loss, so no graph is needed.
        with torch.no_grad():
            q_targets_next = self.qnetwork_target(next_states).max(1)[0].unsqueeze(1)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        q_expected = self.qnetwork_local(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)
        self._resample_noise()

    def _resample_noise(self):
        """Draw fresh noise for the noisy layers of both networks."""
        for network in (self.qnetwork_local, self.qnetwork_target):
            network.noisy1.reset_noise()
            network.noisy2.reset_noise()

    def soft_update(self, local_model, target_model, tau):
        """Blend parameters: ``target = tau * local + (1 - tau) * target``.

        Only ``parameters()`` are blended; buffers are left untouched.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

    def sample(self):
        """Sample ``batch_size`` transitions uniformly and stack them into tensors.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``. Actions
            are ``long``; the other tensors are ``float``. Every tensor lives
            on ``self.device`` and has one row per sampled transition.
        """
        batch = [e for e in random.sample(self.memory, k=self.batch_size) if e is not None]

        def stacked(field):
            # One row per transition, in sampling order.
            return np.vstack([getattr(e, field) for e in batch])

        states = torch.from_numpy(stacked("state")).float().to(self.device)
        actions = torch.from_numpy(stacked("action")).long().to(self.device)
        rewards = torch.from_numpy(stacked("reward")).float().to(self.device)
        next_states = torch.from_numpy(stacked("next_state")).float().to(self.device)
        dones = torch.from_numpy(stacked("done").astype(np.uint8)).float().to(self.device)

        return (states, actions, rewards, next_states, dones)

    def load(self, filename):
        """Load one checkpoint into both the online and the target network.

        The file is read once and mapped onto ``self.device``, so a checkpoint
        saved on a GPU machine can be restored on a CPU-only one.
        """
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(filename, map_location=self.device)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)

    def save(self, filename):
        """Write the online network's ``state_dict`` to ``filename``."""
        torch.save(self.qnetwork_local.state_dict(), filename)

# Environment (rendered in a window) and an agent sized for 8 state features
# and 4 actions.
env = gym.make('LunarLander-v2', render_mode='human')
agent = NoisyDQNAgent(state_dim=8, action_dim=4)

# Episode budget and per-episode step limit.
n_episodes, max_t = 2000, 1000

# Epsilon schedule constants: start value, floor, multiplicative decay.
eps_start, eps_end, eps_decay = 1.0, 0.01, 0.995

# Running score bookkeeping: full history plus a window of the last 100 entries.
scores = list()
scores_window = deque(maxlen=100)
eps = eps_start


  deprecation(
  deprecation(


In [6]:
# Restore previously saved weights if a checkpoint exists; otherwise keep the
# freshly initialised agent.
path = 'noisy_DQN.pth'
try:
    # The former `map_location` local was computed but never passed anywhere,
    # so it has been removed; `agent.load` is responsible for reading the file.
    agent.load(path)
    print("Checkpoint loaded from file.")

except FileNotFoundError:
    print("No checkpoint found, starting from scratch.")

Checkpoint loaded from file.


In [7]:
# Roll out 10 episodes with the agent's current policy and print each return.
for i_episode in range(10):
    # Newer gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    score = 0
    while True:
        action = agent.act(state)
        step_result = env.step(action)
        if len(step_result) == 5:
            # New step API: the episode ends on termination or truncation.
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            # Old step API: a single `done` flag.
            next_state, reward, done, _ = step_result
        state = next_state
        score += reward
        if done:
            break

    print(f"Episode: {i_episode}, Reward: {score}")

# NOTE: the environment is closed here, so it must be re-created before this
# cell can be run again.
env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Reward: 186.58984931046936
Episode: 1, Reward: -41.29513823127909
Episode: 2, Reward: 233.40571213934012
Episode: 3, Reward: 128.8756784081858
Episode: 4, Reward: -37.93997148811772
Episode: 5, Reward: 155.0474500846796
Episode: 6, Reward: 123.41775397241207
Episode: 7, Reward: 157.10741248857423
Episode: 8, Reward: 172.51482112475261
Episode: 9, Reward: -28.428365560572274
