In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque
import gym

# Define the Bayesian Q-Network
class BayesianQNetwork(nn.Module):
    """Three-layer MLP mapping a state vector to one Q-value per action.

    Dropout follows each of the two hidden ReLU layers. While the module is in
    training mode every forward pass uses a fresh dropout mask, so repeated
    passes over the same input yield different Q-value samples.

    Args:
        state_size: Number of input features.
        action_size: Number of outputs (one Q-value per action).
        hidden_size: Width of both hidden layers.
        dropout_prob: Dropout probability used after each hidden layer.
    """

    def __init__(self, state_size, action_size, hidden_size=128, dropout_prob=0.2):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept stable.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the Q-values for ``x``; the last dimension becomes ``action_size``."""
        hidden = x
        # Both hidden layers share the same pattern: linear -> ReLU -> dropout.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(F.relu(layer(hidden)))
        return self.fc3(hidden)

# Define the BDQN Agent with Double Q-Learning
class BDQNAgent:
    """Double DQN agent whose Q-networks are sampled with Monte Carlo dropout.

    The agent keeps an online network (``qnetwork_local``) that is trained and
    a target network (``qnetwork_target``) that supplies bootstrap values.
    Q-value estimates are the mean of ``n_samples`` stochastic forward passes
    with dropout enabled.

    Args:
        state_size: Dimension of the observation vector.
        action_size: Number of discrete actions.
        hidden_size: Hidden layer width passed to the Q-networks.
        dropout_prob: Dropout probability passed to the Q-networks.
        learning_rate: Adam learning rate for the online network.
        gamma: Discount factor.
        batch_size: Number of transitions per learning step.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        tau: Interpolation factor for the soft target update done in ``learn``.
        update_freq: A learning step is attempted every ``update_freq`` calls
            to ``step``.
        n_samples: Number of dropout forward passes averaged per estimate.
    """

    def __init__(self, state_size, action_size, hidden_size=128, dropout_prob=0.2, learning_rate=0.0005,
                 gamma=0.99, batch_size=128, buffer_size=100000, tau=0.01, update_freq=4, n_samples=10):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer = deque(maxlen=buffer_size)
        self.tau = tau
        self.update_freq = update_freq
        self.n_samples = n_samples

        # Capture the module-level ``device`` once so that every method places
        # its tensors on the same device as the networks.
        self.device = device

        self.qnetwork_local = BayesianQNetwork(state_size, action_size, hidden_size, dropout_prob).to(self.device)
        self.qnetwork_target = BayesianQNetwork(state_size, action_size, hidden_size, dropout_prob).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=learning_rate)

        self.t_step = 0

    def _mc_q_values(self, network, states):
        """Return the mean of ``n_samples`` dropout forward passes of ``network``.

        Dropout is only stochastic in training mode, so the network is switched
        to training mode for the sampling and its previous mode is restored
        afterwards. No gradients are recorded.
        """
        was_training = network.training
        network.train()
        try:
            with torch.no_grad():
                samples = torch.stack([network(states) for _ in range(self.n_samples)])
        finally:
            network.train(was_training)
        return samples.mean(0)

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn on every ``update_freq``-th call.

        Learning only happens once the buffer holds more than ``batch_size``
        transitions.
        """
        self.buffer.append((state, action, reward, next_state, done))
        self.t_step = (self.t_step + 1) % self.update_freq
        if len(self.buffer) > self.batch_size and self.t_step == 0:
            experiences = self.sample()
            self.learn(experiences)

    def act(self, state, eps=0.):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation (array-like of floats).
            eps: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if not random.random() > eps:
            return int(random.choice(np.arange(self.action_size)))

        state_t = torch.as_tensor(np.asarray(state, dtype=np.float32), device=self.device).unsqueeze(0)
        # Dropout stays active while sampling; in eval mode all n_samples
        # passes would be identical and the averaging would be a no-op.
        action_values = self._mc_q_values(self.qnetwork_local, state_t)
        return int(action_values.argmax(dim=1).item())

    def sample(self):
        """Draw ``batch_size`` random transitions from the buffer as tensors.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``. Actions
            are int64, everything else float32, all on ``self.device``.
        """
        experiences = random.sample(self.buffer, k=self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)

        def to_tensor(values, dtype):
            # Going through one contiguous numpy array avoids the slow
            # element-by-element path taken for a tuple of arrays.
            return torch.as_tensor(np.asarray(values, dtype=dtype), device=self.device)

        return (
            to_tensor(states, np.float32),
            to_tensor(actions, np.int64),
            to_tensor(rewards, np.float32),
            to_tensor(next_states, np.float32),
            to_tensor(dones, np.float32),
        )

    def learn(self, experiences):
        """Run one Double DQN update on a batch, then soft-update the target.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, dones)``
                of tensors as returned by ``sample``.
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: the online network selects the next action and the
        # target network evaluates it. Both estimates are built without
        # gradients, so the bootstrap target is a constant for the loss.
        next_q_local = self._mc_q_values(self.qnetwork_local, next_states)
        best_actions = next_q_local.argmax(dim=1, keepdim=True)
        next_q_target = self._mc_q_values(self.qnetwork_target, next_states)
        Q_targets_next = next_q_target.gather(1, best_actions).squeeze(1)

        # Terminal transitions (dones == 1) do not bootstrap.
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        Q_expected = self.qnetwork_local(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        loss = F.mse_loss(Q_expected, Q_targets)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.qnetwork_local.parameters(), 1)  # Gradient clipping
        self.optimizer.step()

        # Keep the target network slowly tracking the online network; without
        # this the class itself would never refresh the target weights.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Blend parameters in place: ``target = tau * local + (1 - tau) * target``."""
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

    def save(self, filename):
        """Write the online network's ``state_dict`` to ``filename``."""
        torch.save(self.qnetwork_local.state_dict(), filename)

    def load(self, filename, map_location=None):
        """Load one checkpoint into both the online and the target network.

        NOTE: ``torch.load`` unpickles the file; only load trusted checkpoints.
        """
        # Read the file once and reuse the state dict for both networks.
        state_dict = torch.load(filename, map_location=map_location)
        self.qnetwork_local.load_state_dict(state_dict)
        self.qnetwork_target.load_state_dict(state_dict)

    def update_target_network(self):
        """Hard-copy the online network's weights into the target network."""
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

# Define the evaluation loop (the agent still stores transitions and learns via agent.step)
def test(env, agent, n_episodes=10, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Run ``agent`` in ``env`` for ``n_episodes`` episodes and return the scores.

    Actions are chosen with the fixed exploration rate ``eps_end``. Every
    transition is also passed to ``agent.step``.
    NOTE(review): this means the agent keeps updating itself while being
    tested - confirm that is intended.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``. Both the
            classic API (``reset() -> obs``, 4-tuple ``step``) and the newer
            one (``reset() -> (obs, info)``, 5-tuple ``step``) are accepted.
        agent: Object exposing ``act(state, eps)`` and
            ``step(state, action, reward, next_state, done)``.
        n_episodes: Number of episodes to run.
        max_t: Maximum number of steps per episode.
        eps_start: Unused; kept so existing callers keep working.
        eps_end: Exploration rate passed to ``agent.act`` on every step.
        eps_decay: Unused; kept so existing callers keep working.

    Returns:
        List with the summed reward of each episode.
    """
    scores = []

    for i_episode in range(1, n_episodes + 1):
        reset_result = env.reset()
        # The newer API returns (observation, info) instead of the observation.
        if isinstance(reset_result, tuple) and len(reset_result) == 2 and isinstance(reset_result[1], dict):
            state = reset_result[0]
        else:
            state = reset_result

        score = 0
        # Bound the episode length so a non-terminating environment cannot hang the run.
        for _ in range(max_t):
            action = agent.act(state, eps_end)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores.append(score)
        print(f"Episode: {i_episode} Reward: {score}")

    return scores

# Initialize environment and agent
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
env = gym.make('LunarLander-v2', render_mode='human')
try:
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = BDQNAgent(state_size, action_size)

    path = 'BDQN_2.pth'
    # Reuse the device selected above so checkpoint tensors are mapped to the
    # same device the agent's networks were moved to.
    map_location = device
    try:
        agent.load(path, map_location=map_location)
    except FileNotFoundError:
        print("No checkpoint found, starting from scratch.")
    else:
        print("Checkpoint loaded from file.")

    # Evaluate the agent
    scores = test(env, agent)
finally:
    # Always release the environment (and its render window), even if
    # loading the checkpoint or running the episodes raises.
    env.close()


  deprecation(
  deprecation(


Checkpoint loaded from file.


  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1 Reward: -23.296613815232647
Episode: 2 Reward: 192.0730605204315
Episode: 3 Reward: 140.2250332625997
Episode: 4 Reward: -100.01585074094173
Episode: 5 Reward: -71.67097700583425
Episode: 6 Reward: -165.6513314487065
Episode: 7 Reward: -142.64559502585837
Episode: 8 Reward: -29.91236930214449
Episode: 9 Reward: 5.084150265856106
Episode: 10 Reward: -72.29878505021188
