In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

In [2]:
# Create the Lunar Lander environment. render_mode='human' asks gym to draw
# every step on screen so the evaluation episodes can be watched.
env = gym.make(id='LunarLander-v2', render_mode='human')

  deprecation(
  deprecation(


In [3]:
class GaussianMixtureModel(nn.Module):
    """Network that predicts a Gaussian mixture for every action.

    Two ReLU hidden layers feed three linear heads. Each head emits
    ``action_dim * num_components`` values, which are viewed as
    ``(-1, action_dim, num_components)``: component means, component
    log-variances and unnormalised component weights.
    """

    def __init__(self, state_dim, action_dim, num_components, hidden_dim=256):
        super().__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.num_components = num_components

        # Shared hidden layers.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Each head produces one value per (action, component) pair.
        head_width = action_dim * num_components
        self.mean = nn.Linear(hidden_dim, head_width)
        self.log_var = nn.Linear(hidden_dim, head_width)
        self.logits = nn.Linear(hidden_dim, head_width)

    def forward(self, state):
        """Return ``(mean, log_var, logits)`` for ``state``.

        Every output is viewed as ``(-1, action_dim, num_components)``.
        ``log_var`` is clamped to ``[-10, 10]`` for numerical stability.
        """
        hidden = torch.relu(self.fc2(torch.relu(self.fc1(state))))
        out_shape = (-1, self.action_dim, self.num_components)

        mean = self.mean(hidden).view(*out_shape)
        log_var = self.log_var(hidden).view(*out_shape).clamp(min=-10, max=10)
        logits = self.logits(hidden).view(*out_shape)
        return mean, log_var, logits

    def get_distribution(self, state):
        """Return the mixture parameters in their natural (non-log) form.

        Returns a tuple ``(mean, variance, weights)`` where ``variance`` is
        ``exp(log_var)`` and ``weights`` is the softmax of the logits over the
        last (component) dimension, so the weights of each action sum to 1.
        """
        mean, log_var, logits = self.forward(state)
        variance = log_var.exp()
        weights = logits.softmax(dim=-1)
        return mean, variance, weights


In [4]:
# Experience replay buffer
class ReplayBuffer:
    """Fixed-capacity experience store with uniform random sampling.

    Once ``capacity`` experiences are stored, every new one replaces the
    oldest. Storage is a ring buffer, so ``push`` is O(1) instead of the O(n)
    cost of ``list.pop(0)``. After the buffer has wrapped around,
    ``self.buffer`` is no longer in insertion order; sampling is uniform, so
    the order of the stored items does not affect it.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` experiences.

        Raises:
            ValueError: if ``capacity`` is not positive (such a buffer could
                never store anything).
        """
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.buffer = []
        self.capacity = capacity
        # Slot that the next push overwrites once the buffer is full.
        self._next_slot = 0

    def __len__(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

    def push(self, experience):
        """Store ``experience``, evicting the oldest entry when full."""
        if len(self.buffer) < self.capacity:
            self.buffer.append(experience)
        else:
            self.buffer[self._next_slot] = experience
        self._next_slot = (self._next_slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen uniformly.

        Raises:
            ValueError: (from ``np.random.choice``) if ``batch_size`` exceeds
                the number of stored experiences.
        """
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        return [self.buffer[idx] for idx in indices]



In [5]:
def compute_loss(mean, var, logits, target):
    """Negative log-likelihood of ``target`` under a Gaussian mixture.

    Args:
        mean: Component means.
        var: Component variances (presumably ``exp(log_var)`` as produced by
            ``GaussianMixtureModel.get_distribution`` -- confirm at the call
            site). Must be non-negative.
        logits: Mixture weights. Despite the name these are expected to be
            probabilities (already passed through softmax); their log is
            taken here.
        target: Values to score. A trailing dimension is added and the result
            is expanded to the shape of ``mean``.

    Returns:
        Scalar tensor: the mean over the remaining dimensions of
        ``-logsumexp(component log-likelihood + log weight)``.
    """
    # torch.distributions.Normal is parameterised by the standard deviation
    # (scale), not the variance, so convert before building the distribution.
    std = torch.sqrt(var)
    components = torch.distributions.Normal(mean, std)

    # Broadcast the target against every mixture component.
    expanded_target = target.unsqueeze(-1).expand_as(mean)
    log_prob = components.log_prob(expanded_target)

    # Reduce over the second-to-last dimension and add the log mixture weight.
    # The epsilon guards against log(0) for components with zero weight.
    # NOTE(review): the reduction axis is kept exactly as before; which axis
    # dim=-2 is depends on the shapes the caller passes -- confirm it is the
    # intended one.
    log_prob = torch.sum(log_prob, dim=-2) + torch.log(logits + 1e-10)

    # logsumexp over the components gives the mixture log-likelihood in a
    # numerically stable way; negate and average to obtain the loss.
    loss = -torch.logsumexp(log_prob, dim=-1).mean()
    return loss

In [6]:
def save_checkpoint(state, filename='checkpoint.pth'):
    """Serialize ``state`` to ``filename`` using ``torch.save``."""
    torch.save(obj=state, f=filename)

In [7]:
def load_checkpoint(filename='checkpoint.pth', map_location=None):
    """Load and return the object stored at ``filename`` via ``torch.load``.

    ``map_location`` is forwarded to ``torch.load`` only when it is truthy;
    a falsy value means ``torch.load`` is called with its default mapping.
    """
    extra_kwargs = {'map_location': map_location} if map_location else {}
    return torch.load(filename, **extra_kwargs)

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [15]:
# Hyperparameters
# Fixed settings: hidden layer width, mixture components per action, Adam step size.
hidden_dim = 128
num_components = 5
learning_rate = 5e-4

# Network input/output sizes are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

In [10]:
# Build the online network first, then a second network of identical shape
# that serves as the target network.
model = GaussianMixtureModel(
    state_dim=state_dim,
    action_dim=action_dim,
    num_components=num_components,
    hidden_dim=hidden_dim,
)
target_model = GaussianMixtureModel(
    state_dim=state_dim,
    action_dim=action_dim,
    num_components=num_components,
    hidden_dim=hidden_dim,
)

In [11]:
# Place both networks on the selected device. Module.to() returns the module
# itself, so the final expression is what the notebook shows as cell output.
model.to(device=device)
target_model.to(device=device)

GaussianMixtureModel(
  (fc1): Linear(in_features=8, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (mean): Linear(in_features=128, out_features=20, bias=True)
  (log_var): Linear(in_features=128, out_features=20, bias=True)
  (logits): Linear(in_features=128, out_features=20, bias=True)
)

In [16]:
# Adam over the online network's parameters only; the target network is not
# handed to the optimizer.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [12]:
# File the saved networks, optimizer state and training progress are read from.
checkpoint_path = 'mog_dqn.pth'

In [17]:
# Restore both networks, the optimizer and the training progress from the
# checkpoint file when it exists.
# Without CUDA the saved tensors are remapped onto the CPU; otherwise
# torch.load's default device mapping is used.
map_location = None if torch.cuda.is_available() else torch.device('cpu')
try:
    checkpoint = load_checkpoint(checkpoint_path, map_location=map_location)
except FileNotFoundError:
    # NOTE(review): on this path `epsilon` and `start_episode` are never
    # defined; any later cell that reads them would need its own defaults.
    print("No checkpoint found, starting from scratch.")
else:
    model.load_state_dict(checkpoint['main_net_state_dict'])
    target_model.load_state_dict(checkpoint['target_net_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epsilon = checkpoint['epsilon']
    # Resume from the episode after the one stored in the checkpoint.
    start_episode = checkpoint['episode'] + 1
    print(f"Loaded checkpoint from episode {start_episode}")

Loaded checkpoint from episode 603


In [18]:
# Copy the online network's weights into the target network and switch the
# target network to evaluation mode.
# NOTE(review): this overwrites whatever target weights were restored from the
# checkpoint in the previous cell -- confirm that is intended.
target_model.load_state_dict(state_dict=model.state_dict())
target_model.eval()

GaussianMixtureModel(
  (fc1): Linear(in_features=8, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (mean): Linear(in_features=128, out_features=20, bias=True)
  (log_var): Linear(in_features=128, out_features=20, bias=True)
  (logits): Linear(in_features=128, out_features=20, bias=True)
)

In [20]:
# Roll out the greedy policy for 10 episodes and print each episode's return.
# The environment is closed even if an episode raises.
try:
    for episode in range(10):
        # Newer gym releases return (observation, info) from reset(); older
        # ones return the observation alone. Accept both.
        reset_out = env.reset()
        state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        episode_reward = 0

        while True:
            # The model lives on `device`, so the input must be created there
            # too; otherwise the forward pass fails when CUDA is selected.
            state_tensor = torch.tensor(
                state, dtype=torch.float32, device=device
            ).unsqueeze(0)

            # Pure inference: no gradients are needed.
            with torch.no_grad():
                mean, var, logits = model.get_distribution(state_tensor)

            # Score each action by the plain average of its component means,
            # then softmax across actions.
            # NOTE(review): the mixture weights (`logits`) and `var` are not
            # used for action selection -- confirm this is intended.
            # .cpu() is required before .numpy() when running on the GPU.
            action_probs = mean.mean(dim=-1).softmax(dim=-1).cpu().numpy()

            # Handle NaN values in action_probs
            if np.isnan(action_probs).any():
                action_probs = np.nan_to_num(action_probs, nan=1.0/action_dim)
                action_probs /= action_probs.sum()  # Re-normalize to ensure it's a valid probability distribution

            action = int(np.argmax(action_probs[0]))

            # Newer gym releases return (obs, reward, terminated, truncated,
            # info); older ones return (obs, reward, done, info).
            step_out = env.step(action)
            if len(step_out) == 5:
                next_state, reward, terminated, truncated, _ = step_out
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_out

            state = next_state
            episode_reward += reward

            if done:
                break

        print(f"Episode: {episode}, Reward: {episode_reward}")
finally:
    env.close()

  if not isinstance(terminated, (bool, np.bool8)):


Episode: 0, Reward: 212.0231553219341
Episode: 1, Reward: 147.40671003059174
Episode: 2, Reward: 232.90812552693615
Episode: 3, Reward: 90.22635322517452
Episode: 4, Reward: 197.58630293074646
Episode: 5, Reward: 146.569679007311
Episode: 6, Reward: 192.26214516184638
Episode: 7, Reward: 210.95717712649326
Episode: 8, Reward: 144.8541496096244
Episode: 9, Reward: 139.7599173840508
