#Install and Import

In [1]:
# !pip install gym==0.25.2
# !pip install swig
# !pip install gym[box2d]



In [2]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os

#Changing Dir

In [3]:
# PATH = '/content/drive/MyDrive/Pytorch/rl/MOG_DQN'

  and should_run_async(code)


In [4]:
# os.chdir(PATH)

In [5]:
# !pwd

/content/drive/MyDrive/Pytorch/rl/MOG_DQN


#Env

In [6]:
# Initialize the Lunar Lander environment
env = gym.make('LunarLander-v2')

  deprecation(
  deprecation(


#Creating Model

In [7]:
class GaussianMixtureModel(nn.Module):
    def __init__(self, state_dim, action_dim,num_components,hidden_dim=256):
        super(GaussianMixtureModel, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.num_components = num_components

        self.fc1 = nn.Linear(state_dim,hidden_dim)
        self.fc2 = nn.Linear(hidden_dim,hidden_dim)

        #output action_dim * num_components   (4*5)
        self.mean = nn.Linear(hidden_dim, action_dim * num_components)
        self.log_var = nn.Linear(hidden_dim, action_dim * num_components)
        self.logits = nn.Linear(hidden_dim, action_dim * num_components)

    def forward(self, state):

        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        #mean is reshaped from (1,20) to (1,4,5)and so is log_var and logits
        mean = self.mean(x).view(-1, self.action_dim, self.num_components)
        log_var = self.log_var(x).view(-1, self.action_dim, self.num_components)
        log_var = torch.clamp(log_var, -10, 10)  # Clipping log variance for stability
        logits = self.logits(x).view(-1, self.action_dim, self.num_components)
        return mean, log_var, logits

    def get_distribution(self, state):
        """
        mean represents the expected value of the distribution.
        log_var represents the logarithm of the variance of the distribution.
        Taking the exponential of log_var yields the variance.
        logits represent the unnormalized log probabilities of each action.
        Softmax is applied to logits to convert them into a probability distribution.
        This function encapsulates the logic of computing the distribution parameters
        and is useful for sampling actions from the distribution or
        computing probabilities of actions given states during training or
        inference in reinforcement learning algorithms.
        """
        mean, log_var, logits = self.forward(state)
        return mean, torch.exp(log_var), torch.softmax(logits, dim=-1)


In [8]:
# Experience replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = []
        self.capacity = capacity

    def push(self, experience):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        return [self.buffer[idx] for idx in indices]



#Define Functions

In [9]:
def compute_loss(mean, var, logits, target):
    # a Normal distribution object m is created using the provided mean and var tensors.
    #This distribution will be used to calculate the log probabilities of the target values.

    m = torch.distributions.Normal(mean, var)


    #target.unsqueeze(-1) adds an extra dimension to target to make it compatible for broadcasting with mean.
    #expand_as(mean) expands the target tensor to the same shape as mean.
    #m.log_prob(target) computes the log probability of target under the normal distribution m.
    #The result is a tensor of log probabilities with the same shape as mean.
    log_prob = m.log_prob(target.unsqueeze(-1).expand_as(mean))
    # print(f"log_prob: {log_prob.shape}")

    #torch.sum(log_prob, dim=-2) sums the log probabilities along the second last dimension,
    #aggregating the contributions across different dimensions of mean.
    #torch.log(logits + 1e-10) adds the log of logits to the summed log probabilities.
    # The 1e-10 is a small epsilon value added to avoid taking the logarithm of zero,
    # which would result in numerical instability.
    #The resulting log_prob tensor now incorporates both the log probabilities
    #of the target values under the normal distributions and the log of the class logits.
    log_prob = torch.sum(log_prob, dim=-2) + torch.log(logits + 1e-10)  # Adding epsilon for numerical stability
    # print(f"log_prob: {log_prob.shape}")

    #torch.logsumexp(log_prob, dim=-1) computes the log-sum-exp across the last dimension of log_prob.
    # This operation is numerically stable and combines the probabilities in a way that prevents underflow or overflow.
    #The negative sign indicates that we are converting the log-sum-exp to a negative log likelihood.
    #.mean() computes the average loss over the batch.
    loss = -torch.logsumexp(log_prob, dim=-1).mean()
    # print(f"loss: {loss.shape}")
    return loss

In [10]:
def save_checkpoint(state, filename='checkpoint.pth'):
    torch.save(state, filename)

In [11]:
def load_checkpoint(filename='checkpoint.pth', map_location=None):
    if map_location:
        return torch.load(filename, map_location=map_location)
    return torch.load(filename)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Hyperparameters

In [13]:
# Hyperparameters
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
hidden_dim = 128
num_components = 5
learning_rate = 0.0005
num_episodes = 2000
gamma = 0.99
batch_size = 64
buffer_capacity = 10000
target_update_freq = 5

In [14]:
# Exploration parameters
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995

#Initialize

In [15]:
buffer = ReplayBuffer(buffer_capacity)

In [16]:
# Initialize model and target model
model = GaussianMixtureModel(state_dim, action_dim, num_components,hidden_dim)
target_model = GaussianMixtureModel(state_dim, action_dim, num_components,hidden_dim)

In [17]:
# Move the model to the chosen device
model.to(device)
target_model.to(device)

GaussianMixtureModel(
  (fc1): Linear(in_features=8, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (mean): Linear(in_features=128, out_features=20, bias=True)
  (log_var): Linear(in_features=128, out_features=20, bias=True)
  (logits): Linear(in_features=128, out_features=20, bias=True)
)

In [18]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
checkpoint_path = 'mog_dqn.pth'

  and should_run_async(code)


In [20]:
try:
    map_location = torch.device('cpu') if not torch.cuda.is_available() else None
    checkpoint = load_checkpoint(checkpoint_path, map_location=map_location)
    model.load_state_dict(checkpoint['main_net_state_dict'])
    target_model.load_state_dict(checkpoint['target_net_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epsilon = checkpoint['epsilon']
    start_episode = checkpoint['episode'] + 1
    print(f"Loaded checkpoint from episode {start_episode}")
except FileNotFoundError:
    print("No checkpoint found, starting from scratch.")

No checkpoint found, starting from scratch.


In [21]:
target_model.load_state_dict(model.state_dict())
target_model.eval()

GaussianMixtureModel(
  (fc1): Linear(in_features=8, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (mean): Linear(in_features=128, out_features=20, bias=True)
  (log_var): Linear(in_features=128, out_features=20, bias=True)
  (logits): Linear(in_features=128, out_features=20, bias=True)
)

#Train

In [22]:
episode_rewards = []

In [None]:
# Training loop
epsilon = epsilon_start
for episode in range(num_episodes):
    #reset the environment
    state = env.reset()

    #reset the total reward for the episode
    total_reward = 0

    #reset the timer
    for t in range(1000):
        # Convert state to tensor
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)

        #pass the state_tensor to the model and get mean, var and logits
        #the shape of mean is (1,action_dim,num_components)  (1,4,5)
        #the shape of var is (1,action_dim,num_components)  (1,4,5)
        #the shape of logits is (1,action_dim,num_components)  (1,4,5)
        mean, var, logits = model.get_distribution(state_tensor)

        #the mean(dim=-1) function calculates the mean across the last dimension of the tensor,
        #and softmax(dim=-1) applies the softmax function across the last dimension.
        #The shape of action_probs is (1,4)
        #Each value representating the probability of taking each action
        action_probs = mean.mean(dim=-1).softmax(dim=-1).cpu().detach().numpy()


        # Handle NaN values in action_probs
        if np.isnan(action_probs).any():
            action_probs = np.nan_to_num(action_probs, nan=1.0/action_dim)
            action_probs /= action_probs.sum()  # Re-normalize to ensure it's a valid probability distribution

        # ε-greedy exploration strategy
        if np.random.rand() < epsilon:
            #Take a random action
            action = np.random.choice(action_dim)
        else:
            #Take the action with the highest probability
            action = np.argmax(action_probs[0])

        #next_state, reward, done after taking the action
        next_state, reward, done, _ = env.step(action)

        #Put these inside replay buffer
        buffer.push((state, action, reward, next_state, done))

        #set next_state as current state
        state = next_state

        #adding to total reward
        total_reward += reward

        #checking the buffer size
        if len(buffer.buffer) >= batch_size:
            #Sample a batch of experiences
            batch = buffer.sample(batch_size)
            #Each tuple represents a single transition in the environment and
            #consists of (state, action, reward, next_state, done).
            #Using the zip(*batch) operation allows you to unpack these
            #transitions into separate tuples for states, actions, rewards, next_states, and dones.
            states, actions, rewards, next_states, dones = zip(*batch)

            #converting to tensor
            states_tensor = torch.tensor(states, dtype=torch.float32).to(device)
            actions_tensor = torch.tensor(actions, dtype=torch.long).to(device)
            rewards_tensor = torch.tensor(rewards, dtype=torch.float32).to(device)
            next_states_tensor = torch.tensor(next_states, dtype=torch.float32).to(device)
            dones_tensor = torch.tensor(dones, dtype=torch.float32).to(device)

            #The shape of states_tensor is (batch_size,observation_space) (64,8)
            #mean represents the expected value of the distribution.
            #log_var represents the logarithm of the variance of the distribution.
            #Taking the exponential of log_var yields the variance.
            #logits represent the unnormalized log probabilities of each action.
            #Softmax is applied to logits to convert them into a probability distribution.
            mean, var, logits = model.get_distribution(states_tensor)

            #torch.no_grad() is a context manager in PyTorch that is used to disable gradient calculation.
            #This is useful when you are performing inference (making predictions)
            #and do not need the gradients for backpropagation, which helps save memory and computation.
            with torch.no_grad():
                next_mean, next_var, next_logits = target_model.get_distribution(next_states_tensor)

            target = rewards_tensor + gamma * torch.max(next_mean.mean(dim=-1), dim=1).values * (1 - dones_tensor)


            #Extracts the mean and var as guided by the actions
            mean = mean.gather(1, actions_tensor.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, num_components)).squeeze(1)
            var = var.gather(1, actions_tensor.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, num_components)).squeeze(1)
            logits = logits.gather(1, actions_tensor.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, num_components)).squeeze(1)

            loss = compute_loss(mean, var, logits, target)

            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        if done:
            break

    # Decay epsilon
    epsilon = max(epsilon_end, epsilon_decay * epsilon)

    episode_rewards.append(total_reward)
    # Update the target network
    if episode % target_update_freq == 0:
        target_model.load_state_dict(model.state_dict())

    print(f"Episode {episode}, Total Reward: {total_reward}")


    if episode % 50 == 0:
        save_checkpoint({
            'episode': episode,
            'main_net_state_dict': model.state_dict(),
            'target_net_state_dict': target_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epsilon': epsilon
        }, checkpoint_path)
        print(f"Checkpoint saved at episode {episode}")


    if(sum(episode_rewards[-5:])>1000):
      print(sum(episode_rewards[-5:])>1000)
      print("Training done")
      save_checkpoint({
            'episode': episode,
            'main_net_state_dict': model.state_dict(),
            'target_net_state_dict': target_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'epsilon': epsilon
        }, checkpoint_path)
      print(f"Checkpoint saved at episode {episode}")
      break


  if not isinstance(terminated, (bool, np.bool8)):
  states_tensor = torch.tensor(states, dtype=torch.float32).to(device)


Episode 0, Total Reward: -328.7246920612579
Checkpoint saved at episode 0
Episode 1, Total Reward: -15.642152556351007
Episode 2, Total Reward: -67.96524373670869
Episode 3, Total Reward: -127.91584412776939
Episode 4, Total Reward: -243.46313082528516
Episode 5, Total Reward: -101.97163217847715
Episode 6, Total Reward: -86.51404279548872
Episode 7, Total Reward: -63.26510193873946
Episode 8, Total Reward: -232.60392602898534
Episode 9, Total Reward: -119.47578877260489
Episode 10, Total Reward: -277.9343961736424
Episode 11, Total Reward: -4.237565077275136
Episode 12, Total Reward: -358.0058036030391
Episode 13, Total Reward: -128.2873716616307
Episode 14, Total Reward: -177.98357675876613
Episode 15, Total Reward: -202.6590407152305
Episode 16, Total Reward: -208.3858292636416
Episode 17, Total Reward: -152.24608437087784
Episode 18, Total Reward: -126.57687559199424
Episode 19, Total Reward: -59.27290176340579
Episode 20, Total Reward: -89.33296543204973
Episode 21, Total Reward: 

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(episode_rewards)
plt.show()