# Use this file to load the trained actor-critic (or train a new one) and to create or enlarge a dataset of successful movements

In [9]:
import gymnasium as gym
import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random
import math
import matplotlib.pyplot as plt
import imageio
# Set random seeds for reproducibility
# SEED is also the value the helpers in this file pass to env.reset(seed=...).
SEED = 42
th.manual_seed(SEED)  # PyTorch RNG
np.random.seed(SEED)  # NumPy global RNG (drawn from by OUNoise.sample via np.random.randn)
random.seed(SEED)  # stdlib RNG (drawn from by ReplayBuffer.sample via random.sample)

# Ornstein-Uhlenbeck Noise for exploration
class OUNoise:
    """Ornstein-Uhlenbeck process producing temporally correlated exploration noise.

    Each call to ``sample`` nudges the internal state back towards ``mu`` by a
    factor ``theta`` and adds Gaussian noise scaled by ``sigma``.

    Args:
        action_dim (int): Number of noise components (one per action dimension).
        mu (float): Value the process reverts to. Defaults to 0.0.
        theta (float): Mean-reversion rate. Defaults to 0.15.
        sigma (float): Scale of the Gaussian perturbation. Defaults to 0.2.
    """

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        """Return the process to its mean (called at the start of every episode)."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new noise vector.

        Returns:
            np.ndarray: Array of shape ``(action_dim,)``. A copy is returned so
            that later calls (which update the state in place) cannot change a
            value the caller is still holding, and the caller cannot corrupt
            the process by mutating the result.
        """
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state += dx
        return self.state.copy()

# Custom Reward Wrapper
class HoverLunarLander(gym.Wrapper):
    """Reward wrapper that replaces the LunarLander reward with a "hover near a target" reward.

    Every step pays ``1 / (distance_to_target + epsilon)``. When the episode
    terminates, a penalty is added depending on why it ended, and the reason is
    recorded in ``info['termination_cause']``.

    Args:
        env (gym.Env): Environment to wrap; observations must expose x at
            index 0 and y at index 1.
        target_location (tuple): ``(x, y)`` point the lander should stay close to.
        epsilon (float): Added to the distance so the reward stays finite at the target.
        penalty_landing (float): Added to the reward when the episode ends in a landing.
        penalty_crashing (float): Added to the reward when the episode ends in a crash.
        penalty_offscreen (float): Added to the reward when the lander leaves the viewport.
    """

    def __init__(self, env, target_location=(0.1, 0.25), epsilon=1e-3,
                 penalty_landing=-50.0, penalty_crashing=-100.0, penalty_offscreen=-100.0):
        super().__init__(env)
        self.target_x = target_location[0]
        self.target_y = target_location[1]
        self.epsilon = epsilon
        self.penalty_landing = penalty_landing
        self.penalty_crashing = penalty_crashing
        self.penalty_offscreen = penalty_offscreen

    def step(self, action):
        """Step the wrapped environment and substitute the hover reward.

        Returns:
            tuple: ``(state, reward, terminated, truncated, info)`` where
            ``reward`` is the hover reward (the wrapped environment's own reward
            is discarded) and ``info['termination_cause']`` is set to one of
            ``'offscreen'``, ``'landed'``, ``'crashed'`` or ``'time_limit'``
            on the final step.
        """
        state, original_reward, terminated, truncated, info = self.env.step(action)

        # Position is the first two observation components.
        x, y = state[0], state[1]

        # Reward grows as the lander approaches the target; epsilon caps it.
        distance = math.sqrt((x - self.target_x) ** 2 + (y - self.target_y) ** 2)
        new_reward = 1.0 / (distance + self.epsilon)

        if terminated:
            # LunarLander reports leaving the viewport as a *termination*
            # (|x| >= 1.0), not a truncation, so it has to be detected here.
            if abs(x) >= 1.0:
                new_reward += self.penalty_offscreen
                info['termination_cause'] = 'offscreen'
            elif self.is_landed(state):
                new_reward += self.penalty_landing
                info['termination_cause'] = 'landed'
            else:
                new_reward += self.penalty_crashing
                info['termination_cause'] = 'crashed'
        elif truncated:
            # Truncation means the time limit was reached while still flying,
            # which is the desired outcome for a hover task: no penalty.
            info['termination_cause'] = 'time_limit'

        return state, new_reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments."""
        return self.env.reset(**kwargs)

    def is_landed(self, state):
        """Return True when ``state`` looks like a controlled landing.

        Criteria: both leg-contact flags (state[6], state[7]) equal 1, the
        x/y velocities (state[2], state[3]) are below 0.5 in magnitude and the
        angle (state[4]) is within 0.1 of upright.
        """
        leg_contact = state[6] == 1 and state[7] == 1
        vertical_velocity = abs(state[3]) < 0.5  # state[3] is y-velocity
        horizontal_velocity = abs(state[2]) < 0.5  # state[2] is x-velocity
        angle = abs(state[4]) < 0.1  # state[4] is angle

        return bool(leg_contact and vertical_velocity and horizontal_velocity and angle)

# Actor Network with Layer Normalization
class Actor(nn.Module):
    """Deterministic policy network: state -> action in [-max_action, max_action].

    Three hidden blocks (Linear -> LayerNorm -> ReLU) of width 400, 300 and 200
    feed a final Linear layer whose tanh output is scaled by ``max_action``.
    Submodule names (layer1..layer4, ln1..ln3) are kept stable because saved
    checkpoints are keyed by them.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        self.max_action = max_action

        # Build the hidden blocks in order; registration order fixes both the
        # state_dict layout and the order in which weights are initialised.
        fan_in = state_dim
        for index, width in enumerate((400, 300, 200), start=1):
            setattr(self, f"layer{index}", nn.Linear(fan_in, width))
            setattr(self, f"ln{index}", nn.LayerNorm(width))
            fan_in = width
        self.layer4 = nn.Linear(fan_in, action_dim)

        self.activation = nn.ReLU()
        self.output_activation = nn.Tanh()

    def forward(self, state):
        """Map a batch of states to a batch of bounded actions."""
        hidden = state
        blocks = (
            (self.layer1, self.ln1),
            (self.layer2, self.ln2),
            (self.layer3, self.ln3),
        )
        for linear, norm in blocks:
            hidden = self.activation(norm(linear(hidden)))
        squashed = self.output_activation(self.layer4(hidden))
        return squashed * self.max_action

# Critic Network with Layer Normalization
class Critic(nn.Module):
    """Action-value network: (state, action) -> scalar Q estimate.

    The state and action are concatenated along the feature axis and passed
    through three Linear -> LayerNorm -> ReLU blocks (400, 300, 200 units)
    followed by a single-output Linear layer with no activation.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()

        # Same construction order as the attribute names imply, so the
        # state_dict layout and weight initialisation order stay fixed.
        fan_in = state_dim + action_dim
        for index, width in enumerate((400, 300, 200), start=1):
            setattr(self, f"layer{index}", nn.Linear(fan_in, width))
            setattr(self, f"ln{index}", nn.LayerNorm(width))
            fan_in = width
        self.layer4 = nn.Linear(fan_in, 1)

        self.activation = nn.ReLU()

    def forward(self, state, action):
        """Return Q(state, action) with shape (batch, 1)."""
        hidden = th.cat([state, action], dim=1)
        for linear, norm in (
            (self.layer1, self.ln1),
            (self.layer2, self.ln2),
            (self.layer3, self.ln3),
        ):
            hidden = self.activation(norm(linear(hidden)))
        return self.layer4(hidden)

# Replay Buffer
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions.

    Once ``max_size`` transitions are held, adding another drops the oldest.
    """

    def __init__(self, max_size=1000000):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            tuple: Five NumPy arrays (states, actions, rewards, next_states,
            dones), each stacked along a new leading batch axis.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        states, actions, rewards, next_states, dones = (
            np.stack(column) for column in zip(*picked)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

# DDPG Agent with Flexible Initialization
class DDPGAgent:
    """DDPG learner holding an actor, a critic and a slowly-tracking target copy of each."""

    def __init__(self, state_dim, action_dim, max_action, device, actor=None, critic=None):
        """
        Builds (or adopts) the networks, their target copies and their optimizers.

        Args:
            state_dim (int): Dimension of the state space.
            action_dim (int): Dimension of the action space.
            max_action (float): Maximum action value.
            device (torch.device): Device to run the networks on.
            actor (nn.Module, optional): Preloaded Actor network. Defaults to None.
            critic (nn.Module, optional): Preloaded Critic network. Defaults to None.
        """
        self.device = device
        self.max_action = max_action

        # Fixed hyperparameters
        self.gamma = 0.99       # discount applied to the bootstrapped target
        self.tau = 0.005        # blend factor used by soft_update
        self.batch_size = 64    # transitions per gradient step
        self.grad_clip = 1.0    # max gradient norm for both networks

        # ----- Actor, actor target, actor optimizer -----
        actor_was_given = actor is not None
        if actor_was_given:
            self.actor = actor.to(device)
            print("Preloaded Actor network loaded.")
        else:
            self.actor = Actor(state_dim, action_dim, max_action).to(device)
            print("New Actor network initialized.")

        # The target always starts as an exact copy of the online actor;
        # only the log message depends on where the actor came from.
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        if actor_was_given:
            print("Actor target network initialized with preloaded Actor weights.")
        else:
            print("Actor target network initialized with Actor weights.")

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)

        # ----- Critic, critic target, critic optimizer -----
        critic_was_given = critic is not None
        if critic_was_given:
            self.critic = critic.to(device)
            print("Preloaded Critic network loaded.")
        else:
            self.critic = Critic(state_dim, action_dim).to(device)
            print("New Critic network initialized.")

        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        if critic_was_given:
            print("Critic target network initialized with preloaded Critic weights.")
        else:
            print("Critic target network initialized with Critic weights.")

        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

    def select_action(self, state):
        """Return the actor's action for a single NumPy state as a flat NumPy array (no noise added)."""
        observation = th.FloatTensor(state.reshape(1, -1)).to(self.device)
        self.actor.eval()
        with th.no_grad():
            policy_output = self.actor(observation)
        # The actor is switched back to train mode unconditionally.
        self.actor.train()
        return policy_output.cpu().numpy().flatten()

    def train(self, replay_buffer):
        """Run one critic update, one actor update and one soft target update.

        Returns:
            tuple: ``(actor_loss, critic_loss)`` as Python floats, or
            ``(None, None)`` when the buffer holds fewer than ``batch_size``
            transitions and no update was performed.
        """
        if len(replay_buffer) < self.batch_size:
            return None, None

        def as_float_tensor(array):
            return th.FloatTensor(array).to(self.device)

        # Draw a minibatch and move it to the training device.
        states, actions, rewards, next_states, dones = replay_buffer.sample(self.batch_size)
        states = as_float_tensor(states)
        actions = as_float_tensor(actions)
        rewards = as_float_tensor(rewards).reshape(-1, 1)
        next_states = as_float_tensor(next_states)
        dones = as_float_tensor(dones).reshape(-1, 1)

        # Bootstrapped target from the target networks; (1 - done) removes the
        # bootstrap term for transitions flagged as terminal.
        with th.no_grad():
            next_actions = self.actor_target(next_states)
            bootstrap_q = self.critic_target(next_states, next_actions)
            q_target = rewards + (1 - dones) * self.gamma * bootstrap_q

        # ----- Critic step: regress Q(s, a) onto the target -----
        q_estimate = self.critic(states, actions)
        loss_fn = nn.MSELoss()
        critic_loss = loss_fn(q_estimate, q_target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip)
        self.critic_optimizer.step()

        # ----- Actor step: ascend the critic's value of the actor's own actions -----
        proposed_actions = self.actor(states)
        actor_loss = -self.critic(states, proposed_actions).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_clip)
        self.actor_optimizer.step()

        # ----- Let the targets drift towards the online networks -----
        self.soft_update(self.critic, self.critic_target)
        self.soft_update(self.actor, self.actor_target)

        return actor_loss.item(), critic_loss.item()

    def soft_update(self, source, target):
        """Blend ``source`` parameters into ``target``: target <- tau * source + (1 - tau) * target."""
        for online_param, lagged_param in zip(source.parameters(), target.parameters()):
            blended = self.tau * online_param.data + (1 - self.tau) * lagged_param.data
            lagged_param.data.copy_(blended)

def train_ddpg(num_episodes=1000, actor=None, critic=None, device=None, 
              state_dim=None, action_dim=None, max_action=None):
    """
    Trains a DDPG agent on the HoverLunarLander environment.
    
    Args:
        num_episodes (int): Number of training episodes.
        actor (nn.Module, optional): Preloaded Actor network. Defaults to None.
        critic (nn.Module, optional): Preloaded Critic network. Defaults to None.
        device (torch.device, optional): Device to run the networks on. If None, defaults to CUDA if available.
        state_dim (int, optional): Dimension of the state space. Read from the environment if any of the
            three dimension arguments is None.
        action_dim (int, optional): Dimension of the action space. Read from the environment if any of the
            three dimension arguments is None.
        max_action (float, optional): Maximum action value. Read from the environment if any of the
            three dimension arguments is None.
    
    Returns:
        actor (nn.Module): Trained Actor network.
        critic (nn.Module): Trained Critic network.
        training_logs (dict): 'episode_rewards', 'actor_losses' and 'critic_losses' hold one entry per
            episode (losses are averaged over the gradient updates of that episode, 0 if none ran);
            'avg_rewards' holds one entry per 10 episodes.
    """
    max_steps = 1000  # per-episode step cap
    
    # Initialize device
    if device is None:
        device = th.device("cuda" if th.cuda.is_available() else "cpu")
    
    # Initialize environment with custom reward
    env = HoverLunarLander(gym.make("LunarLanderContinuous-v2", render_mode=None))
    
    # try/finally so the environment is released even if training raises.
    try:
        # Read the dimensions from the training environment itself when the
        # caller did not supply all of them (no throwaway environment needed).
        if state_dim is None or action_dim is None or max_action is None:
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.shape[0]
            max_action = float(env.action_space.high[0])
        
        # Initialize agent
        agent = DDPGAgent(state_dim, action_dim, max_action, device, actor=actor, critic=critic)
        replay_buffer = ReplayBuffer()
        
        # Initialize exploration noise
        exploration_noise = OUNoise(action_dim)
        exploration_noise.reset()
        
        # Initialize training logs
        training_logs = {
            'episode_rewards': [],
            'avg_rewards': [],
            'actor_losses': [],
            'critic_losses': []
        }
        
        for episode in range(1, num_episodes + 1):
            # Seed only the first reset. Re-seeding on every reset restarts the
            # environment's RNG, so every episode would begin from the exact
            # same initial state and the agent would never see any variation.
            if episode == 1:
                state, _ = env.reset(seed=SEED)
            else:
                state, _ = env.reset()
            exploration_noise.reset()
            episode_reward = 0
            actor_loss_sum = 0.0
            critic_loss_sum = 0.0
            num_updates = 0
            
            for _ in range(max_steps):
                # Select action and add exploration noise
                action = agent.select_action(state)
                action = action + exploration_noise.sample()
                action = np.clip(action, -agent.max_action, agent.max_action)
                
                # Take action in environment
                next_state, reward, terminated, truncated, _ = env.step(action)
                
                # Store only true termination as the terminal flag: a
                # time-limit truncation does not mean the future return is
                # zero, so the critic target must still bootstrap through it.
                replay_buffer.add(state, action, reward, next_state, float(terminated))
                
                state = next_state
                episode_reward += reward
                
                # Train agent (returns (None, None) until the buffer holds a full batch)
                actor_loss, critic_loss = agent.train(replay_buffer)
                if actor_loss is not None and critic_loss is not None:
                    actor_loss_sum += actor_loss
                    critic_loss_sum += critic_loss
                    num_updates += 1
                
                if terminated or truncated:
                    break
            
            training_logs['episode_rewards'].append(episode_reward)
            # Average over the updates that actually ran, not over all steps.
            if num_updates:
                training_logs['actor_losses'].append(actor_loss_sum / num_updates)
                training_logs['critic_losses'].append(critic_loss_sum / num_updates)
            else:
                training_logs['actor_losses'].append(0)
                training_logs['critic_losses'].append(0)
            
            # Logging every 10 episodes
            if episode % 10 == 0:
                avg_reward = np.mean(training_logs['episode_rewards'][-10:])
                avg_actor_loss = np.mean(training_logs['actor_losses'][-10:])
                avg_critic_loss = np.mean(training_logs['critic_losses'][-10:])
                training_logs['avg_rewards'].append(avg_reward)
                print(f"Episode {episode}\tAverage Reward: {avg_reward:.2f}\tAvg Actor Loss: {avg_actor_loss:.4f}\tAvg Critic Loss: {avg_critic_loss:.4f}")
    finally:
        env.close()
    
    return agent.actor, agent.critic, training_logs

def render_episode(actor, state_dim, action_dim, max_action, device=None, render_delay=0.02):
    """
    Renders a single episode of the agent interacting with the HoverLunarLander environment.
    
    The episode runs without exploration noise until the environment ends it or
    1000 steps have been taken, whichever comes first.
    
    Args:
        actor (nn.Module): Trained Actor network.
        state_dim (int): Dimension of the state space.
        action_dim (int): Dimension of the action space.
        max_action (float): Maximum action value.
        device (torch.device, optional): Device to run the network on. If None, defaults to CUDA if available.
        render_delay (float, optional): Currently unused; kept so existing calls keep working.
            Defaults to 0.02.
    
    Returns:
        None
    """
    # Initialize device
    if device is None:
        device = th.device("cuda" if th.cuda.is_available() else "cpu")
    
    # Initialize environment with custom reward
    env = HoverLunarLander(gym.make("LunarLanderContinuous-v2", render_mode="human"))
    
    # try/finally so the render window is released even if the rollout raises.
    try:
        # The agent is only used for select_action; its critic is a fresh,
        # untrained network that never influences the rollout.
        agent = DDPGAgent(state_dim, action_dim, max_action, device, actor=actor, critic=None)
        
        # Initialize environment and get initial state
        state, _ = env.reset(seed=SEED)
        done = False
        step_count = 0
        
        while not done and step_count < 1000:
            action = agent.select_action(state)
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            env.render()
            step_count += 1
    finally:
        env.close()


In [2]:
# #train model
# actor, critic, training_logs = train_ddpg(num_episodes=1000)

In [10]:
# Probe the environment once to size the actor network.
# temp_env is intentionally left open here; later cells may still reference it.
temp_env = gym.make("LunarLanderContinuous-v2")
state_dim = temp_env.observation_space.shape[0]
action_dim = temp_env.action_space.shape[0]
max_action = float(temp_env.action_space.high[0])
actor = Actor(state_dim, action_dim, max_action)
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
# machine without CUDA; the actor can be moved to another device afterwards.
actor.load_state_dict(th.load('actor_custom_reward.pth', map_location="cpu"))

<All keys matched successfully>

In [4]:
# #save actor critic, custom reward name

# th.save(actor.state_dict(), 'actor_custom_reward.pth')
# th.save(critic.state_dict(), 'critic_custom_reward.pth')

# #plotting
# plt.figure(figsize=(12, 8))
# plt.plot(training_logs['episode_rewards'], label='Episode Reward')
# plt.plot(training_logs['avg_rewards'], label='Average Reward (10 episodes)')
# plt.xlabel('Episode')
# plt.ylabel('Reward')
# plt.title('DDPG Training on HoverLunarLander')
# plt.legend()
# plt.grid(True)
# plt.show()



In [5]:
#render episode
#render_episode(actor, state_dim=8, action_dim=2, max_action=1.0, device=None, render_delay=0.02)

<H1>Get Data With Trained Actor</H1>

In [6]:
import torch

# The most important part of this file for creating or enlarging a dataset

In [16]:
env = HoverLunarLander(gym.make("LunarLanderContinuous-v2", render_mode=None))
actor.eval()

N = 1000  # number of episodes to record
max_steps = 200  # per-episode step cap

# Run inference on whatever device the actor currently lives on.
actor_device = next(actor.parameters()).device

# Dataset container: one tensor per episode under each key
dataset = {"states": [], "actions": [], "rewards": [], "next_states": []}

# try/finally so the environment is released even if a rollout raises.
try:
    # Generate N episodes
    for episode in range(N):
        state, _ = env.reset()  # Reset the environment
        episode_states, episode_actions, episode_rewards, episode_next_states = [], [], [], []
        done = False
        counter = 0
        print(f"Generating episode {episode + 1}...")
        while not done and counter < max_steps:
            # Convert state to a PyTorch tensor with a leading batch axis
            state_tensor = torch.tensor(state, dtype=torch.float32, device=actor_device).unsqueeze(0)

            # Get action from the actor; no gradients are needed for data collection
            with torch.no_grad():
                action = actor(state_tensor).cpu().numpy().squeeze(0)

            # Step the wrapped `env` (not the bare probe environment) so the
            # recorded rewards are the hover rewards, and stop on either
            # termination or truncation.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Append step data
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)
            episode_next_states.append(next_state)

            # Move to next state
            state = next_state
            counter += 1

        # Store the episode data. Going through one contiguous NumPy array
        # avoids the slow element-by-element conversion of a list of arrays.
        dataset["states"].append(torch.tensor(np.asarray(episode_states), dtype=torch.float32))
        dataset["actions"].append(torch.tensor(np.asarray(episode_actions), dtype=torch.float32))
        dataset["rewards"].append(torch.tensor(np.asarray(episode_rewards), dtype=torch.float32))
        dataset["next_states"].append(torch.tensor(np.asarray(episode_next_states), dtype=torch.float32))
finally:
    env.close()

# Save the dataset as PyTorch tensors
torch.save(dataset, "lunarlander_custom_reward_trained_dataset.pt")

Generating episode 1...
Generating episode 2...
Generating episode 3...
Generating episode 4...
Generating episode 5...
Generating episode 6...
Generating episode 7...
Generating episode 8...
Generating episode 9...
Generating episode 10...
Generating episode 11...
Generating episode 12...
Generating episode 13...
Generating episode 14...
Generating episode 15...
Generating episode 16...
Generating episode 17...
Generating episode 18...
Generating episode 19...
Generating episode 20...
Generating episode 21...
Generating episode 22...
Generating episode 23...
Generating episode 24...
Generating episode 25...
Generating episode 26...
Generating episode 27...
Generating episode 28...
Generating episode 29...
Generating episode 30...
Generating episode 31...
Generating episode 32...
Generating episode 33...
Generating episode 34...
Generating episode 35...
Generating episode 36...
Generating episode 37...
Generating episode 38...
Generating episode 39...
Generating episode 40...
Generatin

<H1>Render Gif Below</H1>

In [11]:
import imageio
import shutil

In [15]:
def render_episode_gif(actor, state_dim, action_dim, max_action, device=None, render_delay=0.02):
    """
    Renders a single episode of the agent interacting with the HoverLunarLander environment and saves it as a GIF.
    
    The episode runs without exploration noise for at most 250 steps; one frame
    is captured after every step and the result is written to
    'rendered_episode.gif' in the current working directory.
    
    Args:
        actor (nn.Module): Trained Actor network.
        state_dim (int): Dimension of the state space.
        action_dim (int): Dimension of the action space.
        max_action (float): Maximum action value.
        device (torch.device, optional): Device to run the network on. If None, defaults to CUDA if available.
        render_delay (float, optional): Currently unused; the GIF playback speed is set by the
            fps passed to imageio. Defaults to 0.02.
    
    Returns:
        None
    """
    # Initialize device
    if device is None:
        device = th.device("cuda" if th.cuda.is_available() else "cpu")
    
    # Initialize environment with custom reward and render_mode='rgb_array'
    env = HoverLunarLander(gym.make("LunarLanderContinuous-v2", render_mode='rgb_array'))
    
    images = []  # To store frames
    
    # try/finally so the environment is released even if the rollout raises.
    try:
        # The agent is only used for select_action; its critic is a fresh,
        # untrained network that never influences the rollout.
        agent = DDPGAgent(state_dim, action_dim, max_action, device, actor=actor, critic=None)
        
        # Initialize environment and get initial state
        state, _ = env.reset()
        done = False
        step_count = 0
        
        while not done and step_count < 250:
            action = agent.select_action(state)
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            # With render_mode='rgb_array', render() returns the frame as an array.
            frame = env.render()
            images.append(frame)
            step_count += 1
    finally:
        env.close()
    
    # Save images as gif, loop=0 means infinite loop
    gif_path = 'rendered_episode.gif'
    imageio.mimsave(gif_path, images, fps=30, loop=0)
    print(f"Rendered GIF saved as {gif_path}")
# Render one episode with the already-loaded actor and save it as a GIF.
# NOTE(review): the dimensions are hard-coded here; presumably they match the values
# read from the environment when the actor was built (8 observations, 2 actions) — confirm.

render_episode_gif(actor, state_dim=8, action_dim=2, max_action=1.0, device=None, render_delay=0.02)

Preloaded Actor network loaded.
Actor target network initialized with preloaded Actor weights.
New Critic network initialized.
Critic target network initialized with Critic weights.
Rendered GIF saved as rendered_episode.gif
