<a href="https://colab.research.google.com/github/dgsob/MT7051-VT25/blob/main/Group%20project/Project_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Environment

### MuJoCo engine set up

In [188]:
# Install mujoco
!pip install mujoco

# Set up GPU rendering.
from google.colab import files
import distutils.util
import os
import subprocess

# Check if GPU is available
if subprocess.run('nvidia-smi').returncode:
  raise RuntimeError(
      'Cannot communicate with GPU. '
      'Make sure you are using a GPU Colab runtime. '
      'Go to the Runtime menu and select Choose runtime type.')

# Add an ICD config for Nvidia EGL driver
NVIDIA_ICD_CONFIG_PATH = '/usr/share/glvnd/egl_vendor.d/10_nvidia.json'
if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
  with open(NVIDIA_ICD_CONFIG_PATH, 'w') as f:
    f.write("""{
    "file_format_version" : "1.0.0",
    "ICD" : {
        "library_path" : "libEGL_nvidia.so.0"
    }
}
""")

# Configure MuJoCo to use the EGL rendering backend (requires GPU)
print('Setting environment variable to use GPU rendering:')
%env MUJOCO_GL=egl

# Check if installation was successful
try:
  print('Checking that the installation succeeded:')
  import mujoco
  mujoco.MjModel.from_xml_string('<mujoco/>')
except Exception as e:
  raise e from RuntimeError(
      'Something went wrong during installation. Check the shell output above '
      'for more information.\n'
      'If using a hosted Colab runtime, make sure you enable GPU acceleration '
      'by going to the Runtime menu and selecting "Choose runtime type".')

print('Installation successful.')

Setting environment variable to use GPU rendering:
env: MUJOCO_GL=egl
Checking that the installation succeeded:
Installation successful.


### Imports

In [189]:
# Gym
!pip install gymnasium
!pip install gymnasium-robotics
import gymnasium as gym
import gymnasium_robotics as robotics

# Rendering
!apt-get install -y xvfb
!pip install imageio ffmpeg pyvirtualdisplay
import imageio
from IPython.display import display
from PIL import Image, ImageDraw, ImageFont

# Train
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import copy
from collections import namedtuple

# Save the agent
import os
import pickle

E: dpkg was interrupted, you must manually run 'dpkg --configure -a' to correct the problem. 


### Define the environment

In [190]:
REACH = "FetchReach-v3" # easiest one, arm reaches to a point
PICK_AND_PLACE = "FetchPickAndPlace-v3" # hardest one, arm reaches for the block, grabs it, reaches to a point

In [191]:
def create_env(env_name="FetchReach-v3"):
    """Creates and returns a Gymnasium environment.

    Args:
        env_name (string, optional): The name of the environment to create.
            Defaults to: "FetchReach-v3"
            Valid options are:
                - "FetchReach-v3"
                - "FetchPickAndPlace-v3"

    Returns:
        gym.Env: The created Gymnasium environment.

    Prints:
        Observation space: The observation space of the environment.
        Action space: The action space of the environment.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    print("Observation space:", env.observation_space)
    print("Action space:", env.action_space)
    return env

### Run a random agent to test the env (commented out)

In [192]:
# def run_random_actions(env, steps):
#     """Run a random agent in the environment for a given number of steps."""
#     frames = []
#     observation, info = env.reset()

#     for _ in range(steps):
#         action = env.action_space.sample()  # Sample a random action
#         obs, reward, terminated, truncated, info = env.step(action)

#         # Capture frame
#         frames.append(env.render())

#         # Stop if episode ends
#         if terminated or truncated:
#             break

#     env.close()
#     return frames

In [193]:
# # Run everything
# env = create_env()
# frames = run_random_actions(env, steps=1000)
# imageio.mimsave("random_actions_fetch.gif", frames, fps=30)

# Training

### Device configuration (cuda)

In [194]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Neural Networks

In [195]:
# Configuration from the fetch paper
# Define Actor network
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        self.max_action = max_action

        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 256)
        self.out = nn.Linear(256, action_dim)

        self.out.weight.data.uniform_(-3e-3, 3e-3)
        self.out.bias.data.uniform_(-3e-3, 3e-3)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = torch.tanh(self.out(x)) * self.max_action
        return x

# Define Critic network
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(state_dim, 256)
        self.fc2 = nn.Linear(256 + action_dim, 256)
        self.fc3 = nn.Linear(256, 256)
        self.out = nn.Linear(256, 1)

        self.out.weight.data.uniform_(-3e-4, 3e-4)
        self.out.bias.data.uniform_(-3e-4, 3e-4)

    def forward(self, state, action):
        xs = F.relu(self.fc1(state))
        x = torch.cat([xs, action], dim=1)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.out(x)
        return x

### Ornstein-Uhlenbeck noise process for temporally correlated exploration

In [196]:
class OrnsteinUhlenbeckNoise:
    """
    Ornstein-Uhlenbeck process for temporally correlated exploration.

    Parameters:
    - action_dimension: Dimension of the action space
    - mu: Mean of the noise
    - theta: How quickly the process reverts to the mean (larger -> faster)
    - sigma: Scale of the noise
    """
    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.ones(self.action_dimension) * self.mu

    def sample(self):
        """Generate a sample of noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dimension)
        self.state = x + dx
        return self.state

### Implementation: *HER*

In [197]:
# Replay buffer with HER
class ReplayBuffer:
    def __init__(self, max_size=1000000):
        self.buffer = []
        self.max_size = max_size # from fetch paper
        self.ptr = 0

        # For storing entire episodes for HER
        self.episode_buffer = []

    def add(self, state, action, next_state, reward, done, goal):
        if len(self.buffer) < self.max_size:
            self.buffer.append(None)

        self.buffer[self.ptr] = (state, action, next_state, reward, done, goal)
        self.ptr = (self.ptr + 1) % self.max_size

    def add_episode_step(self, observation, action, next_observation, reward, done):
        """Store a step in the episode buffer with full observation dictionary"""
        self.episode_buffer.append((observation, action, next_observation, reward, done))

    def process_episode_with_her(self, env, k=4):
        """Process episode with HER, using the Gym-robotics style API"""
        episode_len = len(self.episode_buffer)

        # First, add the original experience to the replay buffer
        for i in range(episode_len):
            obs_dict, action, next_obs_dict, reward, done = self.episode_buffer[i]

            # Original experience with intended goal
            self.store_transition(obs_dict, action, next_obs_dict, reward, done)

            # HER: For k random future states - this is a future strategy implementation from the HER paper
            if k > 0:
                future_indices = np.random.randint(i, episode_len, size=min(k, episode_len-i))

                for future_idx in future_indices:
                    future_obs_dict = self.episode_buffer[future_idx][2]  # Get the next observation at future step
                    achieved_goal = future_obs_dict["achieved_goal"]

                    # Use the future achieved goal as the desired goal
                    her_reward = env.unwrapped.compute_reward(
                        next_obs_dict["achieved_goal"],
                        achieved_goal,
                        info={}
                    )

                    # Store the transition with modified goal
                    self.store_transition(
                        obs_dict,
                        action,
                        next_obs_dict,
                        her_reward,
                        done,
                        new_goal=achieved_goal
                    )

        # Clear episode buffer for next episode
        self.episode_buffer = []

    def store_transition(self, obs_dict, action, next_obs_dict, reward, done, new_goal=None):
        """Store a processed transition in the replay buffer"""
        # If new_goal is provided, use it instead of the original desired_goal
        goal = new_goal if new_goal is not None else obs_dict["desired_goal"]

        # Create combined state vector
        state = np.concatenate([obs_dict["observation"], goal])
        next_state = np.concatenate([next_obs_dict["observation"], goal])

        # Add to buffer
        self.add(state, action, next_state, reward, done, goal)

    def sample(self, batch_size):
        ind = np.random.randint(0, len(self.buffer), size=batch_size)

        states, actions, next_states, rewards, dones, goals = [], [], [], [], [], []

        for i in ind:
            s, a, s_, r, d, g = self.buffer[i]
            states.append(np.asarray(s))
            actions.append(np.asarray(a))
            next_states.append(np.asarray(s_))
            rewards.append(np.asarray(r))
            dones.append(np.asarray(d))
            goals.append(np.asarray(g))

        return (
            torch.FloatTensor(np.array(states)).to(device),
            torch.FloatTensor(np.array(actions)).to(device),
            torch.FloatTensor(np.array(next_states)).to(device),
            torch.FloatTensor(np.array(rewards).reshape(-1, 1)).to(device),
            torch.FloatTensor(np.array(dones).reshape(-1, 1)).to(device),
            torch.FloatTensor(np.array(goals)).to(device)
        )

    def __len__(self):
        return len(self.buffer)

### Implementation: *DDPG*

In [198]:
# DDPG with HER agent
class DDPG_HER:
    def __init__(self, state_dim, action_dim, goal_dim, max_action):
        # Our input to the actor and critic is state+goal, so we need to account for that in the dimensions
        input_dim = state_dim + goal_dim

        # The networks
        self.actor = Actor(input_dim, action_dim, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=0.001)

        self.critic = Critic(input_dim, action_dim).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.max_action = max_action # continuous action-values -> this is 1.0 for Fetch
        self.replay_buffer = ReplayBuffer()

        self.batch_size = 256 # from the fetch paper
        self.gamma = 0.98 # arbitrary, I don't see it in the paper
        self.tau = 0.95 # from the fetch paper - surprisingly very high

    def select_action(self, observation, noise, noise_type="Gaussian"):
        """Select action based on observation dictionary"""
        # Create input by concatenating state and goal
        state = observation["observation"]
        goal = observation["desired_goal"]
        state_goal = np.concatenate([state, goal])
        state_goal = torch.FloatTensor(state_goal.reshape(1, -1)).to(device)

        with torch.no_grad():
            action = self.actor(state_goal).cpu().data.numpy().flatten()

        # Add noise for exploration
        if noise > 0 and noise_type == "OU":
          # We check if the subprocess is not already running
          if not hasattr(self, 'noise_process'):
            self.noise_process = OrnsteinUhlenbeckNoise(
                action_dimension=len(action),
                mu=0,
                theta=0.15, # from the continuous control paper
                sigma=0.2   # from the continuous control paper
            )
            action = action + noise * self.noise_process.sample()

        if noise > 0 and noise_type == "Gaussian":
            action = action + np.random.normal(0, noise, size=action.shape)

        return np.clip(action, -self.max_action, self.max_action)

    def reset_noise(self):
        if hasattr(self, 'noise_process'):
            self.noise_process.reset()

    def train(self):
        if len(self.replay_buffer) < self.batch_size:
            return

        # Sample from replay buffer
        state, action, next_state, reward, done, goal = self.replay_buffer.sample(self.batch_size)

        # Compute target Q value
        with torch.no_grad():
            next_action = self.actor_target(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimate
        current_Q = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(state, self.actor(state)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

### Train Function

In [199]:
def train_ddpg_with_her(env, debug = True):
    # Set seeds
    seed = 5
    env.action_space.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env_name = env.unwrapped.spec.id

    # Create agent
    obs_dict = env.reset()[0]
    state_dim = obs_dict["observation"].shape[0]  # 10
    action_dim = env.action_space.shape[0]  # 4
    goal_dim = obs_dict["desired_goal"].shape[0]  # 3
    max_action = float(env.action_space.high[0])  # 1.0

    agent = DDPG_HER(state_dim, action_dim, goal_dim, max_action)

    # Training parameters - increased for better learning
    n_epochs = 30 # from the fetch paper results we should converge by that time
    n_cycles = 50 # from the fetch paper
    # In the paper they trained on cpu 19 cores, I don't really understand their calculations so these two are arbitrary
    n_episodes = 16
    n_timesteps = 100

    if debug:
      n_epochs = 2
      n_cycles = 2
      n_episodes = 2
      n_timesteps = 2

    # For evaluation
    success_history = []
    # and a typical learning curve
    reward_history = []

    # Start training
    print("Starting training...")

    for epoch in range(n_epochs):
        successes = 0
        epoch_rewards = []  # track rewards for all episodes in this epoch

        for cycle in range(n_cycles):
            for episode in range(n_episodes):
                # Reset environment
                obs_dict, _ = env.reset()

                # Reset OU exploration noise
                agent.reset_noise()

                # Initialize tracking
                episode_success = False
                episode_reward = 0

                for t in range(n_timesteps):
                    # Select action with noise for exploration
                    action = agent.select_action(obs_dict, noise=0.2) # TODO: gradually decay the noise

                    # Execute action
                    next_obs_dict, reward, terminated, truncated, info = env.step(action)
                    done = terminated or truncated

                    # Accumulate rewards
                    episode_reward += reward

                    # Store transition in episode buffer
                    agent.replay_buffer.add_episode_step(
                        obs_dict, action, next_obs_dict, reward, float(done)
                    )

                    # Update observations
                    obs_dict = next_obs_dict

                    # Track success
                    if info.get("is_success", 0) > 0:
                        episode_success = True

                    if done:
                        break

                # Process episode with HER
                agent.replay_buffer.process_episode_with_her(env, k=4) # k=4 means the probability of HER experience replay is 0.8 like in the fetch paper

                epoch_rewards.append(episode_reward)

                if episode_success:
                    successes += 1

            # Update policy after each cycle
            for _ in range(40):  # Increased updates per cycle
                agent.train()

        # Calculate success rate and rewards for this epoch
        success_rate = successes / (n_cycles * n_episodes)
        success_history.append(success_rate)

        avg_reward = sum(epoch_rewards) / len(epoch_rewards)
        reward_history.append(avg_reward)

        print(f"Epoch {epoch+1}/{n_epochs}, Success Rate: {success_rate:.2f}")

    # Save trained model
    torch.save(agent.actor.state_dict(), f"{env_name}_actor.pth")

    return agent.actor, success_history, reward_history

### Run training with DDPG + HER

In [None]:
env = create_env()
actor, success_history, reward_history = train_ddpg_with_her(env, debug = False)

Observation space: Dict('achieved_goal': Box(-inf, inf, (3,), float64), 'desired_goal': Box(-inf, inf, (3,), float64), 'observation': Box(-inf, inf, (10,), float64))
Action space: Box(-1.0, 1.0, (4,), float32)
Starting training...
Epoch 1/30, Success Rate: 0.00
Epoch 2/30, Success Rate: 0.00
Epoch 3/30, Success Rate: 0.00
Epoch 4/30, Success Rate: 0.03


# Load the actor from a file

In [None]:
def load_agent(model_path, env):
    obs_dict = env.reset()[0]
    state_dim = obs_dict["observation"].shape[0]  # 10
    action_dim = env.action_space.shape[0]  # 4
    goal_dim = obs_dict["desired_goal"].shape[0]  # 3
    max_action = float(env.action_space.high[0])  # 1.0

    # Create agent with the same architecture
    input_dim = state_dim + goal_dim
    actor = Actor(input_dim, action_dim, max_action).to(device)

    # Load the saved weights
    actor.load_state_dict(torch.load(model_path))
    actor.eval()  # Set to evaluation mode

    return actor

# Evaluation

### Evaluate Function

In [None]:
def evaluate(actor_model, env, n_episodes=10, max_steps=100):
    """
    Evaluates using just an actor model on multiple episodes and creates a single GIF
    with episodes playing one after another in sequence.

    Args:
        actor_model: The trained actor network
        env: The environment
        n_episodes: Number of episodes to run
        max_steps: Maximum steps per episode

    Returns:
        success_rate: Fraction of successful episodes
    """
    # Storage for sequential frames
    all_frames = []
    successes = 0
    env_name = env.unwrapped.spec.id
    device = next(actor_model.parameters()).device  # Get the device of the model

    # First get a sample frame to determine dimensions
    obs_dict, _ = env.reset()
    sample_frame = env.render()
    frame_height, frame_width = sample_frame.shape[0], sample_frame.shape[1]

    # Run all episodes sequentially
    for episode in range(n_episodes):
        obs_dict, _ = env.reset()
        episode_success = False

        # Episode title frame
        title_frame = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
        pil_title = Image.fromarray(title_frame)
        draw = ImageDraw.Draw(pil_title)
        draw.text((frame_width//2 - 80, frame_height//2), f"Episode {episode+1}", fill=(255, 255, 255))
        all_frames.append(np.array(pil_title))

        # 5 blank frames as a pause between episodes
        for _ in range(5):
            all_frames.append(title_frame.copy())

        for t in range(max_steps):
            # Render the current state
            frame = env.render()

            # Add episode number as overlay
            pil_frame = Image.fromarray(frame)
            draw = ImageDraw.Draw(pil_frame)
            draw.text((10, 10), f"Episode {episode+1}", fill=(255, 255, 255))
            frame = np.array(pil_frame)

            # Add frame to full sequence
            all_frames.append(frame)

            # Select action using actor model directly
            state = obs_dict["observation"]
            goal = obs_dict["desired_goal"]
            state_goal = np.concatenate([state, goal])
            state_goal = torch.FloatTensor(state_goal.reshape(1, -1)).to(device)

            with torch.no_grad():
                action = actor_model(state_goal).cpu().data.numpy().flatten()

            # Step environment
            next_obs_dict, _, terminated, truncated, info = env.step(action)

            # Check success
            if info.get("is_success", 0) > 0:
                episode_success = True

            obs_dict = next_obs_dict

            if terminated or truncated:
                break

        # Make sure we have at least one frame from this episode
        if len(all_frames) == 0:
            all_frames.append(title_frame.copy())

        # Get the last frame for this episode
        last_frame = all_frames[-1].copy()

        # Mark episode result with a summary frame
        pil_result = Image.fromarray(last_frame)
        draw = ImageDraw.Draw(pil_result)
        status = "SUCCESS" if episode_success else "FAILURE"
        color = (0, 255, 0) if episode_success else (255, 0, 0)
        # Draw larger text for visibility
        draw.text((frame_width//2 - 80, frame_height//2),
                  status, fill=color)

        result_frame = np.array(pil_result)
        all_frames.append(result_frame)

        # Add a few still frames at the end of each episode to see the result
        for _ in range(15):  # Hold the result for 15 frames
            all_frames.append(result_frame.copy())

        # Track success
        if episode_success:
            successes += 1

    # Check if all frames have the same shape
    shapes = set(frame.shape for frame in all_frames)
    if len(shapes) > 1:
        print(f"Warning: Frames have different shapes: {shapes}")
        # Resize all frames to the dimensions of the first frame
        target_shape = all_frames[0].shape
        for i in range(len(all_frames)):
            if all_frames[i].shape != target_shape:
                pil_frame = Image.fromarray(all_frames[i])
                pil_frame = pil_frame.resize((target_shape[1], target_shape[0]), Image.LANCZOS)
                all_frames[i] = np.array(pil_frame)

    # Save as GIF
    imageio.mimsave(f"{env_name}_eval.gif", all_frames, fps=30, loop=0)
    print(f"Saved sequential animation to {env_name}_eval.gif")

    success_rate = successes / n_episodes
    print(f"Evaluation success rate: {success_rate:.2f}")

    return success_rate

### Run evaluation and save the gif

In [None]:
env = create_env()
evaluate(actor, env, n_episodes=10)