In [None]:
# Install the third-party packages imported by the cells below
# (shell escapes; only needed on a fresh environment).
! pip install gymnasium
! pip install gymnasium-robotics
! pip install mujoco
! pip install tensorboardX

In [18]:
import numpy as np
import torch
from torch import nn
import torch.optim as optim
import gymnasium as gym
from tensorboardX import SummaryWriter
import os
from datetime import datetime
import gymnasium_robotics
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [8]:
# Register the gymnasium_robotics environments with Gymnasium
gym.register_envs(gymnasium_robotics)

# Torch device used by the networks: CUDA when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
class Actor(nn.Module):
    """
    Actor network with an LSTM for handling temporal dependencies.
    Maps observations to one categorical distribution per action dimension.

    Architecture:
    1. Fully connected layer for initial feature extraction
    2. LSTM layer (batch_first) for temporal dependency modeling
    3. Linear head reshaped to (action_dim, num_bins) followed by a softmax over bins

    Each action dimension is discretized into ``num_bins`` evenly spaced values
    between -1 and 1 (11 by default, i.e. steps of 0.2).

    Args:
        obs_dim (int): Dimension of the observation/state space
        action_dim (int): Dimension of the action space
        fc_hidden_dim (int, optional): Hidden size of fully connected layer. Defaults to 256
        lstm_hidden_dim (int, optional): Hidden size of LSTM layer. Defaults to 128
        num_bins (int, optional): Number of discrete values per action dimension. Defaults to 11

    Raises:
        ValueError: If ``num_bins`` is smaller than 2.
    """
    def __init__(self, obs_dim, action_dim, fc_hidden_dim=256, lstm_hidden_dim=128, num_bins=11):
        super().__init__()

        if num_bins < 2:
            raise ValueError(f"num_bins must be at least 2, got {num_bins}")

        # Initial feature extraction
        self.fc1 = nn.Linear(obs_dim, fc_hidden_dim)

        # Temporal dependency modeling
        self.lstm_layer = nn.LSTM(fc_hidden_dim, lstm_hidden_dim, batch_first=True)

        # Action distribution generation
        self.output_sequential = nn.Sequential(
            nn.Linear(lstm_hidden_dim, action_dim * num_bins),
            nn.Unflatten(dim=-1, unflattened_size=(action_dim, num_bins)),  # (..., action_dim, num_bins)
            nn.Softmax(dim=-1),  # normalize over the bins of each action dimension
        )

        # Discretized action values from -1 to 1. Registered as a buffer so it follows
        # the module through .to()/.cuda() instead of depending on a global device;
        # persistent=False keeps it out of state_dict so existing checkpoints still load.
        self.register_buffer(
            "action_tensor", torch.linspace(-1.0, 1.0, num_bins), persistent=False
        )

    def forward(self, x, hidden_state=None):
        """
        Forward pass through the network.

        Args:
            x (torch.Tensor): Input observations; the LSTM is batch_first, so a batched
                input is laid out as (batch, sequence, obs_dim).
            hidden_state (tuple, optional): LSTM (hidden, cell) state.
                If None, the LSTM starts from zeros.

        Returns:
            tuple:
                - action_prob (torch.Tensor): Probabilities with trailing dimensions
                  (action_dim, num_bins); each row over bins sums to 1
                - hidden_state (tuple): Updated LSTM hidden state and cell state
        """
        # Feature extraction
        features = torch.relu(self.fc1(x))

        # nn.LSTM treats a None state as zeros, so no branching is needed
        lstm_out, hidden_state = self.lstm_layer(features, hidden_state)

        # Generate action probabilities
        action_prob = self.output_sequential(lstm_out)

        return action_prob, hidden_state

In [10]:
class Critic(nn.Module):
    """
    State-value network: a linear feature layer, an LSTM, and a scalar output head.

    Layers:
    1. ``fc1`` - linear projection of the observation, followed by ReLU
    2. ``lstm_layer`` - batch_first LSTM over the projected features
    3. ``output_layer`` - linear map from the LSTM output to a single value

    Args:
        obs_dim (int): Size of the observation/state vector
        fc_hidden_dim (int, optional): Width of the fully connected layer. Defaults to 256
        lstm_hidden_dim (int, optional): Width of the LSTM layer. Defaults to 128
    """
    def __init__(self, obs_dim, fc_hidden_dim=256, lstm_hidden_dim=128):
        super().__init__()

        # Observation -> features
        self.fc1 = nn.Linear(obs_dim, fc_hidden_dim)

        # Features -> temporally aware features
        self.lstm_layer = nn.LSTM(fc_hidden_dim, lstm_hidden_dim, batch_first=True)

        # LSTM output -> scalar value
        self.output_layer = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, x, hidden_state=None):
        """
        Estimate state values for the given observations.

        Args:
            x (torch.Tensor): Observations fed to the first linear layer
            hidden_state (tuple, optional): LSTM (hidden, cell) state to resume from.
                When None the LSTM starts from a zero state.

        Returns:
            tuple:
                - values (torch.Tensor): Value estimates with a trailing dimension of 1
                - hidden_state (tuple): LSTM hidden and cell state after this call
        """
        features = F.relu(self.fc1(x))

        # Passing None is the same as omitting the state: the LSTM starts from zeros
        lstm_out, hidden_state = self.lstm_layer(features, hidden_state)

        return self.output_layer(lstm_out), hidden_state

In [11]:
class ReplayMemory:
    """
    Fixed-size FIFO buffer for storing experience data.
    Once ``batch_size`` entries are held, adding a new one drops the oldest.

    This buffer stores:
    - observations: State observations from the environment
    - actions: Actions taken by the agent
    - rewards: Rewards received from the environment
    - values: Value function estimates from the critic
    - dones: Episode termination flags
    - log_probs: Log probabilities of taken actions (only for entries that supply one)

    Args:
        batch_size (int): Maximum number of entries kept per field
    """
    def __init__(self, batch_size):
        # Initialize empty lists for storing experiences
        self.observations = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.log_probs = []
        self.batch_size = batch_size
        self.current_size = 0

    def _append_bounded(self, storage, item):
        """Append ``item`` to ``storage`` and drop the oldest entry if over capacity."""
        storage.append(item)
        if len(storage) > self.batch_size:
            del storage[0]

    def add_memo(self, observation, action, reward, value, done, log_prob=None):
        """
        Add a new experience to the buffer. If buffer is full, remove oldest experience.

        Args:
            observation (np.ndarray): State observation
            action (np.ndarray): Action taken
            reward (float): Reward received
            value (float): Value estimate
            done (bool): Episode termination flag
            log_prob (np.ndarray, optional): Log probability of the action.
                When omitted, nothing is stored in ``log_probs`` for this entry.
        """
        self._append_bounded(self.observations, observation)
        self._append_bounded(self.actions, action)
        self._append_bounded(self.rewards, reward)
        self._append_bounded(self.values, value)
        self._append_bounded(self.dones, done)

        # log_probs is bounded on its own length: the previous implementation popped
        # from it whenever the other lists were full, which raised IndexError if
        # earlier entries had been stored without a log_prob.
        if log_prob is not None:
            self._append_bounded(self.log_probs, log_prob)

        # Update current buffer size
        self.current_size = len(self.observations)

    def sample(self):
        """
        Convert stored experiences to numpy arrays for batch processing.

        Returns:
            tuple: Arrays of observations, actions, rewards, values, dones, and
            log_probs (``None`` when no log probabilities are stored)
        """
        return (np.array(self.observations),
                np.array(self.actions),
                np.array(self.rewards),
                np.array(self.values),
                np.array(self.dones),
                np.array(self.log_probs) if self.log_probs else None)

    def clear_memo(self):
        """Clear all stored experiences from the buffer."""
        self.observations = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.log_probs = []
        self.current_size = 0

In [12]:
class PPOAgent:
    """
    PPO (Proximal Policy Optimization) agent implementation.

    This agent implements the PPO algorithm with the following features:
    - Actor-Critic architecture with LSTM
    - Discretized action space (11 values per action dimension)
    - Clipped surrogate objective on the probability of the actions actually taken
    - Generalized Advantage Estimation (GAMMA / LAMBDA) with value-function regression
    - Entropy regularization
    - Gradient clipping

    Args:
        obs_dim (int): Dimension of the observation space
        action_dim (int): Dimension of the action space
        batch_size (int): Size of experience replay buffer
    """
    def __init__(self, obs_dim, action_dim, batch_size):
        # PPO hyperparameters
        self.LR_ACTOR = 3e-4          # Learning rate for actor network
        self.LR_CRITIC = 3e-4         # Learning rate for critic network
        self.GAMMA = 0.99             # Discount factor for future rewards
        self.LAMBDA = 0.95            # GAE (Generalized Advantage Estimation) parameter
        self.EPOCH = 10               # Number of optimization epochs
        self.EPSILON_CLIP = 0.2       # PPO clipping parameter
        self.ENT_COEF = 0.01          # Entropy coefficient for exploration
        self.VF_COEF = 0.5            # Value function coefficient
        self.MAX_GRAD_NORM = 0.5      # Maximum gradient norm for clipping

        # Device the networks and all training tensors live on
        self.device = device

        # Initialize neural networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.old_actor = Actor(obs_dim, action_dim).to(self.device)  # Frozen snapshot used for the PPO ratio
        self.critic = Critic(obs_dim).to(self.device)

        # Initialize optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.LR_CRITIC)

        # Initialize replay buffer
        self.replay_buffer = ReplayMemory(batch_size)

        # Initialize LSTM states
        self.policy_hidden_state = None
        self.value_hidden_state = None

        # Action space setup
        self.action_tensor = torch.tensor(
            [-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]
        ).to(self.device)
        self.action_dim = action_dim

    def get_action(self, obs):
        """
        Sample an action from the current policy and estimate the state value.

        Side effect: advances ``policy_hidden_state`` and ``value_hidden_state``.

        Args:
            obs (np.ndarray): Current flat observation/state

        Returns:
            tuple:
                - actions (np.ndarray): Selected action value per dimension, shape (action_dim,)
                - value (np.ndarray): Critic output for the current state, shape (1, 1, 1)
                - log_probs (np.ndarray): Log probability of the chosen bin per dimension
        """
        # Convert observation to a (batch=1, sequence=1, obs_dim) tensor
        obs = torch.as_tensor(
            np.asarray(obs, dtype=np.float32), device=self.device
        ).reshape(1, 1, -1)

        with torch.no_grad():
            # Get action probabilities and value estimate
            action_probs, self.policy_hidden_state = self.actor(obs, self.policy_hidden_state)
            value, self.value_hidden_state = self.critic(obs, self.value_hidden_state)

            # Explicit reshape instead of squeeze(): a bare squeeze would also drop
            # the action dimension when action_dim == 1.
            action_probs = action_probs.reshape(self.action_dim, -1)

            # One categorical per action dimension, sampled in a single call
            dist = torch.distributions.Categorical(probs=action_probs)
            action_idx = dist.sample()
            actions = self.action_tensor[action_idx]
            log_probs = dist.log_prob(action_idx)

        return actions.cpu().numpy(), value.cpu().numpy(), log_probs.cpu().numpy()

    @staticmethod
    def _compute_gae(rewards, values, dones, last_value, gamma, lam):
        """
        Compute Generalized Advantage Estimation over one ordered rollout.

        Args:
            rewards (torch.Tensor): Rewards, shape (T,)
            values (torch.Tensor): Value estimates of the visited states, shape (T,)
            dones (torch.Tensor): 1.0 where the transition ended an episode, else 0.0, shape (T,)
            last_value (float): Value estimate of the state following the final transition
            gamma (float): Discount factor
            lam (float): GAE lambda

        Returns:
            tuple: (advantages, returns), both of shape (T,), with returns = advantages + values
        """
        advantages = torch.zeros_like(rewards)
        gae = 0.0
        next_value = last_value
        for t in reversed(range(rewards.shape[0])):
            # A done flag cuts both the bootstrap and the accumulated advantage
            not_done = 1.0 - dones[t]
            delta = rewards[t] + gamma * next_value * not_done - values[t]
            gae = delta + gamma * lam * not_done * gae
            advantages[t] = gae
            next_value = values[t]
        return advantages, advantages + values

    @staticmethod
    def _joint_log_prob(action_probs, action_idx):
        """
        Log probability of the taken actions under per-dimension categoricals.

        Args:
            action_probs (torch.Tensor): Probabilities, shape (B, action_dim, bins)
            action_idx (torch.Tensor): Chosen bin indices (long), shape (B, action_dim)

        Returns:
            torch.Tensor: Sum over action dimensions of log p(chosen bin), shape (B,)
        """
        chosen = action_probs.gather(-1, action_idx.unsqueeze(-1)).squeeze(-1)
        return torch.log(chosen + 1e-10).sum(dim=-1)

    def update(self, last_value=None):
        """
        Update policy and value networks using PPO algorithm.

        As in the original implementation, every stored transition is fed to the
        networks as a length-1 sequence starting from a zero LSTM state. Old
        log-probabilities are recomputed the same way with ``old_actor`` (instead
        of reusing the ones stored at collection time) so that the probability
        ratio is exactly 1 before the first gradient step.

        Args:
            last_value (float, optional): Value estimate of the state that follows the
                last stored transition, used to bootstrap GAE. When None, the stored
                value of the last transition is used as an approximation. It is
                ignored when the last transition is marked done.

        Returns:
            dict: Training metrics including actor loss, critic loss, and entropy
            (all NaN when the buffer is empty and no update was performed)
        """
        # Get experiences from buffer
        (memo_observations, memo_actions, memo_rewards,
         memo_values, memo_dones, _) = self.replay_buffer.sample()

        num_transitions = len(memo_rewards)
        if num_transitions == 0:
            nan = float('nan')
            return {'actor_loss': nan, 'critic_loss': nan, 'entropy': nan}

        # Copy current policy to old policy
        self.old_actor.load_state_dict(self.actor.state_dict())

        def to_tensor(array):
            return torch.as_tensor(np.asarray(array, dtype=np.float32), device=self.device)

        # Convert to tensors. Values are stored with extra singleton dimensions
        # (the critic output of get_action), so everything per-step is flattened to (B,).
        observations = to_tensor(memo_observations).unsqueeze(1)   # (B, 1, obs_dim)
        actions = to_tensor(memo_actions)                          # (B, action_dim)
        old_values = to_tensor(memo_values).reshape(-1)            # (B,)
        rewards = to_tensor(memo_rewards).reshape(-1)              # (B,)
        dones = to_tensor(memo_dones).reshape(-1)                  # (B,)

        # Recover the bin index of every stored action value (nearest bin)
        action_idx = torch.argmin(
            (actions.unsqueeze(-1) - self.action_tensor).abs(), dim=-1
        )                                                          # (B, action_dim)

        # Advantages and value targets via GAE
        if last_value is None:
            bootstrap = float(old_values[-1])
        else:
            bootstrap = float(torch.as_tensor(last_value).reshape(-1)[0])
        advantages, returns = self._compute_gae(
            rewards, old_values, dones, bootstrap, self.GAMMA, self.LAMBDA
        )
        if num_transitions > 1:
            # std() of a single element is NaN, so only normalize real batches
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        # Log-probabilities of the taken actions under the pre-update policy
        with torch.no_grad():
            old_action_probs, _ = self.old_actor(observations)
            old_log_probs = self._joint_log_prob(old_action_probs.squeeze(1), action_idx)

        # Optimization loop
        for _ in range(self.EPOCH):
            # Get current action probabilities and values
            action_probs, _ = self.actor(observations)
            action_probs = action_probs.squeeze(1)                 # (B, action_dim, bins)
            current_values, _ = self.critic(observations)
            current_values = current_values.reshape(-1)            # (B,)

            # Probability ratio of the actions actually taken
            log_probs = self._joint_log_prob(action_probs, action_idx)
            ratios = torch.exp(log_probs - old_log_probs)

            # Calculate surrogate losses
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.EPSILON_CLIP, 1 + self.EPSILON_CLIP) * advantages

            # Calculate final losses
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = F.mse_loss(current_values, returns)
            # Mean per-dimension entropy (non-negative); subtracting it from the
            # loss below rewards exploration.
            entropy = -(action_probs * torch.log(action_probs + 1e-10)).sum(dim=-1).mean()

            # Combined loss
            loss = actor_loss + self.VF_COEF * critic_loss - self.ENT_COEF * entropy

            # Optimize
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.MAX_GRAD_NORM)
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.MAX_GRAD_NORM)

            # Update networks
            self.actor_optimizer.step()
            self.critic_optimizer.step()

        # Clear replay buffer
        self.replay_buffer.clear_memo()

        # Return training metrics (from the last epoch)
        return {
            'actor_loss': actor_loss.item(),
            'critic_loss': critic_loss.item(),
            'entropy': entropy.item()
        }

    def save_policy(self, path):
        """
        Save the actor network to disk.

        Args:
            path (str): Path where to save the model
        """
        torch.save(self.actor.state_dict(), path)

In [24]:
def evaluate_policy(env, agent, num_episodes=5, max_steps=200):
    """
    Evaluate the current policy without training.

    Runs ``num_episodes`` episodes with actions drawn from ``agent.get_action``
    and records reward, success and length per episode. Nothing is added to the
    agent's replay buffer.

    Features:
    - Tracks total rewards and success rate
    - Monitors consecutive successful episodes
    - Starts every evaluation episode from a fresh (None) LSTM state
    - Restores the agent's original LSTM states afterwards, even if an episode raises
    - Stops an episode on termination, truncation, or after ``max_steps`` steps

    Args:
        env (gym.Env): Evaluation environment
        agent (PPOAgent): Agent to evaluate
        num_episodes (int): Number of episodes to evaluate
        max_steps (int, optional): Hard cap on steps per episode. Defaults to 200

    Returns:
        dict: ``mean_reward``, ``std_reward``, ``success_rate``,
        ``mean_episode_length`` and ``max_consecutive_successes``
    """
    total_rewards = []
    success_rate = []
    episode_lengths = []
    consecutive_successes = 0
    max_consecutive_successes = 0

    # Store original LSTM states so an in-progress training episode is not disturbed
    orig_policy_hidden = agent.policy_hidden_state
    orig_value_hidden = agent.value_hidden_state

    try:
        for _ in range(num_episodes):
            # Reset LSTM memory per episode; resetting only once before the loop
            # would leak the hidden state of one episode into the next.
            agent.policy_hidden_state = None
            agent.value_hidden_state = None

            state, _ = env.reset()
            state = flatten_observation(state)
            episode_reward = 0
            steps = 0
            success = False

            while True:
                # Get action from policy
                action, _, _ = agent.get_action(state)

                # Execute action in environment
                next_state, reward, terminated, truncated, info = env.step(action)
                state = flatten_observation(next_state)

                # Update episode information
                episode_reward += reward
                steps += 1

                # An episode counts as successful if any step reports success
                if info.get('is_success', False):
                    success = True

                # Check termination conditions
                if terminated or truncated or steps >= max_steps:
                    break

            # Update consecutive success tracking
            if success:
                consecutive_successes += 1
                max_consecutive_successes = max(max_consecutive_successes, consecutive_successes)
            else:
                consecutive_successes = 0

            # Store episode results
            total_rewards.append(episode_reward)
            success_rate.append(float(success))
            episode_lengths.append(steps)
    finally:
        # Restore original LSTM states
        agent.policy_hidden_state = orig_policy_hidden
        agent.value_hidden_state = orig_value_hidden

    # Return comprehensive evaluation metrics
    return {
        'mean_reward': np.mean(total_rewards),
        'std_reward': np.std(total_rewards),
        'success_rate': np.mean(success_rate),
        'mean_episode_length': np.mean(episode_lengths),
        'max_consecutive_successes': max_consecutive_successes
    }

def flatten_observation(obs):
    """
    Flatten a dictionary observation into a single 1-D array.

    Entries are visited in sorted key order, each value is flattened, and the
    pieces are concatenated end to end.

    Args:
        obs (dict): Mapping from observation name to array

    Returns:
        np.ndarray: Concatenation of all flattened entries
    """
    pieces = []
    for name in sorted(obs):
        pieces.append(obs[name].flatten())
    return np.concatenate(pieces)

def plot_training_curves(log_dir, timestamp):
    """
    Generate training visualizations from the TensorBoard scalars in ``log_dir``.

    Two images are written into ``log_dir``:
    - ``training_curves_<timestamp>.png``: a 3x2 grid with episode reward, average
      reward, evaluation success rate, evaluation max consecutive successes,
      actor loss and critic loss (a panel is only drawn for tags present in the log)
    - ``correlation_matrix_<timestamp>.png``: a correlation heatmap of episode reward
      and the two evaluation metrics, written only when at least two of them share
      more than one common step

    Any exception raised while loading or plotting is caught, reported with a
    traceback on stdout, and not re-raised.

    Args:
        log_dir (str): Directory containing training logs
        timestamp (str): Timestamp for file naming
    """
    try:
        # Load data from tensorboard logs
        from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
        import pandas as pd

        event_acc = EventAccumulator(log_dir)
        event_acc.Reload()

        # Extract metrics from tensorboard logs: tag -> {'steps': [...], 'values': [...]}
        metrics_data = {}
        for tag in event_acc.Tags()['scalars']:
            events = event_acc.Scalars(tag)
            metrics_data[tag] = {
                'steps': [e.step for e in events],
                'values': [e.value for e in events]
            }

        # Create main figure
        fig = plt.figure(figsize=(20, 15))
        gs = plt.GridSpec(3, 2, figure=fig)

        # Plot configurations: (scalar tag, panel title, grid cell, y-axis label)
        plot_configs = [
            ('Episode_reward', 'Episode Rewards', gs[0, 0], 'Reward'),
            ('Average_reward', 'Moving Average Reward', gs[0, 1], 'Average Reward'),
            ('eval_success_rate', 'Success Rate', gs[1, 0], 'Success Rate (%)'),
            ('eval_max_consecutive_successes', 'Max Consecutive Successes', gs[1, 1], 'Count'),
            ('actor_loss', 'Actor Loss', gs[2, 0], 'Loss'),
            ('critic_loss', 'Critic Loss', gs[2, 1], 'Loss')
        ]

        for metric_name, title, position, ylabel in plot_configs:
            if metric_name in metrics_data:
                ax = fig.add_subplot(position)
                data = metrics_data[metric_name]
                steps = data['steps']
                values = data['values']

                # Nothing to draw; the (already created) axes stay empty
                if len(values) == 0:
                    continue

                # Convert to pandas Series for easier manipulation
                series = pd.Series(values, index=steps)

                # Plot raw data
                ax.plot(steps, values, 'b-', alpha=0.3, label='Raw Data')

                # Add moving average for smoothing (window is at most 10 points,
                # and at most a fifth of the series)
                if len(values) > 5:
                    window_size = min(10, len(values) // 5)
                    rolling_mean = series.rolling(window=window_size, min_periods=1).mean()
                    ax.plot(steps, rolling_mean, 'r-', linewidth=2,
                           label=f'{window_size}-point Moving Average')

                # Add trend line (least-squares straight line over step vs. value)
                if len(values) > 1:
                    z = np.polyfit(steps, values, 1)
                    p = np.poly1d(z)
                    ax.plot(steps, p(steps), 'g--', alpha=0.8,
                           label=f'Trend (slope: {z[0]:.2e})')

                # Calculate statistics
                stats = {
                    'Mean': np.mean(values),
                    'Std': np.std(values),
                    'Max': np.max(values),
                    'Min': np.min(values),
                    'Latest': values[-1]
                }

                # Add statistics box just outside the right edge of the axes
                stats_text = '\n'.join([f'{k}: {v:.3f}' for k, v in stats.items()])
                ax.text(1.02, 0.5, stats_text,
                       transform=ax.transAxes,
                       bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray'),
                       verticalalignment='center')

                # Customize plot
                ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
                ax.set_xlabel('Episode', fontsize=10)
                ax.set_ylabel(ylabel, fontsize=10)
                ax.grid(True, alpha=0.3)
                ax.legend(loc='upper left')

                # Add minor grid
                ax.minorticks_on()
                ax.grid(True, which='minor', alpha=0.1)

        # Add overall title
        plt.suptitle('Training Progress Overview',
                    fontsize=16, y=0.95, fontweight='bold')

        # Adjust layout and save
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
        save_path = os.path.join(log_dir, f'training_curves_{timestamp}.png')
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"\nTraining curves saved to: {save_path}")

        plt.close()

        # Create correlation plot only if we have enough aligned data
        main_metrics = ['Episode_reward', 'eval_success_rate', 'eval_max_consecutive_successes']
        aligned_data = {}

        # Find common steps across all metrics (metrics missing from the log are skipped)
        common_steps = None
        for metric in main_metrics:
            if metric in metrics_data:
                steps = set(metrics_data[metric]['steps'])
                if common_steps is None:
                    common_steps = steps
                else:
                    common_steps = common_steps.intersection(steps)

        if common_steps and len(common_steps) > 1:
            common_steps = sorted(list(common_steps))

            # Create aligned data: one value per common step for each metric.
            # If a step was logged more than once, the last logged value wins.
            for metric in main_metrics:
                if metric in metrics_data:
                    steps = metrics_data[metric]['steps']
                    values = metrics_data[metric]['values']
                    step_to_value = dict(zip(steps, values))
                    aligned_data[metric] = [step_to_value[step] for step in common_steps]

            # A correlation matrix needs at least two metrics
            if len(aligned_data) > 1:
                plt.figure(figsize=(12, 8))
                df = pd.DataFrame(aligned_data, index=common_steps)
                sns.heatmap(df.corr(), annot=True, cmap='coolwarm', center=0,
                           vmin=-1, vmax=1, square=True)
                plt.title('Metrics Correlation Matrix', pad=20)

                corr_path = os.path.join(log_dir, f'correlation_matrix_{timestamp}.png')
                plt.savefig(corr_path, dpi=300, bbox_inches='tight')
                print(f"Correlation matrix saved to: {corr_path}")

                plt.close()

    except Exception as e:
        # Plotting is best-effort: report the failure instead of propagating it
        print(f"Error while plotting training curves: {str(e)}")
        import traceback
        traceback.print_exc()

In [27]:
def train(num_episodes=500, num_steps=100, update_interval=20, batch_size=32,
          eval_interval=2, eval_episodes=5, target_consecutive_successes=50):
    """
    Main training loop for PPO agent on 'HandManipulateBlockDense-v1'.

    Features:
    - Episodic training with early stopping
    - Regular evaluation and model checkpointing
    - Progress tracking and visualization
    - Automatic model saving
    - Cleanup and result saving in a ``finally`` block (also after Ctrl-C)

    Training Process:
    1. Environment and agent initialization
    2. Episode-based training loop
    3. Policy update every ``update_interval`` steps within an episode
    4. Evaluation every ``eval_interval`` episodes
    5. Progress visualization
    6. Model checkpointing

    Outputs (relative to the current working directory):
    - ``models/ppo_actor_{best_consecutive,best_reward,final}_<timestamp>.pth``
    - ``ppo_logs/<timestamp>/`` with TensorBoard scalars and the generated plots
    - ``ppo_reward_<timestamp>.txt`` with one reward per *completed* episode

    Args:
        num_episodes (int): Total number of training episodes. Defaults to 500
        num_steps (int): Maximum steps per episode (also passed to the environments
            as ``max_episode_steps``). Defaults to 100
        update_interval (int): Steps between policy updates. Defaults to 20
        batch_size (int): Size of replay buffer. Defaults to 32
        eval_interval (int): Episodes between evaluations. Defaults to 2
        eval_episodes (int): Episodes per evaluation. Defaults to 5
        target_consecutive_successes (int): Early-stopping threshold on the best
            ``max_consecutive_successes`` seen in an evaluation. Defaults to 50
    """
    # Initialize environments
    env = gym.make('HandManipulateBlockDense-v1', max_episode_steps=num_steps)
    eval_env = gym.make('HandManipulateBlockDense-v1', max_episode_steps=num_steps)

    # Setup dimensions based on environment
    obs_space = env.observation_space.spaces
    obs_dim = int(sum(np.prod(space.shape) for space in obs_space.values()))
    action_dim = env.action_space.shape[0]

    # Setup directories for saving
    base_dir = os.getcwd()
    model_dir = os.path.join(base_dir, 'models')
    os.makedirs(model_dir, exist_ok=True)

    # Create unique timestamp for this training run
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    log_dir = os.path.join(base_dir, 'ppo_logs', timestamp)
    os.makedirs(log_dir, exist_ok=True)

    # Initialize tensorboard writer
    writer = SummaryWriter(log_dir)

    # Initialize agent and tracking variables
    agent = PPOAgent(obs_dim=obs_dim, action_dim=action_dim, batch_size=batch_size)
    # Rewards of completed episodes only. A preallocated np.empty buffer would put
    # uninitialized values in the saved file whenever training stops early.
    episode_rewards = []
    best_reward = -float('inf')
    best_consecutive_successes = 0
    reward_window = deque(maxlen=100)

    try:
        for episode in range(num_episodes):
            # Reset environment and agent state
            state, _ = env.reset()
            state = flatten_observation(state)
            episode_reward = 0
            steps_in_episode = 0

            # Reset LSTM states at the start of each episode
            agent.policy_hidden_state = None
            agent.value_hidden_state = None

            # Episode loop
            for step in range(num_steps):
                # Get action from policy
                action, value, log_prob = agent.get_action(state)

                # Execute action in environment
                next_state, reward, terminated, truncated, _ = env.step(action)
                next_state = flatten_observation(next_state)

                # Update episode information
                episode_reward += reward
                steps_in_episode = step + 1

                # Store experience
                done = terminated or truncated or step == num_steps - 1
                agent.replay_buffer.add_memo(state, action, reward, value, done, log_prob)

                state = next_state

                # Periodic policy update
                if (step + 1) % update_interval == 0:
                    update_info = agent.update()
                    # Log training metrics (indexed by episode, so an episode can
                    # contribute several points at the same step)
                    writer.add_scalar("actor_loss", update_info['actor_loss'], episode)
                    writer.add_scalar("critic_loss", update_info['critic_loss'], episode)
                    writer.add_scalar("entropy", update_info['entropy'], episode)

                if done:
                    break

            # Update tracking metrics
            episode_rewards.append(episode_reward)
            reward_window.append(episode_reward)
            avg_reward = np.mean(list(reward_window))

            # Log episode metrics
            writer.add_scalar("Episode_reward", episode_reward, episode)
            writer.add_scalar("Average_reward", avg_reward, episode)
            writer.add_scalar("Steps_in_episode", steps_in_episode, episode)

            # Periodic evaluation
            if episode % eval_interval == 0:
                eval_metrics = evaluate_policy(eval_env, agent, num_episodes=eval_episodes)

                # Log evaluation metrics
                for metric_name, metric_value in eval_metrics.items():
                    writer.add_scalar(f"eval_{metric_name}", metric_value, episode)

                print(f"\nEvaluation at episode {episode}:")
                print(f"Mean reward: {eval_metrics['mean_reward']:.2f}")
                print(f"Success rate: {eval_metrics['success_rate']:.2f}")
                print(f"Max consecutive successes: {eval_metrics['max_consecutive_successes']}")

                # Save best model based on consecutive successes
                if eval_metrics['max_consecutive_successes'] > best_consecutive_successes:
                    best_consecutive_successes = eval_metrics['max_consecutive_successes']
                    model_path = os.path.join(model_dir,
                                            f'ppo_actor_best_consecutive_{timestamp}.pth')
                    agent.save_policy(model_path)
                    print(f"New best consecutive successes: {best_consecutive_successes}!")

            # Save best model based on episode reward
            if episode_reward > best_reward:
                best_reward = episode_reward
                model_path = os.path.join(model_dir, f'ppo_actor_best_reward_{timestamp}.pth')
                agent.save_policy(model_path)
                print(f"New best reward: {best_reward:.2f}!")

            # Print episode summary
            print(f"\nEpisode: {episode}")
            print(f"Reward: {episode_reward:.2f}")
            print(f"Average Reward: {avg_reward:.2f}")
            print(f"Steps: {steps_in_episode}")
            print("-" * 50)

            # Early stopping if we achieve consistent success.
            # NOTE(review): max_consecutive_successes cannot exceed eval_episodes, so
            # with the defaults (5 evaluation episodes, target 50) this never triggers.
            # Lower target_consecutive_successes or raise eval_episodes to enable it.
            if best_consecutive_successes >= target_consecutive_successes:
                print("\nReached target consecutive successes! Training complete.")
                break

    except KeyboardInterrupt:
        print("\nTraining interrupted by user")

    finally:
        # Cleanup and save final results
        env.close()
        eval_env.close()
        writer.close()

        # Save final model
        final_model_path = os.path.join(model_dir, f'ppo_actor_final_{timestamp}.pth')
        agent.save_policy(final_model_path)

        # Save reward history (completed episodes only)
        reward_path = os.path.join(base_dir, f'ppo_reward_{timestamp}.txt')
        np.savetxt(reward_path, np.array(episode_rewards, dtype=float))

        # Generate and save training curves
        plot_training_curves(log_dir, timestamp)

        print(f"\nTraining completed!")
        print(f"Models saved in {model_dir}")
        print(f"Logs saved in {log_dir}")
        print(f"Rewards saved to {reward_path}")
        print(f"Best consecutive successes achieved: {best_consecutive_successes}")

In [None]:
# Run the full training loop; models, logs and plots are written under the
# current working directory.
train()