# Baseline Model

In [None]:
# Install the third-party packages used by the cells below (IPython shell escapes).
# NOTE(review): `! pip` runs whichever `pip` is first on PATH; if the packages end up
# in a different interpreter than the running kernel, `%pip install ...` is the
# kernel-aware alternative — confirm against your environment.
! pip install gymnasium
! pip install gymnasium-robotics
! pip install mujoco
! pip install tensorboardX

In [26]:
# Imported here because, in file order, this cell precedes the main import cell;
# without it a top-to-bottom run fails with NameError on `os`.
import os

# Show the working directory; models and logs are later written relative to it.
print(os.getcwd())

c:\Users\loaner\Desktop\Study\Reinforcement_Learning\Code\Rough


# Current Model (Recurrent Soft Actor-Critic, RSAC)

In [1]:
import torch
from torch import nn
import torch.optim as optim
import gymnasium as gym
from tensorboardX import SummaryWriter
import os
from datetime import datetime
import gymnasium_robotics
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt
import numpy as np

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [2]:
class Actor(nn.Module):
    """
    Gaussian policy network: linear encoder -> LSTM -> mean / log-std heads.

    Args:
        obs_dim (int): Size of the flattened observation vector.
        action_dim (int): Number of action components.
        fc_hidden_dim (int): Width of the encoder layer feeding the LSTM.
        lstm_hidden_dim (int): Hidden size of the LSTM.
    """

    def __init__(self, obs_dim, action_dim, fc_hidden_dim=256, lstm_hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim, fc_hidden_dim)
        self.lstm_layer = nn.LSTM(fc_hidden_dim, lstm_hidden_dim, batch_first=True)
        self.mean_layer = nn.Linear(lstm_hidden_dim, action_dim)
        self.log_std_layer = nn.Linear(lstm_hidden_dim, action_dim)

    def forward(self, x, hidden_state=None):
        """
        Compute the policy's Gaussian parameters for the given observations.

        Args:
            x (Tensor): Observations; the last dimension must be ``obs_dim``.
            hidden_state: Optional ``(h, c)`` pair from a previous call; ``None``
                starts the LSTM from its default initial state.

        Returns:
            tuple: ``(mean, log_std, hidden_state)`` where ``mean`` is squashed
            into [-1, 1] by tanh, ``log_std`` is clamped to [-20, 2], and
            ``hidden_state`` is the LSTM state after consuming ``x``.
        """
        encoded = self.fc1(x).relu()
        # nn.LSTM accepts hx=None, so one call covers both the fresh and the carried-state case
        lstm_out, next_hidden = self.lstm_layer(encoded, hidden_state)
        mean = self.mean_layer(lstm_out).tanh()
        log_std = self.log_std_layer(lstm_out).clamp(min=-20, max=2)
        return mean, log_std, next_hidden

In [3]:
class Critic(nn.Module):
    """
    Q-network: concatenated (observation, action) -> linear encoder -> LSTM -> scalar Q head.

    Args:
        obs_dim (int): Size of the flattened observation vector.
        action_dim (int): Number of action components.
        fc_hidden_dim (int): Width of the encoder layer feeding the LSTM.
        lstm_hidden_dim (int): Hidden size of the LSTM.
    """

    def __init__(self, obs_dim, action_dim, fc_hidden_dim=256, lstm_hidden_dim=128):
        super().__init__()
        self.fc1 = nn.Linear(obs_dim + action_dim, fc_hidden_dim)
        self.lstm_layer = nn.LSTM(fc_hidden_dim, lstm_hidden_dim, batch_first=True)
        self.q_layer = nn.Linear(lstm_hidden_dim, 1)

    def forward(self, obs, action, hidden_state=None):
        """
        Estimate Q(obs, action).

        Args:
            obs (Tensor): Observations; last dimension ``obs_dim``.
            action (Tensor): Actions; last dimension ``action_dim``; leading
                dimensions must match ``obs`` so the two can be concatenated.
            hidden_state: Optional ``(h, c)`` pair from a previous call; ``None``
                starts the LSTM from its default initial state.

        Returns:
            tuple: ``(q_value, hidden_state)`` where ``q_value`` has a trailing
            dimension of 1 and ``hidden_state`` is the updated LSTM state.
        """
        joint_input = torch.cat((obs, action), dim=-1)
        encoded = self.fc1(joint_input).relu()
        # nn.LSTM accepts hx=None, so one call covers both the fresh and the carried-state case
        lstm_out, next_hidden = self.lstm_layer(encoded, hidden_state)
        return self.q_layer(lstm_out), next_hidden

In [4]:
class ReplayMemory:
    """
    Fixed-capacity FIFO store of ``(obs, action, reward, next_obs, done)`` transitions.

    Args:
        capacity (int): Maximum number of transitions kept; once full, the
            oldest transition is discarded for every new one added.
        device (torch.device, optional): Device for sampled tensors. When
            omitted, the module-level ``device`` is used at sampling time.
    """

    def __init__(self, capacity, device=None):
        self.buffer = deque(maxlen=capacity)
        self.device = device

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add(self, obs, action, reward, next_obs, done):
        """Append one transition, evicting the oldest if the buffer is full."""
        self.buffer.append((obs, action, reward, next_obs, done))

    def sample(self, batch_size):
        """
        Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            tuple: float32 tensors ``(obs, action, reward, next_obs, done)``;
            ``reward`` and ``done`` get a trailing dimension of size 1.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored transitions
                (raised by ``np.random.choice`` with ``replace=False``).
        """
        idx = np.random.choice(len(self.buffer), batch_size, replace=False)
        obs, action, reward, next_obs, done = zip(*(self.buffer[i] for i in idx))
        target_device = self.device if self.device is not None else device

        def as_float_tensor(items):
            # Stack in NumPy first: torch.tensor() on a sequence of ndarrays
            # converts element by element and is far slower.
            return torch.tensor(np.asarray(items), dtype=torch.float32).to(target_device)

        return (
            as_float_tensor(obs),
            as_float_tensor(action),
            as_float_tensor(reward).unsqueeze(1),
            as_float_tensor(next_obs),
            as_float_tensor(done).unsqueeze(1),
        )

In [5]:
def soft_q_loss(critic, target_critic, actor, obs, action, reward, next_obs, done, alpha, gamma):
    """
    Mean-squared soft Bellman error for the critic.

    The target is ``reward + gamma * (1 - done) * (Q_target(s', a') - alpha * log pi(a'|s'))``
    with ``a'`` sampled from the actor's Gaussian at ``next_obs``. The target is
    built without gradient tracking, so only ``critic`` receives gradients.

    Args:
        critic: Callable ``(obs, action) -> (q_value, hidden_state)`` being trained.
        target_critic: Callable with the same signature, used for the target.
        actor: Callable ``obs -> (mean, log_std, hidden_state)``.
        obs, action, reward, next_obs, done (Tensor): Transition batch.
        alpha: Entropy temperature.
        gamma (float): Discount factor.

    Returns:
        Tensor: Scalar loss.
    """
    with torch.no_grad():
        next_mean, next_log_std, _ = actor(next_obs)
        next_policy = torch.distributions.Normal(next_mean, torch.exp(next_log_std))
        sampled_next_action = next_policy.rsample()
        # Joint log-density of the sampled action: sum over action components
        next_log_prob = next_policy.log_prob(sampled_next_action).sum(dim=-1, keepdim=True)
        next_q, _ = target_critic(next_obs, sampled_next_action)
        soft_next_value = next_q - alpha * next_log_prob
        bellman_target = reward + gamma * (1 - done) * soft_next_value

    predicted_q, _ = critic(obs, action)
    return F.mse_loss(predicted_q, bellman_target)

In [6]:
def policy_loss(actor, critic, obs, alpha):
    """
    Actor objective: batch mean of ``alpha * log pi(a|s) - Q(s, a)``.

    The action is drawn with the reparameterisation trick (``rsample``), so the
    loss is differentiable with respect to the actor's outputs.

    Args:
        actor: Callable ``obs -> (mean, log_std, hidden_state)``.
        critic: Callable ``(obs, action) -> (q_value, hidden_state)``.
        obs (Tensor): Observation batch.
        alpha: Entropy temperature.

    Returns:
        Tensor: Scalar loss.
    """
    mean, log_std, _ = actor(obs)
    policy = torch.distributions.Normal(mean, torch.exp(log_std))
    sampled_action = policy.rsample()
    # Joint log-density of the sampled action: sum over action components
    log_prob = policy.log_prob(sampled_action).sum(dim=-1, keepdim=True)
    q_value, _ = critic(obs, sampled_action)
    entropy_term = alpha * log_prob
    return torch.mean(entropy_term - q_value)

In [7]:
def flatten_observation(obs):
    """
    Flatten dictionary-based observations into a single 1-D array.

    Entries are concatenated in the dictionary's iteration order, each one
    flattened in row-major order. Values may be arrays, nested sequences or
    plain scalars; they are converted with ``np.asarray`` first.

    Args:
        obs (dict): Mapping from observation key to array-like value.

    Returns:
        np.ndarray: 1-D concatenation of all entries.

    Raises:
        ValueError: If ``obs`` is empty (``np.concatenate`` needs at least one array).
    """
    # np.asarray lets lists and Python scalars through; calling .flatten()
    # directly on the value only works for ndarrays.
    return np.concatenate([np.asarray(value).ravel() for value in obs.values()])

In [8]:
def plot_training_curves(log_dir, timestamp):
    """
    Generate a multi-panel training overview from TensorBoard scalar logs.

    Scalars are read from the event files in ``log_dir``. For each known
    metric that has data, a panel shows the raw series, a moving average (more
    than 5 points), a linear trend line (more than 1 point) and a summary
    statistics box. The figure is written to
    ``<log_dir>/training_curves2_<timestamp>.png``.

    This is a best-effort helper: any failure is reported on stdout together
    with a traceback, and the function returns normally. The figure is closed
    on both the success and the failure path.

    Args:
        log_dir (str): Directory containing training logs
        timestamp (str): Timestamp for file naming
    """
    fig = None  # set once the figure exists so cleanup knows whether to close it
    try:
        from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
        import pandas as pd

        # Load TensorBoard logs
        event_acc = EventAccumulator(log_dir)
        event_acc.Reload()

        # Print available tags to debug missing data
        scalar_tags = event_acc.Tags()['scalars']
        print(f"Available tags in logs: {scalar_tags}")

        # Extract metrics from logs
        metrics_data = {}
        for tag in scalar_tags:
            events = event_acc.Scalars(tag)
            metrics_data[tag] = {
                'steps': [e.step for e in events],
                'values': [e.value for e in events],
            }

        # Create main figure
        fig = plt.figure(figsize=(30, 20))  # Adjust figure size if needed
        gs = plt.GridSpec(3, 2, figure=fig)

        # Plot configurations: (scalar tag, panel title, grid cell, y-axis label)
        plot_configs = [
            ('Reward/Episode', 'Episode Rewards', gs[0, 0], 'Reward'),
            ('Reward/Average', 'Average Reward', gs[0, 1], 'Reward'),
            ('Loss/Actor', 'Actor Loss', gs[1, 0], 'Loss'),
            ('Loss/Critic', 'Critic Loss', gs[1, 1], 'Loss'),
            ('Loss/Alpha', 'Alpha Loss', gs[2, 0], 'Loss'),
            ('Success/Consecutive', 'Consecutive Successes', gs[2, 1], 'Count'),
        ]

        for metric_name, title, position, ylabel in plot_configs:
            data = metrics_data.get(metric_name)
            if data is None:
                continue

            steps = data['steps']
            values = data['values']

            # Check before creating the axes so an empty metric leaves no blank panel
            if len(values) == 0:
                print(f"No data for metric: {metric_name}")
                continue

            ax = fig.add_subplot(position)

            # Plot raw data
            ax.plot(steps, values, 'b-', alpha=0.3, label='Raw Data')

            # Add moving average for smoothing
            if len(values) > 5:
                window_size = min(10, len(values) // 5)
                series = pd.Series(values, index=steps)
                rolling_mean = series.rolling(window=window_size, min_periods=1).mean()
                ax.plot(steps, rolling_mean, 'r-', linewidth=2,
                        label=f'{window_size}-point Moving Average')

            # Add trend line (degree-1 least-squares fit)
            if len(values) > 1:
                z = np.polyfit(steps, values, 1)
                p = np.poly1d(z)
                ax.plot(steps, p(steps), 'g--', alpha=0.8,
                        label=f'Trend (slope: {z[0]:.2e})')

            # Add statistics box to the right of the axes
            stats = {
                'Mean': np.mean(values),
                'Std': np.std(values),
                'Max': np.max(values),
                'Min': np.min(values),
                'Latest': values[-1]
            }
            stats_text = '\n'.join([f'{k}: {v:.3f}' for k, v in stats.items()])
            ax.text(1.02, 0.5, stats_text,
                    transform=ax.transAxes,
                    bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray'),
                    verticalalignment='center')

            # Customize plot
            ax.set_title(title, pad=10, fontsize=12, fontweight='bold')
            ax.set_xlabel('Episode', fontsize=10)
            ax.set_ylabel(ylabel, fontsize=10)
            ax.grid(True, alpha=0.3)
            ax.legend(loc='upper left')

            # Add minor grid
            ax.minorticks_on()
            ax.grid(True, which='minor', alpha=0.1)

        # Add overall title
        fig.suptitle('Training Progress Overview', fontsize=16, y=0.95, fontweight='bold')

        # Adjust layout and save
        fig.tight_layout(rect=[0, 0.03, 1, 0.95])
        save_path = os.path.join(log_dir, f'training_curves2_{timestamp}.png')
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"\nTraining curves saved to: {save_path}")

    except Exception as e:
        # Deliberately broad: plotting must never take down the caller's cleanup path
        print(f"Error while plotting training curves: {str(e)}")
        import traceback
        traceback.print_exc()

    finally:
        # Release the figure even when plotting or saving failed part-way
        if fig is not None:
            plt.close(fig)

In [None]:
def train_rsac(env, num_episodes, batch_size, actor_lr, critic_lr, alpha_lr, gamma=0.99, tau=0.005):
    """
    Train the LSTM-based soft actor-critic agent on a dictionary-observation environment.

    Each step the actor (carrying its LSTM state through the episode) samples an
    action, the transition is stored in a replay buffer, and, once the buffer
    holds more than ``batch_size`` transitions, the critic, actor, entropy
    temperature and target critic are updated once.

    Args:
        env: Gymnasium environment whose ``observation_space`` exposes ``.spaces``
            (dict of sub-spaces) and whose ``action_space.shape[0]`` is the
            action size. The environment is closed before this function returns.
        num_episodes (int): Maximum number of training episodes.
        batch_size (int): Transitions sampled per gradient step.
        actor_lr (float): Adam learning rate for the actor.
        critic_lr (float): Adam learning rate for the critic.
        alpha_lr (float): Adam learning rate for the entropy temperature.
        gamma (float): Discount factor.
        tau (float): Polyak coefficient for the target-critic update.

    Side effects:
        Writes TensorBoard scalars to ``./rsac_baselogs/<timestamp>``, saves actor
        checkpoints to ``./models2``, prints progress, and finally renders the
        training curves into the log directory.
    """
    # Flatten dictionary-based observations; cast so nn.Linear receives a plain int
    obs_dim = int(sum(np.prod(space.shape) for space in env.observation_space.spaces.values()))
    action_dim = env.action_space.shape[0]

    actor = Actor(obs_dim, action_dim).to(device)
    critic = Critic(obs_dim, action_dim).to(device)
    target_critic = Critic(obs_dim, action_dim).to(device)
    target_critic.load_state_dict(critic.state_dict())

    actor_optimizer = optim.Adam(actor.parameters(), lr=actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=critic_lr)

    # Optimise log(alpha) so the temperature stays positive; initial alpha is 0.2.
    log_alpha = torch.tensor(float(np.log(0.2)), requires_grad=True, device=device)
    alpha_optimizer = optim.Adam([log_alpha], lr=alpha_lr)
    # Common SAC heuristic: target entropy of -|A| nats
    target_entropy = -float(action_dim)

    replay_buffer = ReplayMemory(capacity=100000)

    # Step limit: prefer the public spec; the private TimeLimit attribute is only
    # reachable while TimeLimit is the outermost wrapper.
    max_steps = getattr(getattr(env, "spec", None), "max_episode_steps", None)
    if max_steps is None:
        max_steps = getattr(env, "_max_episode_steps", None)

    # Visualization Setup
    base_dir = os.getcwd()
    model_dir = os.path.join(base_dir, 'models2')
    os.makedirs(model_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    log_dir = os.path.join(base_dir, 'rsac_baselogs', timestamp)
    os.makedirs(log_dir, exist_ok=True)

    writer = SummaryWriter(log_dir)
    reward_buffer = np.empty(shape=num_episodes)
    best_reward = -float('inf')
    reward_window = deque(maxlen=100)
    success_count = 0
    consecutive_successes = 0
    max_consecutive_successes = 0

    # Losses stay None until the first gradient step so logging can skip them safely
    actor_loss = critic_loss = alpha_loss = None

    print(f"Logging to TensorBoard at {log_dir}")

    try:
        for episode in range(num_episodes):
            obs_dict, _ = env.reset()
            obs = torch.tensor(flatten_observation(obs_dict), dtype=torch.float32).unsqueeze(0).to(device)
            episode_reward = 0
            hidden_state_actor = None
            success = False
            steps = 0

            # Rewards are fed to the agent as the change in the raw environment
            # reward between consecutive steps, so the previous *raw* reward is
            # what must be remembered (the first step's baseline is 0).
            prev_raw_reward = 0

            while max_steps is None or steps < max_steps:
                # No autograd graph during rollout: otherwise the carried LSTM
                # state would chain every step of the episode into one graph.
                with torch.no_grad():
                    mean, log_std, hidden_state_actor = actor(obs, hidden_state_actor)
                    action = torch.distributions.Normal(mean, log_std.exp()).sample()
                action = action.squeeze(0).cpu().numpy()

                next_obs_dict, reward_t, done, truncated, info = env.step(action)
                steps += 1
                reward = reward_t - prev_raw_reward
                prev_raw_reward = reward_t

                next_obs = torch.tensor(flatten_observation(next_obs_dict), dtype=torch.float32).unsqueeze(0).to(device)
                # Store only true termination: a time-limit truncation is not a
                # terminal state, so the critic should still bootstrap from it.
                replay_buffer.add(obs.squeeze(0).cpu().numpy(), action, reward, next_obs.squeeze(0).cpu().numpy(), done)
                obs = next_obs
                episode_reward += reward

                if info.get('is_success'):
                    success = True

                if len(replay_buffer.buffer) > batch_size:
                    # NOTE(review): the sampled batch is 2-D (batch, features); nn.LSTM
                    # reads a 2-D input as one unbatched sequence, so these unrelated
                    # transitions are processed as a single sequence — confirm intended.
                    obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = replay_buffer.sample(batch_size)

                    # Detached so the actor/critic losses do not push gradients into log_alpha
                    alpha = log_alpha.exp().detach()

                    # Critic Update
                    critic_loss = soft_q_loss(critic, target_critic, actor, obs_batch, action_batch, reward_batch, next_obs_batch, done_batch, alpha, gamma)
                    critic_optimizer.zero_grad()
                    critic_loss.backward()
                    critic_optimizer.step()

                    # Actor Update
                    actor_loss = policy_loss(actor, critic, obs_batch, alpha)
                    actor_optimizer.zero_grad()
                    actor_loss.backward()
                    actor_optimizer.step()

                    # Alpha Update: move the policy's entropy towards target_entropy
                    with torch.no_grad():
                        batch_mean, batch_log_std, _ = actor(obs_batch)
                        batch_dist = torch.distributions.Normal(batch_mean, batch_log_std.exp())
                        batch_log_prob = batch_dist.log_prob(batch_dist.sample()).sum(dim=-1, keepdim=True)
                    alpha_loss = -(log_alpha * (batch_log_prob + target_entropy)).mean()
                    alpha_optimizer.zero_grad()
                    alpha_loss.backward()
                    alpha_optimizer.step()

                    # Update Target Critic (Polyak averaging)
                    with torch.no_grad():
                        for target_param, param in zip(target_critic.parameters(), critic.parameters()):
                            target_param.copy_(tau * param + (1 - tau) * target_param)

                if done or truncated:
                    break

            # Track success and consecutive successes
            if success:
                consecutive_successes += 1
                max_consecutive_successes = max(max_consecutive_successes, consecutive_successes)
            else:
                consecutive_successes = 0

            success_count += int(success)

            # Logging
            reward_buffer[episode] = episode_reward
            reward_window.append(episode_reward)
            avg_reward = np.mean(list(reward_window))

            writer.add_scalar("Reward/Episode", episode_reward, episode)
            writer.add_scalar("Reward/Average", avg_reward, episode)
            if critic_loss is not None:
                # Most recent losses; absent until the buffer has allowed a first update
                writer.add_scalar("Loss/Actor", actor_loss.item(), episode)
                writer.add_scalar("Loss/Critic", critic_loss.item(), episode)
                writer.add_scalar("Loss/Alpha", alpha_loss.item(), episode)
            writer.add_scalar("Success/Consecutive", consecutive_successes, episode)

            # Print Episode Summary
            print(f"\nEpisode: {episode}")
            print(f"Reward: {episode_reward:.2f}")
            print(f"Average Reward: {avg_reward:.2f}")
            print(f"Steps: {steps}")
            print("-" * 50)

            # Periodic Evaluation
            if episode % 2 == 0:
                print(f"Evaluation at episode {episode}:")
                print(f"Mean reward: {np.mean(reward_buffer[:episode+1]):.2f}")
                print(f"Success rate: {success_count / (episode + 1):.2f}")
                print(f"Max consecutive successes: {max_consecutive_successes}")

            # Save Best Model
            if episode_reward > best_reward:
                best_reward = episode_reward
                torch.save(actor.state_dict(), os.path.join(model_dir, f"actor_best_{timestamp}.pth"))
                print(f"New best reward: {best_reward:.2f}!")

            # Save if early stopping condition met
            if consecutive_successes >= 50:
                print("\nTarget consecutive successes achieved! Stopping early.")
                break

        print("Training completed successfully!")

    except KeyboardInterrupt:
        print("\nTraining interrupted by user. Performing cleanup...")
        torch.save(actor.state_dict(), os.path.join(model_dir, f"actor_interrupted_{timestamp}.pth"))
        print(f"Training interrupted! Partial model saved to {model_dir}")

    finally:
        writer.close()
        env.close()
        plot_training_curves(log_dir, timestamp)
        print(f"Visualizations saved to {log_dir}")


# Build the HandManipulateBlockDense-v1 environment and start RSAC training on it
env = gym.make("HandManipulateBlockDense-v1")
train_rsac(
    env=env,
    num_episodes=5000,
    batch_size=64,
    actor_lr=1e-4,
    critic_lr=1e-4,
    alpha_lr=1e-4,
)
