In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import os
from stable_baselines3 import PPO, A2C, SAC, TD3, DQN
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv

from EnvLibs import Environment, RewardKernel, TrafficGenerator
from EnvLibs.DRL_EnvSim import DRLResourceSchedulingEnv
from EnvLibs.DRL_config import (
    get_algorithm_config, 
    get_environment_config, 
    get_training_config,
    print_algorithm_info
)

In [2]:
class TrainingCallback(BaseCallback):
    """Custom callback to track detailed training performance.

    After every environment step the callback inspects the ``infos`` and
    ``dones`` entries of ``self.locals`` and, for each sub-environment whose
    episode just ended, records:

    * ``episode_rewards`` / ``episode_lengths`` -- taken from the ``'episode'``
      info dict (keys ``'r'`` and ``'l'``) when such a dict is present
      (presumably the one added by the ``Monitor`` wrapper).
    * ``episode_loss_rates`` / ``timesteps`` -- the ``'total_packet_loss_rate'``
      info entry and the global timestep at which it was observed. The two
      lists are always appended together so they can be plotted against
      each other.

    ``episode_alphas`` is kept for interface compatibility but is not
    populated by this callback.

    Args:
        algorithm_name: Label of the algorithm being trained.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, algorithm_name: str, verbose=0):
        super().__init__(verbose)
        self.algorithm_name = algorithm_name
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_alphas = []
        self.episode_loss_rates = []
        self.timesteps = []
        self.episodes = 0

    def _on_step(self) -> bool:
        """Record end-of-episode metrics; always returns True to continue training."""
        infos = self.locals.get('infos')
        if infos is None:
            infos = []
        dones = self.locals.get('dones')
        if dones is None:
            dones = []

        # Look at every sub-environment, not just the first one, so that
        # vectorized training does not silently drop episodes.
        for env_idx, info in enumerate(infos):
            done = bool(dones[env_idx]) if env_idx < len(dones) else False
            if 'episode' not in info and not done:
                continue

            self.episodes += 1

            # Episode return/length, when the info dict carries them.
            episode_stats = info.get('episode')
            if isinstance(episode_stats, dict) and 'r' in episode_stats and 'l' in episode_stats:
                self.episode_rewards.append(episode_stats['r'])
                self.episode_lengths.append(episode_stats['l'])

            # Loss rate and its timestep are appended as a pair.
            if 'total_packet_loss_rate' in info:
                self.episode_loss_rates.append(info['total_packet_loss_rate'])
                self.timesteps.append(self.num_timesteps)

        return True

In [3]:
def create_environment(action_mode="full_action", use_real_traffic=True, traffic_update_mode="sequential", seed=None,
                       traffic_data_path="Results/TrafficData/trafficData.pkl"):
    """Create and return the resource scheduling environment.

    The sizing parameters (users, window length, r_bar, bandwidth, episode
    length, reward mode) are read from ``get_environment_config()``; the
    remaining options are forwarded from the arguments below.

    Args:
        action_mode: Passed through as the environment's ``action_mode``.
        use_real_traffic: Passed through as ``use_real_traffic``.
        traffic_update_mode: Passed through as ``traffic_update_mode``.
        seed: Passed through as the environment's ``random_seed``.
        traffic_data_path: Location of the traffic data file handed to the
            environment. Defaults to the path that was previously hard-coded.

    Returns:
        The ``DRLResourceSchedulingEnv`` instance wrapped in ``Monitor``.
    """
    env_config = get_environment_config()

    env = DRLResourceSchedulingEnv(
        n_users=env_config["n_users"],
        len_window=env_config["len_window"],
        r_bar=env_config["r_bar"],
        bandwidth=env_config["bandwidth"],
        action_mode=action_mode,
        observation_mode="full",
        max_episode_steps=env_config["max_episode_steps"],
        random_seed=seed,
        reward_mode=env_config["reward_mode"],
        traffic_data_path=traffic_data_path,
        use_real_traffic=use_real_traffic,
        traffic_update_mode=traffic_update_mode
    )
    return Monitor(env)

In [4]:
def train_drl_agent(algorithm_name: str, env, total_timesteps=None, save_path=None):
    """Train a DRL agent using the specified algorithm.

    Args:
        algorithm_name: Name looked up through ``get_algorithm_config``.
        env: Environment to train on (may be wrapped, e.g. by ``Monitor``).
        total_timesteps: Number of training timesteps; falls back to the
            ``"total_timesteps"`` entry of ``get_training_config()``.
        save_path: Where to save the trained model. When omitted, a path
            inside the configured ``models_dir`` is built from the algorithm
            name and the environment's ``n_users`` / ``bandwidth``.

    Returns:
        Tuple ``(model, callback, training_time)`` where ``callback`` is the
        ``TrainingCallback`` used during learning and ``training_time`` is the
        wall-clock duration of ``model.learn`` in seconds.
    """
    # Get configurations
    training_config = get_training_config()
    if total_timesteps is None:
        total_timesteps = training_config["total_timesteps"]

    # Resolve the underlying environment unconditionally: it is needed for the
    # summary printed below even when the caller supplies an explicit
    # save_path (previously that case raised NameError).
    underlying_env = env.unwrapped if hasattr(env, 'unwrapped') else env
    n_users = underlying_env.n_users
    bandwidth = underlying_env.bandwidth

    if save_path is None:
        save_path = f"{training_config['models_dir']}/{algorithm_name.lower()}_mdp_scheduling_N_u={n_users},B={bandwidth}"

    print(f"\n{'='*60}")
    print(f"Training {algorithm_name} Agent")
    print(f"{'='*60}")
    print(f"Total timesteps: {total_timesteps}")
    print(f"Environment: {n_users} users, {bandwidth} bandwidth")
    print(f"Save path: {save_path}.zip")

    # Get algorithm configuration
    config = get_algorithm_config(algorithm_name, env)
    algorithm_class = config["class"]
    params = config["params"]

    # Create callback to track performance
    callback = TrainingCallback(algorithm_name)

    # Create model
    model = algorithm_class(
        "MlpPolicy",
        env,
        verbose=1,
        device='cpu',
        **params
    )

    # Train the model
    start_time = time.time()
    print("Starting training...")
    model.learn(total_timesteps=total_timesteps, callback=callback)
    training_time = time.time() - start_time

    # Save the model. The configured models directory is always created; a
    # caller-supplied path may live elsewhere, so create its parent as well.
    os.makedirs(training_config["models_dir"], exist_ok=True)
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    model.save(save_path)

    print(f"{algorithm_name} training completed in {training_time:.2f} seconds")
    print(f"Model saved to: {save_path}.zip")

    return model, callback, training_time

In [5]:
def evaluate_model(model, env, algorithm_name, num_episodes=10, eval_seed=42):
    """Roll out ``model`` on ``env`` and summarise the resulting episodes.

    Episode ``k`` is reset with seed ``eval_seed + k`` so that different
    algorithms can be compared on the same sequence of seeds. Actions are
    chosen with ``deterministic=True``.

    Returns:
        Dict with the algorithm name, mean/std of episode reward and final
        packet loss rate, mean per-episode alpha, mean episode length, and
        the raw per-episode lists of rewards, loss rates and mean alphas.
    """
    reward_totals = []
    final_loss_rates = []
    mean_alphas = []
    lengths = []

    for ep_idx in range(num_episodes):
        obs, _ = env.reset(seed=eval_seed + ep_idx)
        reward_total = 0
        alphas = []
        finished = False

        # Step until the environment signals termination or truncation.
        while not finished:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            reward_total += reward
            alphas.append(info['alpha'])
            finished = terminated or truncated

        # `info` is the dict returned by the last step of the episode.
        reward_totals.append(reward_total)
        final_loss_rates.append(info['total_packet_loss_rate'])
        mean_alphas.append(np.mean(alphas))
        lengths.append(len(alphas))  # one alpha is recorded per step

    summary = {'algorithm': algorithm_name}
    summary['avg_reward'] = np.mean(reward_totals)
    summary['std_reward'] = np.std(reward_totals)
    summary['avg_loss_rate'] = np.mean(final_loss_rates)
    summary['std_loss_rate'] = np.std(final_loss_rates)
    summary['avg_alpha'] = np.mean(mean_alphas)
    summary['avg_episode_length'] = np.mean(lengths)
    summary['episode_rewards'] = reward_totals
    summary['episode_loss_rates'] = final_loss_rates
    summary['episode_alphas'] = mean_alphas
    return summary

In [6]:
def evaluate_drl_agent(model, env, algorithm_name: str, num_episodes=None, eval_seed=None):
    """Run evaluation episodes for a trained agent, print and return a summary.

    ``num_episodes`` and ``eval_seed`` default to the ``"eval_episodes"`` and
    ``"eval_seed"`` entries of ``get_training_config()``. Episode ``k`` is
    reset with seed ``eval_seed + k`` and actions are chosen deterministically.

    Returns:
        Dict with aggregate statistics (mean/std reward and loss rate, mean
        alpha, mean episode length) plus the per-episode rewards, loss rates,
        mean alphas and the list of actions taken in each episode.
    """
    training_config = get_training_config()
    if num_episodes is None:
        num_episodes = training_config["eval_episodes"]
    if eval_seed is None:
        eval_seed = training_config["eval_seed"]

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Evaluating {algorithm_name} Agent")
    print(f"{banner}")

    reward_totals, final_loss_rates, mean_alphas = [], [], []
    lengths, actions_by_episode = [], []

    for ep_idx in range(num_episodes):
        obs, _ = env.reset(seed=eval_seed + ep_idx)
        reward_total = 0
        alphas = []
        actions_taken = []
        finished = False

        while not finished:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            reward_total += reward
            alphas.append(info['alpha'])
            # Store a copy when the action supports it so that later changes
            # to the returned object cannot alter the recorded history.
            if hasattr(action, 'copy'):
                actions_taken.append(action.copy())
            else:
                actions_taken.append(action)
            finished = terminated or truncated

        # `info` here is the dict from the final step of the episode.
        reward_totals.append(reward_total)
        final_loss_rates.append(info['total_packet_loss_rate'])
        mean_alphas.append(np.mean(alphas))
        lengths.append(len(alphas))  # one alpha is recorded per step
        actions_by_episode.append(actions_taken)

    # Aggregate statistics over all evaluation episodes
    results = dict(
        algorithm=algorithm_name,
        avg_reward=np.mean(reward_totals),
        std_reward=np.std(reward_totals),
        avg_loss_rate=np.mean(final_loss_rates),
        std_loss_rate=np.std(final_loss_rates),
        avg_alpha=np.mean(mean_alphas),
        avg_episode_length=np.mean(lengths),
        episode_rewards=reward_totals,
        episode_loss_rates=final_loss_rates,
        episode_alphas=mean_alphas,
        episode_actions=actions_by_episode,
    )

    print("Evaluation Results:")
    print(f"  Average Reward: {results['avg_reward']:.4f} ± {results['std_reward']:.4f}")
    print(f"  Average Loss Rate: {results['avg_loss_rate']:.4f} ± {results['std_loss_rate']:.4f}")
    print(f"  Average Alpha: {results['avg_alpha']:.4f}")
    print(f"  Average Episode Length: {results['avg_episode_length']:.2f}")

    return results


def _plot_action_distribution(ax, episode_actions):
    """Draw a histogram of the actions taken during evaluation on ``ax``.

    ``episode_actions`` is a list with one list of actions per episode. Vector
    actions with more than one component get one histogram per component
    (first four only); scalar or single-component actions are flattened into
    a single histogram. Any failure is reported inside the panel instead of
    aborting the whole figure.
    """
    if not episode_actions:
        ax.text(0.5, 0.5, 'No action data available',
                transform=ax.transAxes, ha='center', va='center')
        ax.set_title('Action Distribution')
        return

    try:
        first_action = episode_actions[0][0]
        # np.ndim/np.size work for Python scalars and 0-d arrays, where
        # len() would raise TypeError.
        if np.ndim(first_action) >= 1 and np.size(first_action) > 1:
            # Continuous actions: one row per step, one column per dimension
            action_matrix = np.array(
                [np.ravel(action) for ep_actions in episode_actions for action in ep_actions]
            )
            # Plot distribution of first few action dimensions
            colors = ['blue', 'orange', 'green', 'red']
            for i in range(min(4, action_matrix.shape[1])):
                ax.hist(action_matrix[:, i], bins=20, alpha=0.5,
                        color=colors[i % len(colors)], label=f'Action {i}')
            ax.set_title('Action Distribution (First 4 Dimensions)')
            ax.set_xlabel('Action Value')
            ax.set_ylabel('Frequency')
            ax.legend()
        else:
            # Discrete / single-component actions: flatten everything into one
            # 1-D array so hist() sees a single dataset rather than one
            # dataset per recorded action.
            flat_actions = np.concatenate(
                [np.ravel(action) for ep_actions in episode_actions for action in ep_actions]
            )
            ax.hist(flat_actions, bins=20, alpha=0.7, color='cyan')
            ax.set_title('Action Distribution (Discrete)')
            ax.set_xlabel('Action Value')
            ax.set_ylabel('Frequency')
    except Exception as e:
        # Best-effort panel: show the problem in place of the histogram.
        ax.text(0.5, 0.5, f'Action analysis failed:\n{str(e)}',
                transform=ax.transAxes, ha='center', va='center')
        ax.set_title('Action Analysis (Error)')


def plot_training_results(callback, eval_results, algorithm_name: str, save_plots=True):
    """Plot training progress and evaluation results.

    Builds a 2x3 figure: training loss rate over time, evaluation reward /
    loss-rate / alpha histograms, the action distribution, and a bar-chart
    summary of the averaged metrics.

    Args:
        callback: Object exposing ``episode_loss_rates`` and ``timesteps``
            lists (e.g. a ``TrainingCallback``).
        eval_results: Results dict as produced by ``evaluate_drl_agent``. The
            ``'episode_actions'`` entry is optional, so a dict from
            ``evaluate_model`` is accepted as well.
        algorithm_name: Used in the figure title and the output file name.
        save_plots: When true, the figure is written as a PNG into the
            ``"plots_dir"`` directory from ``get_training_config()``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle(f'{algorithm_name} Agent Training and Evaluation Results', fontsize=16)

    # 1. Training Loss Rates Over Time
    if callback.episode_loss_rates and callback.timesteps:
        ax1 = axes[0, 0]
        ax1.plot(callback.timesteps, callback.episode_loss_rates, alpha=0.7, color='blue')
        ax1.set_title('Training: Loss Rate Over Time')
        ax1.set_xlabel('Training Timesteps')
        ax1.set_ylabel('Packet Loss Rate')
        ax1.grid(True, alpha=0.3)
    else:
        axes[0, 0].text(0.5, 0.5, 'No training data available',
                        transform=axes[0, 0].transAxes, ha='center', va='center')
        axes[0, 0].set_title('Training: Loss Rate Over Time')

    # 2. Evaluation Reward Distribution
    ax2 = axes[0, 1]
    ax2.hist(eval_results['episode_rewards'], bins=15, alpha=0.7, color='orange')
    ax2.axvline(eval_results['avg_reward'], color='red', linestyle='--',
                label=f'Mean: {eval_results["avg_reward"]:.4f}')
    ax2.set_title('Evaluation: Reward Distribution')
    ax2.set_xlabel('Episode Reward')
    ax2.set_ylabel('Frequency')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Evaluation Loss Rate Distribution
    ax3 = axes[0, 2]
    ax3.hist(eval_results['episode_loss_rates'], bins=15, alpha=0.7, color='green')
    ax3.axvline(eval_results['avg_loss_rate'], color='red', linestyle='--',
                label=f'Mean: {eval_results["avg_loss_rate"]:.4f}')
    ax3.set_title('Evaluation: Loss Rate Distribution')
    ax3.set_xlabel('Packet Loss Rate')
    ax3.set_ylabel('Frequency')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Alpha Value Distribution
    ax4 = axes[1, 0]
    ax4.hist(eval_results['episode_alphas'], bins=15, alpha=0.7, color='purple')
    ax4.axvline(eval_results['avg_alpha'], color='red', linestyle='--',
                label=f'Mean: {eval_results["avg_alpha"]:.4f}')
    ax4.set_title('Evaluation: Alpha Value Distribution')
    ax4.set_xlabel('Alpha Value')
    ax4.set_ylabel('Frequency')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    # 5. Action Space Analysis (actions may be absent from the results dict)
    ax5 = axes[1, 1]
    _plot_action_distribution(ax5, eval_results.get('episode_actions'))
    ax5.grid(True, alpha=0.3)

    # 6. Performance Summary
    ax6 = axes[1, 2]
    metrics = ['Avg Reward', 'Avg Loss Rate', 'Avg Alpha']
    values = [eval_results['avg_reward'], eval_results['avg_loss_rate'], eval_results['avg_alpha']]
    # No standard deviation is computed for alpha, hence the zero error bar.
    errors = [eval_results['std_reward'], eval_results['std_loss_rate'], 0]
    colors = ['skyblue', 'lightcoral', 'lightgreen']

    bars = ax6.bar(metrics, values, yerr=errors, capsize=5, alpha=0.7, color=colors)
    ax6.set_title('Performance Summary')
    ax6.set_ylabel('Value')
    ax6.grid(True, alpha=0.3)

    # Add value labels on bars: just above positive bars, just below negative ones
    for bar, value in zip(bars, values):
        height = bar.get_height()
        ax6.text(bar.get_x() + bar.get_width()/2, height + (0.01 * height if height > 0 else -0.01 * abs(height)),
                 f'{value:.4f}', ha='center', va='bottom' if height > 0 else 'top')

    plt.tight_layout()

    if save_plots:
        training_config = get_training_config()
        os.makedirs(training_config["plots_dir"], exist_ok=True)
        plot_path = f'{training_config["plots_dir"]}/{algorithm_name.lower()}_results.png'
        plt.savefig(plot_path, dpi=300, bbox_inches='tight')
        print(f"Plot saved to: {plot_path}")

    plt.show()

In [7]:
def load_and_evaluate(algorithm_name: str, model_path: str, action_mode="full_action", num_episodes=None):
    """Load a trained model and evaluate it.

    A fresh environment is created with ``create_environment`` for the given
    ``action_mode``, the algorithm class is looked up through
    ``get_algorithm_config``, and the model stored at ``model_path`` is
    evaluated with ``evaluate_drl_agent``.

    Args:
        algorithm_name: Name of the algorithm the model was trained with.
        model_path: Path of the saved model file.
        action_mode: Action mode of the evaluation environment.
        num_episodes: Number of evaluation episodes; ``None`` defers to the
            default used by ``evaluate_drl_agent``.

    Returns:
        Tuple ``(model, eval_results)``.
    """
    print(f"Loading {algorithm_name} model from {model_path}")

    # Create environment
    env = create_environment(action_mode=action_mode)

    # Close the environment even if loading or evaluation fails.
    try:
        # Get algorithm class
        config = get_algorithm_config(algorithm_name, env)
        algorithm_class = config["class"]

        # Load model
        model = algorithm_class.load(model_path)

        # Evaluate
        eval_results = evaluate_drl_agent(model, env, algorithm_name, num_episodes)
    finally:
        env.close()

    return model, eval_results


def main(algorithm_name: str = "SAC", total_timesteps: int = None, action_mode: str = "full_action"):
    """Main function to train and evaluate a DRL agent.

    Creates the environment (seed 42), validates it with ``check_env``,
    trains the requested algorithm, evaluates the trained model and prints a
    final summary.

    Args:
        algorithm_name: Algorithm to train.
        total_timesteps: Training timesteps; falls back to the
            ``"total_timesteps"`` entry of ``get_training_config()``.
        action_mode: Action mode passed to ``create_environment``.

    Returns:
        Tuple ``(model, eval_results)``.
    """
    print("="*80)
    print(f"{algorithm_name} Agent Training and Evaluation")
    print("="*80)

    # Get training configuration
    training_config = get_training_config()
    if total_timesteps is None:
        total_timesteps = training_config["total_timesteps"]

    # Create environment
    print("Creating environment...")
    env = create_environment(action_mode=action_mode, seed=42)

    # Make sure the environment is closed even if training/evaluation fails.
    try:
        # Check environment
        print("Checking environment...")
        check_env(env.unwrapped)
        print("Environment check passed!")

        # Train DRL agent
        model, callback, training_time = train_drl_agent(algorithm_name, env, total_timesteps)

        # Evaluate DRL agent
        eval_results = evaluate_drl_agent(model, env, algorithm_name)

        # Reconstruct the default save path used by train_drl_agent (which
        # embeds n_users and bandwidth) so the summary reports the file that
        # was actually written.
        underlying_env = env.unwrapped if hasattr(env, 'unwrapped') else env
        saved_model_path = (
            f"{training_config['models_dir']}/{algorithm_name.lower()}_mdp_scheduling"
            f"_N_u={underlying_env.n_users},B={underlying_env.bandwidth}.zip"
        )

        # Print final summary
        print(f"\n{'='*80}")
        print("FINAL SUMMARY")
        print(f"{'='*80}")
        print(f"Algorithm: {algorithm_name}")
        print(f"Action mode: {action_mode}")
        print(f"Training completed in: {training_time:.2f} seconds")
        print(f"Total training timesteps: {total_timesteps}")
        print(f"Average evaluation reward: {eval_results['avg_reward']:.4f} ± {eval_results['std_reward']:.4f}")
        print(f"Average packet loss rate: {eval_results['avg_loss_rate']:.4f} ± {eval_results['std_loss_rate']:.4f}")
        print(f"Average alpha value: {eval_results['avg_alpha']:.4f}")
        print(f"Model saved as: {saved_model_path}")
    finally:
        env.close()

    return model, eval_results

In [8]:
if __name__ == "__main__":
    # Run settings for this cell -- adjust the three values below as needed.
    # Algorithm to train. Options: "SAC", "PPO", "A2C", "TD3", "DQN"
    ALGORITHM = "SAC"
    # Number of training timesteps
    TIMESTEPS = 200_000
    # Action mode. Options: "full_action", "alpha_only", "discrete_alpha"
    ACTION_MODE = "full_action"

    # Train the agent, then evaluate it
    print(f"\nStarting training with {ALGORITHM}...")
    model, results = main(ALGORITHM, TIMESTEPS, ACTION_MODE)

    # To evaluate a previously saved model instead of training, use e.g.:
    # print("\nExample: Loading and evaluating saved model...")
    # model, results = load_and_evaluate("SAC", "models/sac_mdp_scheduling.zip")


Starting training with SAC...
SAC Agent Training and Evaluation
Creating environment...
 Loaded real traffic data from: Results/TrafficData/trafficData.pkl
   Traffic data shape: (104066,)
Checking environment...
 Loaded real traffic data from: Results/TrafficData/trafficData.pkl
   Traffic data shape: (104066,)
Environment check passed!

Training SAC Agent
Total timesteps: 200000
Environment: 4 users, 30 bandwidth
Save path: models/sac_mdp_scheduling_N_u=4,B=30.zip
Using cpu device
Wrapping the env in a DummyVecEnv.
Starting training...
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -116     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 161      |
|    time_elapsed    | 12       |
|    total_timesteps | 2000     |
| train/             |          |
|    actor_loss      | -30.1    |
|    critic_loss     | 0.315    |
|    ent_coef        | 0.741    |
|    ent_coef