In [None]:
from environment_sar import SARrobotEnv
from agents import QLearningAgentFlat, QLearningAgentFlatLLM, QLearningAgentFlatAttention, QLearningAgentFlatActionToggle, LearningAgentFlat, QLearningAgentMaxInfoRL, QLearningAgentMaxInfoRL_ADVANCED
from hierarchical_agents import QLearningAgentHierarchical, QLearningAgentHierarchicalLLM, QLearningAgentHierarchicalAttention, QLearningAgentHierarchicalActionToggle, LearningAgentHierarchical
from robot_utils import RunningParameters, agent_config
from evaluation import main_evaluation, compute_all_agents_metrics, evaluate_trained_policy, plot_accumulated_rewards, plot_accumulated_rewards_v2, save_training_results, load_training_results, plot_average_steps, plot_metric_bars
import time
param = RunningParameters()

In [None]:
from robot_utils import RunningParameters, agent_config
param = RunningParameters()

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pickle
from environment_sar import SARrobotEnv
from agents import QLearningAgentFlat, QLearningAgentMaxInfoRL_ADVANCED

# Configuration
# Grid size and mission size handed to SARrobotEnv for every experiment in this cell.
GRID_ROWS = 4
GRID_COLS = 4
INFO_POINTS = 3  # Number of information points to collect
NUM_EPISODES = 5000  # Episode count passed to agent.train() for each experiment
# A new timestamped folder is produced every time this cell is executed.
LOG_DIR = "./logs/comparison_" + datetime.now().strftime("%Y%m%d_%H%M%S")
POLICY_DIR = os.path.join(LOG_DIR, "policies")

# Create output directories
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(POLICY_DIR, exist_ok=True)

# RL parameters
# These are forwarded unchanged to the agent constructors as keyword arguments
# of the same name; their exact semantics live in agents.py.
ALPHA = 0.1  # presumably the learning rate -- TODO confirm
GAMMA = 0.99  # presumably the discount factor -- TODO confirm
EPSILON_MAX = 1.0  # presumably the initial exploration rate -- TODO confirm
DECAY_RATE = 2  # NOTE(review): units/meaning of the decay are not visible here -- confirm
EPSILON_MIN = 0.05  # presumably the exploration floor -- TODO confirm

# Define experiments
# All four experiments share the same environment switches (everything off);
# they differ only in the agent class and in whether priorities change mid-run.
def _experiment(name, agent_class, change_priorities):
    """Build one experiment entry with the shared environment flags."""
    return {
        "name": name,
        "agent_class": agent_class,
        "sparse_reward": False,
        "reward_shaping": False,
        "attention": False,
        "hierarchical": False,
        "change_priorities": change_priorities,
    }


def _priority_schedule():
    """Return a fresh {episode: priorities} schedule for the dynamic runs."""
    return {
        2500: {'X': 2, 'Y': 0, 'Z': 1},  # Change from X-Y-Z to Y-Z-X
        #3500: {'X': 1, 'Y': 2, 'Z': 0},   # Change to Z-X-Y
    }


experiments = [
    _experiment("Flat_Static", QLearningAgentFlat, None),  # No changes
    _experiment("Flat_Dynamic", QLearningAgentFlat, _priority_schedule()),
    _experiment("MaxInfoRL_Static", QLearningAgentMaxInfoRL_ADVANCED, None),  # No changes
    _experiment("MaxInfoRL_Dynamic", QLearningAgentMaxInfoRL_ADVANCED, _priority_schedule()),
]

# Outputs of every experiment, keyed by experiment name.
results = {}

# Run experiments
for exp in experiments:
    rule = "=" * 80
    print(f"\n{rule}")
    print(f"Running experiment: {exp['name']}")
    print(f"{rule}")

    # Create environment (a fresh one per experiment, never rendered)
    env = SARrobotEnv(
        grid_rows=GRID_ROWS,
        grid_cols=GRID_COLS,
        info_number_needed=INFO_POINTS,
        sparse_reward=exp["sparse_reward"],
        reward_shaping=exp["reward_shaping"],
        attention=exp["attention"],
        hierarchical=exp["hierarchical"],
        render_mode=None,
    )

    # Create agent; logs and learned policies go to per-experiment sub-folders
    agent_cls = exp["agent_class"]
    agent = agent_cls(
        env=env,
        ALPHA=ALPHA,
        GAMMA=GAMMA,
        EPSILON_MAX=EPSILON_MAX,
        DECAY_RATE=DECAY_RATE,
        EPSILON_MIN=EPSILON_MIN,
        log_rewards_dir=os.path.join(LOG_DIR, exp["name"]),
        learned_policy_dir=os.path.join(POLICY_DIR, exp["name"]),
    )

    # Train agent
    rewards, steps, metrics = agent.train(
        NUM_EPISODES,
        change_priorities_at=exp["change_priorities"],
    )

    # Store results
    results[exp["name"]] = dict(
        rewards=rewards,
        steps=steps,
        metrics=metrics,
        agent_class=agent_cls.__name__,
    )

# Save results
results_path = os.path.join(LOG_DIR, 'results.pkl')
with open(results_path, 'wb') as f:
    pickle.dump(results, f)

In [None]:
# Show the repr of `agent`. When the notebook is run top to bottom this is the
# agent created in the last iteration of the experiment loop.
agent

In [None]:
# Create comparison visualizations
# 1. Reward trends - Separate plots for static and dynamic
window_size = 20  # For smoothing


def _moving_average(series):
    """Smooth `series` with a flat window of `window_size` samples ('valid' mode)."""
    return np.convolve(series, np.ones(window_size)/window_size, mode='valid')


# Static environment plot
plt.figure(figsize=(12, 5))
for exp_name, data in results.items():
    if "Static" not in exp_name:  # Only include static experiments
        continue
    plt.plot(_moving_average(data["rewards"]), label=exp_name)

plt.title('Reward Trends During Training (Static Environment)')
plt.xlabel('Episodes')
plt.ylabel('Average Reward (Smoothed)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(LOG_DIR, 'reward_trends_static.png'))

# Dynamic environment plot
plt.figure(figsize=(12, 5))
dynamic_smoothed_rewards = {}  # Smoothed curves, kept for the min/max calculation

for exp_name, data in results.items():
    if "Dynamic" not in exp_name:  # Only include dynamic experiments
        continue
    smoothed_rewards = _moving_average(data["rewards"])
    plt.plot(smoothed_rewards, label=exp_name)
    dynamic_smoothed_rewards[exp_name] = smoothed_rewards

# Add priority change markers (only when at least one dynamic curve was drawn)
if dynamic_smoothed_rewards:
    # Global min/max across all dynamic curves, for consistent text placement
    all_rewards = np.concatenate(list(dynamic_smoothed_rewards.values()))
    min_reward = np.min(all_rewards)
    max_reward = np.max(all_rewards)
    half_window = window_size//2

    # Take the change points from the first experiment that defines a schedule
    for exp in experiments:
        if exp["change_priorities"] is None:
            continue
        for episode in exp["change_priorities"].keys():
            if episode < half_window:
                continue
            # Shift the marker left by half a window to line up with the smoothed curve
            adjusted_episode = episode - half_window
            plt.axvline(x=adjusted_episode, color='r', linestyle='--', alpha=0.5)
            plt.text(
                adjusted_episode,
                min_reward + (max_reward-min_reward)*0.1,
                "Priority\nChange",
                rotation=90,
                color='r',
                alpha=0.7,
            )
        break  # Only need one experiment's change points as they're the same

plt.title('Reward Trends During Training (Dynamic Environment)')
plt.xlabel('Episodes')
plt.ylabel('Average Reward (Smoothed)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig(os.path.join(LOG_DIR, 'reward_trends_dynamic.png'))

# 2. Adaptation metrics comparison (for dynamic experiments)
plt.figure(figsize=(10, 8))

# Extract adaptation metrics: one list of change records per agent family
flat_changes = []
maxinfo_changes = []

for exp_name, data in results.items():
    if "Dynamic" not in exp_name or "priority_changes" not in data["metrics"]:
        continue
    if "Flat" in exp_name:
        flat_changes = data["metrics"]["priority_changes"]
    else:
        maxinfo_changes = data["metrics"]["priority_changes"]


def _success_gain(change):
    """Success rate after a priority change minus the rate before it (missing keys count as 0)."""
    return change.get('success_rate_after', 0) - change.get('success_rate_before', 0)


def _paired_bars(row, flat_values, maxinfo_values, ylabel, title):
    """Draw one Flat-vs-MaxInfoRL grouped bar panel in row `row` of the 2x1 grid."""
    plt.subplot(2, 1, row)
    plt.bar(x - width/2, flat_values, width, label='Flat Agent')
    plt.bar(x + width/2, maxinfo_values, width, label='MaxInfoRL Agent')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(x, labels)
    plt.legend()


# Compare adaptation metrics side by side
if flat_changes and maxinfo_changes:
    # Only the changes recorded for both agents can be paired up
    n_common = min(len(flat_changes), len(maxinfo_changes))
    labels = [f"Change {i+1}" for i in range(n_common)]
    x = np.arange(len(labels))
    width = 0.35

    # Top panel: steps to adapt
    _paired_bars(
        1,
        [change.get('steps_to_adapt', 0) for change in flat_changes[:len(labels)]],
        [change.get('steps_to_adapt', 0) for change in maxinfo_changes[:len(labels)]],
        'Steps to Adapt',
        'Adaptation Time After Priority Changes',
    )

    # Bottom panel: success rate improvement (after - before)
    _paired_bars(
        2,
        [_success_gain(change) for change in flat_changes[:len(labels)]],
        [_success_gain(change) for change in maxinfo_changes[:len(labels)]],
        'Success Rate Improvement (%)',
        'Performance Improvement After Adaptation',
    )

    plt.tight_layout()
    plt.savefig(os.path.join(LOG_DIR, 'adaptation_metrics.png'))

# 3. Overall performance comparison
plt.figure(figsize=(12, 6))

# Metric key -> tick label; the insertion order is the order of the bar groups
metric_labels = {
    'mission_success_rate': 'Mission Success (%)',
    'info_collection_success_rate': 'Info Collection (%)',
    'average_steps_per_episode': 'Avg Steps',
    'mission_success_no_collisions_rate': 'Success Without Collisions (%)'
}

# Key metrics to compare
metrics_to_compare = list(metric_labels)

# Setup bar chart: one group per metric, one bar per experiment
x = np.arange(len(metrics_to_compare))
width = 0.2
exp_names = list(results.keys())

for i, exp_name in enumerate(exp_names):
    exp_metrics = results[exp_name]['metrics']
    # Metrics an agent did not report are drawn as 0
    values = [exp_metrics.get(metric, 0) for metric in metrics_to_compare]
    # Centre the group of bars on the tick position
    offset = width * (i - len(exp_names)/2 + 0.5)
    plt.bar(x + offset, values, width, label=exp_name)

plt.xlabel('Metrics')
plt.ylabel('Value')
plt.title('Performance Comparison Across Experiments')
plt.xticks(x, list(metric_labels.values()))
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=len(exp_names))
plt.tight_layout()
plt.savefig(os.path.join(LOG_DIR, 'performance_comparison.png'))

# Print summary table
table_columns = ["Flat_Static", "Flat_Dynamic", "MaxInfoRL_Static", "MaxInfoRL_Dynamic"]
rule = "="*100

print("\n" + rule)
print("PERFORMANCE COMPARISON SUMMARY")
print(rule)
print(" | ".join([f"{'Metric':<40}"] + [f"{column:<15}" for column in table_columns]))
print(rule)

metrics_to_print = [
    'mission_success_rate',
    'info_collection_success_rate',
    'mission_success_no_collisions_rate',
    'average_steps_per_episode',
    'average_reward_per_episode',
    'exploration_exploitation_ratio'
]


def _table_cell(exp_name, metric):
    """Text for one table cell: 2-decimal number, str() of anything else, or 'N/A'."""
    if exp_name not in results:
        return "N/A"
    value = results[exp_name]["metrics"].get(metric, "N/A")
    if isinstance(value, (int, float)):
        return f"{value:.2f}"
    return str(value)


for metric in metrics_to_print:
    values = [_table_cell(exp_name, metric) for exp_name in table_columns]

    # Fall back to the raw key when no pretty label is defined
    metric_name = metric_labels.get(metric, metric)
    print(" | ".join([f"{metric_name:<40}"] + [f"{value:<15}" for value in values]))

# Print adaptation summary for dynamic experiments
# Only runs when both dynamic experiments reported priority-change records.
if flat_changes and maxinfo_changes:
    def _completed_only(changes):
        """Keep only the change records flagged with 'adaptation_completed'."""
        return [change for change in changes if change.get('adaptation_completed', False)]

    def _mean_or_nan(values):
        """Average `values`; NaN when empty (avoids numpy's empty-slice RuntimeWarning)."""
        return float(np.mean(values)) if values else float('nan')

    def _success_gain(change):
        """Success rate after the change minus the rate before it (missing keys count as 0)."""
        return change.get('success_rate_after', 0) - change.get('success_rate_before', 0)

    flat_completed = _completed_only(flat_changes)
    maxinfo_completed = _completed_only(maxinfo_changes)

    print("\n" + "="*100)
    print("ADAPTATION METRICS SUMMARY")
    print("="*100)
    print(f"{'Metric':<30} | {'Flat_Dynamic':<15} | {'MaxInfoRL_Dynamic':<15} | {'Improvement':<15}")
    print("="*100)

    # Average steps to adapt
    flat_avg_steps = _mean_or_nan([change.get('steps_to_adapt', 0) for change in flat_completed])
    maxinfo_avg_steps = _mean_or_nan([change.get('steps_to_adapt', 0) for change in maxinfo_completed])
    step_diff = maxinfo_avg_steps - flat_avg_steps
    # Positive percentage = MaxInfoRL needed fewer steps than the flat agent
    step_pct = (flat_avg_steps - maxinfo_avg_steps) / flat_avg_steps * 100 if flat_avg_steps > 0 else 0
    step_sign = "+" if step_pct > 0 else ""

    # The alignment spec must sit inside the braces; outside it is printed literally as ":<15"
    print(f"{'Avg Steps to Adapt':<30} | {flat_avg_steps:<15.1f} | {maxinfo_avg_steps:<15.1f} | {step_sign}{step_pct:.1f}%")

    # Average success rate improvement
    flat_avg_improve = _mean_or_nan([_success_gain(change) for change in flat_completed])
    maxinfo_avg_improve = _mean_or_nan([_success_gain(change) for change in maxinfo_completed])
    improve_diff = maxinfo_avg_improve - flat_avg_improve
    improve_sign = "+" if improve_diff > 0 else ""

    # Format "<number>%" first, then pad the whole cell to the column width
    flat_improve_cell = f"{flat_avg_improve:.1f}%"
    maxinfo_improve_cell = f"{maxinfo_avg_improve:.1f}%"
    print(f"{'Avg Success Rate Improvement':<30} | {flat_improve_cell:<15} | {maxinfo_improve_cell:<15} | {improve_sign}{improve_diff:.1f}%")

print("\nExperiment complete! Results saved to:", LOG_DIR)

In [None]:
from evaluation import evaluate_trained_policy
# Evaluate a saved Q-table with whatever object `agent` currently refers to.
# NOTE(review): the path is pinned to one specific timestamped run
# (comparison_20250428_134413), whereas LOG_DIR is regenerated each time the
# experiment cell runs -- confirm this file exists and matches `agent`'s class.
evaluate_trained_policy(agent, "logs/comparison_20250428_134413/policies/MaxInfoRL_Dynamic/q_extrinsic_table_episode_5000.npy")
# evaluate_trained_policy(agent_maxinfo, "policies/ADV/q_extrinsic_table_episode_5000.npy")

In [None]:
### FOR TESTING (new) ###

In [None]:
import numpy as np
# List of models to test
models = ["tulu3:8b", "hermes3", "gemma2", "llama3.1", "qwen2.5"]

# Dictionary to store results for each model
model_results = {}


def _make_hier_llm_agent(env, action_space_size, model, learned_policy_dir, log_rewards_dir=None):
    """Construct one QLearningAgentHierarchicalLLM using the shared hyper-parameters in `param`."""
    return QLearningAgentHierarchicalLLM(
        env, action_space_size,
        param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
        log_rewards_dir=log_rewards_dir,
        learned_policy_dir=learned_policy_dir,
        model=model,
    )


for model in models:
    print(f"\n\n==== Training with {model} model ====\n")

    # One environment per model, shared by all runs and all agents of that model
    env_hier = SARrobotEnv(
        grid_rows=4,
        grid_cols=4,
        info_number_needed=6,
        sparse_reward=True,
        reward_shaping=False,
        attention=False,
        hierarchical=True,
        render_mode='None',
    )

    # Per-run accumulators: total rewards, total steps and metrics
    all_total_rewards_AGENT_hierLLM = []
    all_total_steps_AGENT_hierLLM = []
    all_metrics_AGENT_hierLLM = []

    for run in range(param.testing_runs):
        print(f"Starting run {run+1}/{param.testing_runs} with model {model}")

        # Manager (the only agent that logs reward curves)
        manager_hierLLM = _make_hier_llm_agent(
            env_hier, param.manager_action_space_size, model,
            learned_policy_dir=f"policies/HIER-LLM-{model}-manager",
            log_rewards_dir=f"curves/HIER-LLM-{model}",
        )

        # Workers, one per manager option
        explore_worker_hierLLM = _make_hier_llm_agent(
            env_hier, param.explore_action_space_size, model,
            learned_policy_dir=f"policies/HIER-LLM-{model}-explore",
        )
        collect_worker_hierLLM = _make_hier_llm_agent(
            env_hier, param.collect_action_space_size, model,
            learned_policy_dir=f"policies/HIER-LLM-{model}-collect",
        )
        operate_worker_hierLLM = _make_hier_llm_agent(
            env_hier, param.operate_action_space_size, model,
            learned_policy_dir=f"policies/HIER-LLM-{model}-operate",
        )

        workers_hierLLM = {
            0: explore_worker_hierLLM,  # Worker for EXPLORE
            1: collect_worker_hierLLM,  # Worker for COLLECT
            2: operate_worker_hierLLM,  # Worker for OPERATE
        }

        # Train the agent
        rewards_hierLLM, steps_hierLLM, metrics_hierLLM, workers_LLM = manager_hierLLM.train(
            manager_hierLLM, workers_hierLLM, param.EPISODES
        )

        all_total_rewards_AGENT_hierLLM.append(rewards_hierLLM)
        all_total_steps_AGENT_hierLLM.append(steps_hierLLM)
        all_metrics_AGENT_hierLLM.append(metrics_hierLLM)

    # Save results for this model
    save_training_results(
        f"{agent_config['labels'][6]}_{model}",
        all_total_rewards_AGENT_hierLLM,
        all_total_steps_AGENT_hierLLM,
        all_metrics_AGENT_hierLLM,
        save_dir=f'saved_results_no_sparse/{model}',
    )

    # Store results in dictionary for comparison
    model_results[model] = dict(
        rewards=all_total_rewards_AGENT_hierLLM,
        steps=all_total_steps_AGENT_hierLLM,
        metrics=all_metrics_AGENT_hierLLM,
    )

    time.sleep(param.sleeping_time)

In [None]:
import numpy as np
import pickle
import os

# List of models to evaluate
models = ["tulu3:8b", "hermes3", "gemma2", "llama3.1", "qwen2.5"]

# Dictionary to store loaded results for each model
# NOTE: this rebinds `model_results`, discarding any in-memory results from the training cell.
model_results = {}

# Define which metrics to compute
# Keys containing a '.' are resolved as nested lookups (outer_key.inner_key)
# by process_metrics_list; all others are read from the top level.
metric_keys = [
    'mission_success_rate', 
    'info_collection_success_rate', 
    'collection_success_rate', 
    'average_steps_per_episode', 
    'average_reward_per_episode', 
    'collision_rate_in_successful_episodes',
    'mission_success_no_collisions_rate', 
    'predictor_stats.overall_success_rate', 
    'llm_timing.average_time_per_call'
]

# Function to load saved results
def load_saved_results(model, save_dir='saved_results_no_sparse'):
    """Unpickle the saved metrics file of `model`, or return None on any failure.

    The file is looked up at
    ``<save_dir>/<model>/Q-learning-Hierarchical-LLM_<model>_metrics.pkl``.
    A missing file or any exception is reported with print() instead of being raised.

    NOTE: pickle.load can execute arbitrary code; only point this at files
    produced by this project.
    """
    file_path = f"{save_dir}/{model}/Q-learning-Hierarchical-LLM_{model}_metrics.pkl"

    try:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return None

        with open(file_path, 'rb') as f:
            return pickle.load(f)
    except Exception as e:
        print(f"Error loading results for {model}: {e}")
        return None

# Function to get a nested dictionary value
def get_nested_value(data, key_path):
    """Walk `data` along the dot-separated `key_path` and return what is found.

    Returns None as soon as a path segment is missing or the current node is
    not a dict.
    """
    node = data
    for segment in key_path.split('.'):
        if not isinstance(node, dict) or segment not in node:
            return None
        node = node[segment]
    return node

# Function to process metrics and calculate averages
def process_metrics_list(metrics_list, keys=None):
    """Aggregate per-run metric dicts into ``{metric_key: (mean, std)}``.

    Args:
        metrics_list: sequence of metric dictionaries, one per run.
        keys: metric keys to aggregate. Keys containing a '.' are resolved as
            nested lookups via get_nested_value. Defaults to the notebook-level
            ``metric_keys`` list.

    Returns:
        Dict mapping each key that had at least one non-None value to a
        ``(np.mean, np.std)`` tuple over the runs that reported it. Empty
        when `metrics_list` is empty or None.
    """
    results = {}

    if not metrics_list:
        return results

    if keys is None:
        keys = metric_keys

    # Process each metric key
    for metric_key in keys:
        if '.' in metric_key:
            # Handle nested metrics (e.g., predictor_stats.overall_success_rate)
            values = [get_nested_value(m, metric_key) for m in metrics_list]
        else:
            # Handle top-level metrics
            values = [m.get(metric_key) for m in metrics_list if metric_key in m]

        # Drop missing/None entries in both branches so np.mean never sees None
        values = [v for v in values if v is not None]

        # Calculate average if we have values
        if values:
            results[metric_key] = (np.mean(values), np.std(values))

    return results

# Load results for each model
for model in models:
    print(f"Loading results for {model}...")
    data = load_saved_results(model)
    if data is None:
        print(f"Failed to load results for {model}")
        continue
    model_results[model] = data

# Process and print comparison summary
print("\n==== Model Comparison Summary ====\n")

# Output line per metric that needs special wording or precision; any metric
# not listed here falls back to a title-cased key printed as a percentage.
metric_line_templates = {
    'llm_timing.average_time_per_call': " Average LLM call time: {mean:.4f} ± {std:.4f} seconds",
    'predictor_stats.overall_success_rate': " Average predictor success rate: {mean:.2f}% ± {std:.2f}%",
    'average_steps_per_episode': " Average steps: {mean:.2f} ± {std:.2f}",
    'average_reward_per_episode': " Average reward: {mean:.2f} ± {std:.2f}",
    'collection_success_rate': " Collection success rate: {mean:.2f}% ± {std:.2f}%",
    'collision_rate_in_successful_episodes': " Collision rate in successful episodes: {mean:.2f}% ± {std:.2f}%",
    'mission_success_no_collisions_rate': " Mission success without collisions rate: {mean:.2f}% ± {std:.2f}%",
}

# Print each model's metrics
for model in models:
    if model not in model_results:
        continue

    print(f"Model: {model}")
    processed_results = process_metrics_list(model_results[model])

    for metric_key in metric_keys:
        if metric_key not in processed_results:
            continue
        mean, std = processed_results[metric_key]

        # Format the output based on metric type
        template = metric_line_templates.get(metric_key)
        if template is not None:
            print(template.format(mean=mean, std=std))
        else:
            print(f" {metric_key.replace('_', ' ').title()}: {mean:.2f}% ± {std:.2f}%")

    print()

In [None]:
### FOR TESTING ###

In [None]:
## 1) Q-learning-flat -- TESTING
# Trains QLearningAgentFlat `param.testing_runs` times on one shared environment
# and collects the per-run rewards, steps and metrics.
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=False,
            reward_shaping=False,
            attention=False,
            hierarchical=False,
            render_mode='None'
        )
all_total_rewards_AGENT_flat = []  # List to store total rewards from each run
all_total_steps_AGENT_flat = []  # List to store total steps from each run
all_metrics_AGENT_flat = [] # List to store metrics from each run

for _ in range(param.testing_runs):
    # NOTE(review): EPISODES is assigned here but train() below is called with the literal 5000.
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = 0.1  # overrides param.EPSILON_MIN for this cell
    DECAY_RATE = param.DECAY_RATE
    agent_flat = QLearningAgentFlat(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                    log_rewards_dir="curves/flat", learned_policy_dir="policies/flat")
    rewards_flat, steps_flat, metrics_flat = agent_flat.train(5000)

    all_total_rewards_AGENT_flat.append(rewards_flat)
    all_total_steps_AGENT_flat.append(steps_flat)
    all_metrics_AGENT_flat.append(metrics_flat)  # Store the metrics
# save_training_results(agent_config['labels'][0], all_total_rewards_AGENT_flat, all_total_steps_AGENT_flat, all_metrics_AGENT_flat, save_dir='saved_results_no_sparse')
# time.sleep(param.sleeping_time)

In [None]:
# Trains QLearningAgentMaxInfoRL `param.testing_runs` times on one shared
# environment and collects the per-run rewards, steps and metrics.
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=False,
            reward_shaping=False,
            attention=False,
            hierarchical=False,
            render_mode='None'
        )
all_total_rewards_AGENTmaxinfo = []  # List to store total rewards from each run
all_total_steps_AGENTmaxinfo = []  # List to store total steps from each run
all_metrics_AGENTmaxinfo = [] # List to store metrics from each run

for _ in range(param.testing_runs):
    # NOTE(review): EPISODES is assigned here but train() below is called with the literal 1000.
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = param.EPSILON_MIN
    DECAY_RATE = param.DECAY_RATE
    agent_maxinfo = QLearningAgentMaxInfoRL(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                    log_rewards_dir="curves/maxinfo", learned_policy_dir="policies/maxinfo")
    rewards_maxinfo, steps_maxinfo, metrics_maxinfo = agent_maxinfo.train(1000)

    all_total_rewards_AGENTmaxinfo.append(rewards_maxinfo)
    all_total_steps_AGENTmaxinfo.append(steps_maxinfo)
    all_metrics_AGENTmaxinfo.append(metrics_maxinfo)  # Store the metrics

In [None]:
# Trains QLearningAgentMaxInfoRL_ADVANCED `param.testing_runs` times on one
# shared environment and collects the per-run rewards, steps and metrics.
# NOTE(review): this cell reuses the names `agent_maxinfo` and
# `all_*_AGENTmaxinfo` from the plain MaxInfoRL cell, so running it replaces
# those results in the kernel.
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=False,
            reward_shaping=False,
            attention=False,
            hierarchical=False,
            render_mode='None'
        )
all_total_rewards_AGENTmaxinfo = []  # List to store total rewards from each run
all_total_steps_AGENTmaxinfo = []  # List to store total steps from each run
all_metrics_AGENTmaxinfo = [] # List to store metrics from each run

for _ in range(param.testing_runs):
    # NOTE(review): EPISODES is assigned here but train() below is called with the literal 5000.
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = 0.1  # overrides param.EPSILON_MIN for this cell
    DECAY_RATE = param.DECAY_RATE
    agent_maxinfo = QLearningAgentMaxInfoRL_ADVANCED(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                    log_rewards_dir="curves/ADV", learned_policy_dir="policies/ADV")
    rewards_maxinfo, steps_maxinfo, metrics_maxinfo = agent_maxinfo.train(5000)

    all_total_rewards_AGENTmaxinfo.append(rewards_maxinfo)
    all_total_steps_AGENTmaxinfo.append(steps_maxinfo)
    all_metrics_AGENTmaxinfo.append(metrics_maxinfo)  # Store the metrics

In [None]:
## modular flat learning agent  -- TESTING
# Trains LearningAgentFlat (attention on, LLM and action toggle off)
# `param.testing_runs` times on one shared attention-enabled environment.

env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=4,
            sparse_reward=False,
            reward_shaping=False,
            attention=True,
            hierarchical=False,
            render_mode='None'
        )
all_total_rewards_AGENT = []  # List to store total rewards from each run
all_total_steps_AGENT = []  # List to store total steps from each run
all_metrics_AGENT = [] # List to store metrics from each run

for _ in range(param.testing_runs):
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = param.EPSILON_MIN
    DECAY_RATE = param.DECAY_RATE
    # NOTE: rebinds the notebook-level `agent` used by the earlier display/evaluation cells.
    agent = LearningAgentFlat(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                    log_rewards_dir="curves/flatnewatt", learned_policy_dir="policies/flatnewatt",
                                    use_llm=False, use_attention=True, use_action_toggle=False)
    # NOTE(review): effect of this flag is defined in the agent class -- confirm.
    agent.global_epsilon_exploit = True
    rewards, steps, metrics = agent.train(EPISODES)

    all_total_rewards_AGENT.append(rewards)
    all_total_steps_AGENT.append(steps)
    all_metrics_AGENT.append(metrics)  # Store the metrics
# NOTE(review): if re-enabled, the line below mixes this cell's rewards list with the *_flat steps/metrics lists.
# save_training_results(agent_config['labels'][0], all_total_rewards_AGENT, all_total_steps_AGENT_flat, all_metrics_AGENT_flat, save_dir='saved_results_no_sparse')
# time.sleep(param.sleeping_time)

In [None]:
## modular hierarchical learning agent  -- TESTING
# Trains a LearningAgentHierarchical manager with three workers
# `param.testing_runs` times on one shared hierarchical environment.
# NOTE(review): the environment is built with attention=True while every agent
# is created with use_attention=False / use_action_toggle=True -- confirm intended.
env_hier = SARrobotEnv(
        grid_rows=4,
        grid_cols=4,
        info_number_needed=3,
        sparse_reward=False,
        reward_shaping=False,
        attention=True,
        hierarchical=True,
        render_mode='None'
        )
all_total_rewards_AGENT_hier = []  # List to store total rewards from each run
all_total_steps_AGENT_hier = []  # List to store total steps from each run
all_metrics_AGENT_hier = [] # List to store metrics from each run
for _ in range(param.testing_runs):
    # Only the manager logs reward curves; the workers pass log_rewards_dir=None.
    manager_hier = LearningAgentHierarchical(env_hier, param.manager_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN, 
                                              log_rewards_dir="curves/HIER", learned_policy_dir="policies/HIER-manager",
                                              use_llm=False, use_attention=False, use_action_toggle=True)
    explore_worker_hier = LearningAgentHierarchical(env_hier, param.explore_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                     log_rewards_dir=None, learned_policy_dir="policies/HIER-explore",
                                                     use_llm=False, use_attention=False, use_action_toggle=True)
    collect_worker_hier = LearningAgentHierarchical(env_hier, param.collect_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                     log_rewards_dir=None, learned_policy_dir="policies/HIER-collect",
                                                     use_llm=False, use_attention=False, use_action_toggle=True)
    operate_worker_hier = LearningAgentHierarchical(env_hier, param.operate_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                     log_rewards_dir=None, learned_policy_dir="policies/HIER-operate",
                                                     use_llm=False, use_attention=False, use_action_toggle=True)
    workers_hier = {
        0: explore_worker_hier,  # Worker for EXPLORE
        1: collect_worker_hier,  # Worker for COLLECT
        2: operate_worker_hier   # Worker for OPERATE
    }
    # NOTE(review): effect of this flag is defined in the agent class -- confirm.
    manager_hier.global_epsilon_exploit = True
    # train() is called on the manager and also receives the manager as its first argument.
    rewards_hier, steps_hier, metrics_hier, workers = manager_hier.train(manager_hier, workers_hier, param.EPISODES)
    all_total_rewards_AGENT_hier.append(rewards_hier)
    all_total_steps_AGENT_hier.append(steps_hier)
    all_metrics_AGENT_hier.append(metrics_hier)  # Store the metrics
# save_training_results(agent_config['labels'][5], all_total_rewards_AGENT_hier, all_total_steps_AGENT_hier, all_metrics_AGENT_hier, save_dir='saved_results_no_sparse')
# time.sleep(param.sleeping_time)

In [None]:
## 2) Q-learning-LLM -- TESTING
# Trains QLearningAgentFlatLLM `param.testing_runs` times on one shared
# environment, then saves the collected results under agent_config['labels'][1].
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=False,
            reward_shaping=False,
            attention=False,
            hierarchical=False,
            render_mode='None'
        )       
all_total_rewards_AGENT_flatLLM = []  # List to store total rewards from each run
all_total_steps_AGENT_flatLLM = []  # List to store total steps from each run
all_metrics_AGENT_flatLLM = [] # List to store metrics from each run
for _ in range(param.testing_runs): # changed
    EPISODES = param.EPISODES # changed
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = param.EPSILON_MIN
    DECAY_RATE = param.DECAY_RATE
    agent_flatLLM = QLearningAgentFlatLLM(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                          log_rewards_dir='curves/flat-LLM', learned_policy_dir='policies/flat-LLM')
    rewards_flatLLM, steps_flatLLM, metrics_flatLLM = agent_flatLLM.train(EPISODES)

    all_total_rewards_AGENT_flatLLM.append(rewards_flatLLM)
    all_total_steps_AGENT_flatLLM.append(steps_flatLLM)
    all_metrics_AGENT_flatLLM.append(metrics_flatLLM)  # Store the metrics
save_training_results(agent_config['labels'][1], all_total_rewards_AGENT_flatLLM, all_total_steps_AGENT_flatLLM, all_metrics_AGENT_flatLLM, save_dir='saved_results_no_sparse')
# Pause before the next cell; the duration comes from param.sleeping_time.
time.sleep(param.sleeping_time)

In [None]:
## 3) Q-learning-PolicyShaping -- TESTING
# Trains QLearningAgentFlatAttention `param.testing_runs` times on one shared
# attention-enabled environment and collects the per-run results.
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=False,
            reward_shaping=False,
            attention=True,
            hierarchical=False,
            render_mode='None'
        ) 
all_total_rewards_AGENT_att = []  # List to store total rewards from each run
all_total_steps_AGENT_att = []  # List to store total steps from each run
all_metrics_AGENT_att = [] # List to store metrics from each run
for _ in range(param.testing_runs):
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = param.EPSILON_MIN
    DECAY_RATE = param.DECAY_RATE
    agent_att = QLearningAgentFlatAttention(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                            log_rewards_dir="curves/flat-ATT-PS", learned_policy_dir="policies/flat-ATT-PS")
    # NOTE(review): effect of this flag is defined in the agent class -- confirm.
    agent_att.global_epsilon_exploit = True
    rewards_att, steps_att, metrics_att = agent_att.train(EPISODES)

    all_total_rewards_AGENT_att.append(rewards_att)
    all_total_steps_AGENT_att.append(steps_att)
    all_metrics_AGENT_att.append(metrics_att)  # Store the metrics
# save_training_results(agent_config['labels'][3], all_total_rewards_AGENT_att, all_total_steps_AGENT_att, all_metrics_AGENT_att, save_dir='saved_results_no_sparse')
# time.sleep(param.sleeping_time)

In [None]:
## 4) Q-learning-RewardShaping -- TESTING
# Trains QLearningAgentFlat on an environment created with reward_shaping=True
# for `param.testing_runs` independent runs, then saves the collected histories.
env = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                  sparse_reward=True, reward_shaping=True,
                  attention=True, hierarchical=False,
                  render_mode='None')

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_attRS = []  # rewards returned by train() in each run
all_total_steps_AGENT_attRS = []    # steps returned by train() in each run
all_metrics_AGENT_attRS = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    # Hyperparameters are re-read from `param` at the start of every run.
    EPISODES, ALPHA, GAMMA = param.EPISODES, param.ALPHA, param.GAMMA
    EPSILON_MAX, EPSILON_MIN, DECAY_RATE = param.EPSILON_MAX, param.EPSILON_MIN, param.DECAY_RATE

    agent_attRS = QLearningAgentFlat(
        env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
        log_rewards_dir="curves/flat-ATT-RS",
        learned_policy_dir="policies/flat-ATT-RS",
    )
    rewards_attRS, steps_attRS, metrics_attRS = agent_attRS.train(EPISODES)

    all_total_rewards_AGENT_attRS.append(rewards_attRS)
    all_total_steps_AGENT_attRS.append(steps_attRS)
    all_metrics_AGENT_attRS.append(metrics_attRS)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][4],
    all_total_rewards_AGENT_attRS,
    all_total_steps_AGENT_attRS,
    all_metrics_AGENT_attRS,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# 5) Q-learning-ActionToggle -- TESTING
# Trains QLearningAgentFlatActionToggle for `param.testing_runs` independent runs
# on a single shared environment, then saves the collected histories.
env = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                  sparse_reward=True, reward_shaping=False,
                  attention=True, hierarchical=False,
                  render_mode='None')

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_tog = []  # rewards returned by train() in each run
all_total_steps_AGENT_tog = []    # steps returned by train() in each run
all_metrics_AGENT_tog = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    # Hyperparameters are re-read from `param` at the start of every run.
    EPISODES, ALPHA, GAMMA = param.EPISODES, param.ALPHA, param.GAMMA
    EPSILON_MAX, EPSILON_MIN, DECAY_RATE = param.EPSILON_MAX, param.EPSILON_MIN, param.DECAY_RATE

    agent_tog = QLearningAgentFlatActionToggle(
        env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
        log_rewards_dir="curves/flat-ATT-AS",
        learned_policy_dir="policies/flat-ATT-AS",
    )
    rewards_tog, steps_tog, metrics_tog = agent_tog.train(EPISODES)

    all_total_rewards_AGENT_tog.append(rewards_tog)
    all_total_steps_AGENT_tog.append(steps_tog)
    all_metrics_AGENT_tog.append(metrics_tog)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][2],
    all_total_rewards_AGENT_tog,
    all_total_steps_AGENT_tog,
    all_metrics_AGENT_tog,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# 6) Q-learning-hierarchical -- TESTING
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchical, for `param.testing_runs` independent runs.
# The environment is built once here and shared by every run of this cell.
env_hier = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                       sparse_reward=False, reward_shaping=False,
                       attention=False, hierarchical=True,
                       render_mode='None')

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_hier = []  # rewards returned by train() in each run
all_total_steps_AGENT_hier = []    # steps returned by train() in each run
all_metrics_AGENT_hier = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    # Positional hyperparameters shared by the manager and all three workers.
    _q_hparams = (param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN)

    manager_hier = QLearningAgentHierarchical(
        env_hier, param.manager_action_space_size, *_q_hparams,
        log_rewards_dir="curves/HIER",
        learned_policy_dir="policies/HIER-manager",
    )
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hier = QLearningAgentHierarchical(
        env_hier, param.explore_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-explore",
    )
    collect_worker_hier = QLearningAgentHierarchical(
        env_hier, param.collect_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-collect",
    )
    operate_worker_hier = QLearningAgentHierarchical(
        env_hier, param.operate_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-operate",
    )
    # Worker lookup keyed by option index: 0 = EXPLORE, 1 = COLLECT, 2 = OPERATE.
    workers_hier = {0: explore_worker_hier, 1: collect_worker_hier, 2: operate_worker_hier}

    rewards_hier, steps_hier, metrics_hier, workers = manager_hier.train(manager_hier, workers_hier, param.EPISODES)

    all_total_rewards_AGENT_hier.append(rewards_hier)
    all_total_steps_AGENT_hier.append(steps_hier)
    all_metrics_AGENT_hier.append(metrics_hier)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][5],
    all_total_rewards_AGENT_hier,
    all_total_steps_AGENT_hier,
    all_metrics_AGENT_hier,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# 7) Q-learning-hierarchical-LLM -- TESTING
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchicalLLM, for `param.testing_runs` independent runs.
# The environment is built once here and shared by every run of this cell.
env_hier = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                       sparse_reward=True, reward_shaping=False,
                       attention=False, hierarchical=True,
                       render_mode='None')

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_hierLLM = []  # rewards returned by train() in each run
all_total_steps_AGENT_hierLLM = []    # steps returned by train() in each run
all_metrics_AGENT_hierLLM = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    # Positional hyperparameters shared by the manager and all three workers.
    _q_hparams = (param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN)

    manager_hierLLM = QLearningAgentHierarchicalLLM(
        env_hier, param.manager_action_space_size, *_q_hparams,
        log_rewards_dir="curves/HIER-LLM",
        learned_policy_dir="policies/HIER-LLM-manager",
    )
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hierLLM = QLearningAgentHierarchicalLLM(
        env_hier, param.explore_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-LLM-explore",
    )
    collect_worker_hierLLM = QLearningAgentHierarchicalLLM(
        env_hier, param.collect_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-LLM-collect",
    )
    operate_worker_hierLLM = QLearningAgentHierarchicalLLM(
        env_hier, param.operate_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-LLM-operate",
    )
    # Worker lookup keyed by option index: 0 = EXPLORE, 1 = COLLECT, 2 = OPERATE.
    workers_hierLLM = {0: explore_worker_hierLLM, 1: collect_worker_hierLLM, 2: operate_worker_hierLLM}

    rewards_hierLLM, steps_hierLLM, metrics_hierLLM, workers_LLM = manager_hierLLM.train(manager_hierLLM, workers_hierLLM, param.EPISODES)

    all_total_rewards_AGENT_hierLLM.append(rewards_hierLLM)
    all_total_steps_AGENT_hierLLM.append(steps_hierLLM)
    all_metrics_AGENT_hierLLM.append(metrics_hierLLM)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][6],
    all_total_rewards_AGENT_hierLLM,
    all_total_steps_AGENT_hierLLM,
    all_metrics_AGENT_hierLLM,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# 8) Q-learning-hierarchical-PolicyShaping -- TESTING
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchicalAttention, for `param.testing_runs` independent runs.
# Unlike the plain hierarchical cell, a fresh environment is created for every run.

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_hier_att = []  # rewards returned by train() in each run
all_total_steps_AGENT_hier_att = []    # steps returned by train() in each run
all_metrics_AGENT_hier_att = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    env_hier = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                           sparse_reward=True, reward_shaping=False,
                           attention=True, hierarchical=True,
                           render_mode='None')

    # Positional hyperparameters shared by the manager and all three workers.
    _q_hparams = (param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN)

    manager_hier_att = QLearningAgentHierarchicalAttention(
        env_hier, param.manager_action_space_size, *_q_hparams,
        log_rewards_dir="curves/HIER-PS-manager",
        learned_policy_dir="policies/HIER-PS-manager",
    )
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hier_att = QLearningAgentHierarchicalAttention(
        env_hier, param.explore_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-PS-explore",
    )
    collect_worker_hier_att = QLearningAgentHierarchicalAttention(
        env_hier, param.collect_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-PS-collect",
    )
    operate_worker_hier_att = QLearningAgentHierarchicalAttention(
        env_hier, param.operate_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-PS-operate",
    )
    # Worker lookup keyed by option index: 0 = EXPLORE, 1 = COLLECT, 2 = OPERATE.
    workers_hier_att = {0: explore_worker_hier_att, 1: collect_worker_hier_att, 2: operate_worker_hier_att}

    manager_hier_att.global_epsilon_exploit = True
    rewards_hier_att, steps_hier_att, metrics_hier_att, workers_att = manager_hier_att.train(manager_hier_att, workers_hier_att, param.EPISODES)

    all_total_rewards_AGENT_hier_att.append(rewards_hier_att)
    all_total_steps_AGENT_hier_att.append(steps_hier_att)
    all_metrics_AGENT_hier_att.append(metrics_hier_att)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][8],
    all_total_rewards_AGENT_hier_att,
    all_total_steps_AGENT_hier_att,
    all_metrics_AGENT_hier_att,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
## 9) Q-learning-hierarchical-RewardShaping -- TESTING
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchical, on an environment created with reward_shaping=True.
# The environment is built once here and shared by every run of this cell.
env_hier = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                       sparse_reward=True, reward_shaping=True,
                       attention=True, hierarchical=True,
                       render_mode='None')

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_hierRS = []  # rewards returned by train() in each run
all_total_steps_AGENT_hierRS = []    # steps returned by train() in each run
all_metrics_AGENT_hierRS = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    # Positional hyperparameters shared by the manager and all three workers.
    _q_hparams = (param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN)

    manager_hierRS = QLearningAgentHierarchical(
        env_hier, param.manager_action_space_size, *_q_hparams,
        log_rewards_dir="curves/HIER-RS",
        learned_policy_dir="policies/HIER-manager-RS",
    )
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hierRS = QLearningAgentHierarchical(
        env_hier, param.explore_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-explore-RS",
    )
    collect_worker_hierRS = QLearningAgentHierarchical(
        env_hier, param.collect_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-collect-RS",
    )
    operate_worker_hierRS = QLearningAgentHierarchical(
        env_hier, param.operate_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-operate-RS",
    )
    # Worker lookup keyed by option index: 0 = EXPLORE, 1 = COLLECT, 2 = OPERATE.
    workers_hierRS = {0: explore_worker_hierRS, 1: collect_worker_hierRS, 2: operate_worker_hierRS}

    rewards_hier_attRS, steps_hier_attRS, metrics_hier_attRS, workers_hier_attRS = manager_hierRS.train(manager_hierRS, workers_hierRS, param.EPISODES)

    all_total_rewards_AGENT_hierRS.append(rewards_hier_attRS)
    all_total_steps_AGENT_hierRS.append(steps_hier_attRS)
    all_metrics_AGENT_hierRS.append(metrics_hier_attRS)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][9],
    all_total_rewards_AGENT_hierRS,
    all_total_steps_AGENT_hierRS,
    all_metrics_AGENT_hierRS,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# 10) Q-learning-hierarchical-ActionToggle -- TESTING
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchicalActionToggle, for `param.testing_runs` independent runs.
# A fresh environment is created for every run.

# Training histories: one entry is appended per run.
all_total_rewards_AGENT_hier_tog = []  # rewards returned by train() in each run
all_total_steps_AGENT_hier_tog = []    # steps returned by train() in each run
all_metrics_AGENT_hier_tog = []        # metrics returned by train() in each run

for _ in range(param.testing_runs):
    env_hier = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                           sparse_reward=True, reward_shaping=False,
                           attention=True, hierarchical=True,
                           render_mode='None')

    # Positional hyperparameters shared by the manager and all three workers.
    _q_hparams = (param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN)

    # The manager chooses among the options served by the workers below.
    manager_hier_tog = QLearningAgentHierarchicalActionToggle(
        env_hier, param.manager_action_space_size, *_q_hparams,
        log_rewards_dir="curves/HIER-AS",
        learned_policy_dir="policies/HIER-manager-AS",
    )
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hier_tog = QLearningAgentHierarchicalActionToggle(
        env_hier, param.explore_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-explore-AS",
    )
    collect_worker_hier_tog = QLearningAgentHierarchicalActionToggle(
        env_hier, param.collect_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-collect-AS",
    )
    operate_worker_hier_tog = QLearningAgentHierarchicalActionToggle(
        env_hier, param.operate_action_space_size, *_q_hparams,
        log_rewards_dir=None,
        learned_policy_dir="policies/HIER-operate-AS",
    )
    # Worker lookup keyed by option index: 0 = EXPLORE, 1 = COLLECT, 2 = OPERATE.
    workers_hier_tog = {0: explore_worker_hier_tog, 1: collect_worker_hier_tog, 2: operate_worker_hier_tog}

    # Note: `workers_hier_tog` is rebound to the fourth value returned by train();
    # the dict is rebuilt at the top of the next iteration.
    rewards_hier_tog, steps_hier_tog, metrics_hier_tog, workers_hier_tog = manager_hier_tog.train(manager_hier_tog, workers_hier_tog, param.EPISODES)

    all_total_rewards_AGENT_hier_tog.append(rewards_hier_tog)
    all_total_steps_AGENT_hier_tog.append(steps_hier_tog)
    all_metrics_AGENT_hier_tog.append(metrics_hier_tog)

# Persist the histories of all runs under this agent's label.
save_training_results(
    agent_config['labels'][7],
    all_total_rewards_AGENT_hier_tog,
    all_total_steps_AGENT_hier_tog,
    all_metrics_AGENT_hier_tog,
    save_dir='saved_results_no_sparse',
)
time.sleep(param.sleeping_time)

In [None]:
# Saved metrics for every agent, keyed by the agent's label.
all_agent_metrics = {}

# Try each configured label; labels without a saved metrics file are reported and skipped.
for i, label in enumerate(agent_config['labels']):
    loaded_data = load_training_results(
        base_name=label,
        data_type='metrics',
        file_format='pickle',
        save_dir='saved_results_no_sparse',
    )
    if loaded_data is None:
        print(f"Warning: Could not load metrics for {label}")
        continue
    all_agent_metrics[label] = loaded_data

# Metric names handed to compute_all_agents_metrics (some use dotted paths).
metric_keys = [
    'mission_success_rate',
    'info_collection_success_rate',
    'collection_success_rate',
    'average_steps_per_episode',
    'average_reward_per_episode',
    'collision_rate_in_successful_episodes',
    'mission_success_no_collisions_rate',
    'predictor_stats.overall_success_rate',
    'llm_timing.average_time_per_call',
]

# Compute the selected metrics for every agent that was loaded.
results = compute_all_agents_metrics(all_agent_metrics, metric_keys)

In [None]:
# Evaluate the hierarchical-LLM manager against a saved policy file.
# NOTE(review): `manager_hierLLM` only exists after the "Q-learning-hierarchical-LLM" training
# cell has run in this kernel, and the file name suggests a checkpoint from episode 1000 —
# confirm that checkpoint is actually written for the configured number of episodes.
evaluate_trained_policy(manager_hierLLM, "policies/HIER-LLM-manager/manager_q_table_episode_1000.npy")

In [None]:
# Saved reward histories for every agent, keyed by the agent's label.
all_agent_rewards = {}

# Try each configured label; labels without a saved rewards file are reported and skipped.
for i, label in enumerate(agent_config['labels']):
    loaded_data = load_training_results(
        base_name=label,
        data_type='rewards',
        file_format='pickle',
        save_dir='saved_results_no_sparse',
    )
    if loaded_data is None:
        print(f"Warning: Could not load rewards for {label}")
        continue
    all_agent_rewards[label] = loaded_data


In [None]:
# Plot the accumulated-reward curves for every agent whose rewards were loaded.
# Labels, reward series and colors are filtered together: previously the full color
# list was passed alongside a filtered label list, so a missing agent shifted the
# colors of every agent after it.
loaded_labels = [label for label in agent_config['labels'] if label in all_agent_rewards]

plot_colors = agent_config['colors']
# Only re-align when colors is a sequence with exactly one entry per configured label.
# assumes that sequence is ordered like agent_config['labels'] — TODO confirm
if isinstance(plot_colors, (list, tuple)) and len(plot_colors) == len(agent_config['labels']):
    plot_colors = [
        color
        for label, color in zip(agent_config['labels'], plot_colors)
        if label in all_agent_rewards
    ]

fig, ax = plot_accumulated_rewards(
    reward_list=[all_agent_rewards[label] for label in loaded_labels],
    labels=loaded_labels,
    colors=plot_colors,
    window_size=150,
    figsize=(10, 6),
    save_path='agent_performance_comparison',
    use_savgol=False
)

In [None]:
# Plot the selected metrics for the loaded agents (presumably as bar charts, going by the
# function name). Depends on `all_agent_metrics` and `metric_keys` from the metrics-loading
# cell, so that cell must have been run first.
plot_metric_bars(all_agent_metrics, agent_config, metric_keys)

In [None]:
### FOR TESTING ###

In [None]:
## 7) Q-PS-RS (Q with attention mechanism - policy shaping + reward shaping)
# Trains QLearningAgentFlatAttention on an environment created with reward_shaping=True
# for `param.testing_runs` independent runs. Results are kept in memory only.
env = SARrobotEnv(
            grid_rows=4,
            grid_cols=4,
            info_number_needed=3,
            sparse_reward=True,
            reward_shaping=True,
            attention=True,
            hierarchical=False,
            render_mode='None'
        )
all_total_rewards_AGENT_attPSRS = []  # rewards returned by train() in each run
all_total_steps_AGENT_attPSRS = []  # steps returned by train() in each run
all_metrics_AGENT_attPSRS = []  # metrics returned by train() in each run
for _ in range(param.testing_runs):
    EPISODES = param.EPISODES
    ALPHA = param.ALPHA
    GAMMA = param.GAMMA
    EPSILON_MAX = param.EPSILON_MAX
    EPSILON_MIN = param.EPSILON_MIN
    DECAY_RATE = param.DECAY_RATE
    agent_attPSRS = QLearningAgentFlatAttention(env, ALPHA, GAMMA, EPSILON_MAX, DECAY_RATE, EPSILON_MIN,
                                            log_rewards_dir="curves-sparse/ATT-PSRS", learned_policy_dir="policies-sparse/ATT-PSRS")
    # train() yields (rewards, steps, metrics), as in the PolicyShaping cell that uses the
    # same agent class; unpacking only two names here raised ValueError.
    returns_attPSRS, steps_attPSRS, metrics_attPSRS = agent_attPSRS.train(EPISODES)

    all_total_rewards_AGENT_attPSRS.append(returns_attPSRS)
    all_total_steps_AGENT_attPSRS.append(steps_attPSRS)
    all_metrics_AGENT_attPSRS.append(metrics_attPSRS)

time.sleep(param.sleeping_time)

In [None]:
## 8) HierQ-PS-RS (hierarchical Q-learning with attention mechanism - policy shaping + reward shaping)
# Trains a manager plus three workers (explore / collect / operate), all of class
# QLearningAgentHierarchicalAttention, on an environment created with reward_shaping=True.
# A fresh environment is created for every run; results are kept in memory only.
all_total_rewards_AGENT_hier_attPSRS = []  # rewards returned by train() in each run
for _ in range(param.testing_runs):
    env_hier = SARrobotEnv(
    grid_rows=4,
    grid_cols=4,
    info_number_needed=3,
    sparse_reward=True,
    reward_shaping=True,
    attention=True,
    hierarchical=True,
    render_mode='None'
    )
    # Manager for choosing options
    manager_hier_attPSRS = QLearningAgentHierarchicalAttention(env_hier, param.manager_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN, 
                                                           log_rewards_dir="curves-sparse/HRL-PS-RS-manager", learned_policy_dir="policies-sparse/HRL-PS-RS-manager")
    # Workers are created with log_rewards_dir=None; only the manager gets a curve directory.
    explore_worker_hier_attPSRS = QLearningAgentHierarchicalAttention(env_hier, param.explore_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                                  log_rewards_dir=None, learned_policy_dir="policies-sparse/HRL-PS-RS-explore") 
    collect_worker_hier_attPSRS = QLearningAgentHierarchicalAttention(env_hier, param.collect_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                                  log_rewards_dir=None, learned_policy_dir="policies-sparse/HRL-PS-RS-collect") 
    operate_worker_hier_attPSRS = QLearningAgentHierarchicalAttention(env_hier, param.operate_action_space_size, param.ALPHA, param.GAMMA, param.EPSILON_MAX, param.DECAY_RATE, param.EPSILON_MIN,
                                                                  log_rewards_dir=None, learned_policy_dir="policies-sparse/HRL-PS-RS-operate")
    workers_hier_attPSRS = {
        0: explore_worker_hier_attPSRS,  # Worker for EXPLORE
        1: collect_worker_hier_attPSRS,  # Worker for COLLECT
        2: operate_worker_hier_attPSRS   # Worker for OPERATE
    }
    # train() yields (rewards, steps, metrics, workers), as in the hierarchical PolicyShaping
    # cell that uses the same agent class; unpacking only three names here raised ValueError.
    hier_returns_attPSRS, hier_steps_attPSRS, hier_metrics_attPSRS, workers_simple_attPSRS = manager_hier_attPSRS.train(manager_hier_attPSRS, workers_hier_attPSRS, param.EPISODES)
    all_total_rewards_AGENT_hier_attPSRS.append(hier_returns_attPSRS)

time.sleep(param.sleeping_time)

In [None]:
# Double-DQN baseline: trains a fresh DoubleDQNAgent per run on one shared environment
# and keeps each agent's `episode_rewards`.
# NOTE(review): DoubleDQNAgent is not imported in the cells visible here — confirm it is
# imported earlier in the notebook.
env = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=3,
                  sparse_reward=True, reward_shaping=False,
                  attention=False, hierarchical=False,
                  render_mode='None')

all_total_rewards_DDQN = []  # episode_rewards of the agent trained in each run

for _ in range(param.testing_runs):
    # Keyword settings for this run's agent (rebuilt every run so nothing is shared between agents).
    ddqn_kwargs = dict(
        hidden_dims=[64, 64],
        learning_rate=0.001,
        gamma=0.99,
        buffer_size=5000,
        batch_size=256,
        target_update=100,
        epsilon_start=1.0,
        epsilon_end=0.01,
        epsilon_decay=0.995,
    )
    agent = DoubleDQNAgent(env, **ddqn_kwargs)

    agent.train(num_episodes=param.EPISODES, minimal_size=1000, save_interval=100)
    all_total_rewards_DDQN.append(agent.episode_rewards)

In [None]:
from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import BaseCallback
from environment_sar import SARrobotEnv
import numpy as np
from tqdm.auto import tqdm

class TqdmProgressCallback(BaseCallback):
    """
    Training callback that drives a tqdm progress bar and records per-episode statistics.

    The bar advances by one on every callback step. Rewards and episode ends are read
    from index 0 of ``self.locals['rewards']`` and ``self.locals['dones']``, so only the
    first environment is tracked.

    Args:
        total_timesteps: Value used as the total of the progress bar.
        log_interval: Number of finished episodes between refreshes of the bar's
            postfix/description with the mean reward and length of the last
            ``log_interval`` episodes.
        verbose: Forwarded to ``BaseCallback``.

    Attributes:
        episode_rewards: Summed reward of every finished episode, in order.
        episode_lengths: Step count of every finished episode, in order.
        episode_count: Number of finished episodes.
    """
    def __init__(self, total_timesteps, log_interval=100, verbose=0):
        super().__init__(verbose)
        self.total_timesteps = total_timesteps
        self.log_interval = log_interval
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_count = 0
        self.pbar = None
        # Running totals for the episode currently in progress
        self.current_episode_reward = 0
        self.current_episode_length = 0

    def _on_training_start(self) -> None:
        """Open the progress bar and reset all statistics."""
        self.pbar = tqdm(total=self.total_timesteps, desc="Training")
        self.episode_rewards = []
        self.episode_lengths = []
        self.episode_count = 0
        self.current_episode_reward = 0
        self.current_episode_length = 0

    def _on_step(self) -> bool:
        """Advance the bar, accumulate the step reward and close out finished episodes.

        Returns:
            Always ``True``.
        """
        # Update progress bar with each step (skipped if the bar was never opened)
        if self.pbar is not None:
            self.pbar.update(1)

        # Increment episode length counter
        self.current_episode_length += 1

        # Add the current reward to the episode total
        if 'rewards' in self.locals:
            self.current_episode_reward += self.locals['rewards'][0]

        # Guard the lookup: indexing the result of .get() directly raised TypeError
        # whenever 'dones' was missing from the locals.
        dones = self.locals.get('dones')
        if dones is not None and dones[0]:
            self.episode_count += 1
            # Store the episode results
            self.episode_rewards.append(self.current_episode_reward)
            self.episode_lengths.append(self.current_episode_length)

            # Reset counters for the next episode
            self.current_episode_reward = 0
            self.current_episode_length = 0

            # Log every log_interval episodes
            if self.episode_count % self.log_interval == 0 and self.pbar is not None:
                mean_reward = np.mean(self.episode_rewards[-self.log_interval:])
                mean_length = np.mean(self.episode_lengths[-self.log_interval:])
                self.pbar.set_postfix({
                    'episodes': self.episode_count,
                    'mean_reward': f'{mean_reward:.2f}',
                    'mean_length': f'{mean_length:.2f}'
                })
                # Also update the description for better visibility
                self.pbar.set_description(
                    f"Training | Episodes: {self.episode_count} | Reward: {mean_reward:.2f}"
                )

        return True

    def _on_training_end(self) -> None:
        """Close the progress bar if it is still open."""
        if self.pbar is not None:
            self.pbar.close()
            self.pbar = None

# Example usage: train a Stable-Baselines3 DQN model on the SAR environment with the
# tqdm callback defined above, then save it as "dqn_sar_robot".
if __name__ == "__main__":
    # Create environment
    env = SARrobotEnv(
        grid_rows=4,
        grid_cols=4,
        info_number_needed=1,
        sparse_reward=False,
        reward_shaping=False,
        attention=False,
        hierarchical=False,
        render_mode='None'
    )
    
    # Set up total timesteps (used both as the progress-bar total and as the training budget)
    total_timesteps = 100000
    
    # Create the callback
    progress_callback = TqdmProgressCallback(
        total_timesteps=total_timesteps,
        log_interval=100  # Log every 100 episodes
    )

    # Create the model with custom neural network architecture
    policy_kwargs = dict(
        net_arch=[64, 64],  # two hidden layers with 64 units each
    )
    
    # Create the DQN agent (training happens in model.learn below)
    model = DQN(
        "MlpPolicy", 
        env, 
        learning_rate=0.001,
        buffer_size=2000,
        learning_starts=1000,
        batch_size=64,
        gamma=0.95,
        target_update_interval=100,
        exploration_initial_eps=1.0,
        exploration_fraction=0.3,
        exploration_final_eps=0.01,
        policy_kwargs=policy_kwargs,  # Add the custom architecture here
        verbose=1,
        tensorboard_log="./dqn_sar_tensorboard/"
    )

    # Alternative PPO configuration, kept disabled. Enabling it would rebind
    # `policy_kwargs` and `model` defined above.
    # # Define custom network architecture
    # policy_kwargs = dict(
    #     net_arch=[dict(pi=[64, 64], vf=[64, 64])]  # Separate networks for policy and value
    # )

    # # Create and train the PPO agent
    # model = PPO(
    #     "MlpPolicy", 
    #     env,
    #     learning_rate=1e-3,
    #     n_steps=2048,           # Steps to run for each environment per update
    #     batch_size=256,          # Minibatch size for optimization
    #     n_epochs=10,            # Number of epochs to optimize for
    #     gamma=0.99,             # Discount factor
    #     gae_lambda=0.95,        # Factor for trade-off of bias vs variance for GAE
    #     clip_range=0.2,         # Clipping parameter for PPO
    #     ent_coef=0.1,          # Entropy coefficient for exploration
    #     vf_coef=0.5,            # Value function coefficient
    #     max_grad_norm=0.5,      # Clipping of gradients
    #     policy_kwargs=policy_kwargs,
    #     tensorboard_log="./dqn_sar_tensorboard/"
    # )
    
    # Train with the callback
    model.learn(
        total_timesteps=total_timesteps,
        callback=progress_callback
    )
    
    # Save the model (the evaluation cell loads it back under the same name)
    model.save("dqn_sar_robot")
    
    print("Training completed!")

In [None]:
from stable_baselines3 import DQN
from tqdm.auto import tqdm
from environment_sar import SARrobotEnv
import numpy as np
import time

def visualize_evaluation(model, env, n_eval_episodes=10, render=False):
    """
    Run a model for a number of episodes, printing every step and a final summary.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that returns
            ``(action, state)``.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose ``step(action)``
            returns ``(obs, reward, terminated, truncated, info)``. If it has a
            ``_get_action_name`` attribute, that is used to label actions in the printout.
        n_eval_episodes: Number of episodes to run.
        render: When True, ``env.render()`` is called before every step, followed by a
            0.5 s pause.

    Returns:
        Tuple ``(mean_reward, std_reward)`` over the per-episode total rewards.
    """
    all_rewards = []

    for episode in range(n_eval_episodes):
        print(f"\n--- Episode {episode+1}/{n_eval_episodes} ---")
        obs, _ = env.reset()
        done = False
        total_reward = 0
        step = 0

        while not done:
            # Get action from model
            action, _states = model.predict(obs, deterministic=True)

            # Get action name if available
            action_name = f"Action {action}"
            if hasattr(env, '_get_action_name'):
                action_name = env._get_action_name(None, action)

            # Print current step info
            print(f"Step {step+1}: Taking {action_name}")

            # Render if requested
            if render:
                env.render()
                time.sleep(0.5)  # Add delay to make rendering visible

            # Take action. The episode ends on termination OR truncation; the truncation
            # flag used to be discarded, so a truncated episode never left this loop.
            obs, reward, terminated, truncated, _ = env.step(action)
            done = bool(terminated) or bool(truncated)
            total_reward += reward

            # Print result of action
            print(f"  Reward: {reward:.2f}, Total: {total_reward:.2f}")

            step += 1

        # Episode summary
        print(f"Episode {episode+1} finished with reward: {total_reward:.2f} in {step} steps")
        all_rewards.append(total_reward)

    # Overall statistics
    mean_reward = np.mean(all_rewards)
    std_reward = np.std(all_rewards)
    print(f"\nEvaluation complete: {n_eval_episodes} episodes")
    print(f"Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

    return mean_reward, std_reward

# Evaluation driver: build an environment, load the saved DQN model and run one
# fully printed episode through visualize_evaluation.
# NOTE(review): render_mode is 'none' here while every other cell passes 'None' — confirm
# which spelling SARrobotEnv expects. Set it to 'human' to see visualization.
eval_env = SARrobotEnv(grid_rows=4, grid_cols=4, info_number_needed=1,
                       sparse_reward=False, reward_shaping=False,
                       attention=False, hierarchical=False,
                       render_mode='none')

# Load the model written by the training cell.
model = DQN.load("dqn_sar_robot")

# A single episode keeps the step-by-step printout readable; render=True makes the
# function call env.render() before every step.
mean_reward, std_reward = visualize_evaluation(model, eval_env, n_eval_episodes=1, render=True)