In [None]:
import copy
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.collections import PolyCollection

%matplotlib inline

In [None]:
# Load Evaluation Results
# One pickle per ablation run, keyed by a short name used throughout the notebook.
paths = dict(
    baseline=Path("forecaster_ablation_oracle_baseline.pkl"),
    oracle_target=Path("forecaster_ablation_oracle_target.pkl"),
    self_target=Path("forecaster_ablation_self_target.pkl"),
)

# Missing files are reported and skipped, so `data` may hold fewer keys than `paths`.
data = {}
for key, path in paths.items():
    try:
        with path.open('rb') as pkl_file:
            loaded = pickle.load(pkl_file)
    except FileNotFoundError:
        print(f"Warning: {path} not found. Some plots may be empty.")
    else:
        data[key] = loaded
        print(f"Loaded {path}")

def merge_round_data(sources, results=None):
    """Merge the round-by-round metrics of several result sets into one mapping.

    Parameters
    ----------
    sources : iterable of str
        Keys of the result sets to merge, processed in order. Keys that are
        missing from ``results`` (e.g. a pickle that failed to load) or that
        have no ``'round_by_round_data'`` entry are skipped silently.
    results : dict, optional
        Mapping ``{source_key: loaded_result}``. Defaults to the module-level
        ``data`` dict filled by the loading cell.

    Returns
    -------
    dict
        ``{mode: per-round data}``. When the same mode occurs in more than one
        source, the later source in ``sources`` wins.

    Notes
    -----
    Every entry is deep-copied. The merged datasets are cleaned in place
    later on, and several merged datasets are built from the same baseline
    source; without the copy they would share (and mutate) the same objects.
    """
    if results is None:
        results = data

    merged = {}
    for source_key in sources:
        if source_key not in results:
            continue
        source = results[source_key]
        if 'round_by_round_data' not in source:
            continue
        for mode, mode_data in source['round_by_round_data'].items():
            # Independent copy: in-place edits of one merged dataset must not
            # leak into another dataset or into the loaded results.
            merged[mode] = copy.deepcopy(mode_data)
    return merged

# Two comparison datasets, each made of the shared baseline results plus the
# results evaluated against one target:
#   1. VS ORACLE -> 'oracle_target'
#   2. VS SELF   -> 'self_target'
round_data_vs_oracle, round_data_vs_self = (
    merge_round_data(['baseline', target_key])
    for target_key in ('oracle_target', 'self_target')
)

In [None]:
# Clean MAPE data - drop values above a threshold (likely division-by-zero artefacts)
def clean_mape_data(round_data, threshold=200):
    """Filter oversized MAPE samples out of ``round_data`` in place.

    Walks every ``{mode: {round: metrics}}`` entry and, where a ``'mape'``
    list is present, replaces it with a new list that keeps a sample only if
    it is ``None``, NaN, or not greater than ``threshold``.

    Returns the total number of samples that were dropped.
    """
    def _keep(sample):
        return sample is None or np.isnan(sample) or sample <= threshold

    removed = 0
    for rounds in round_data.values():
        for metrics in rounds.values():
            if 'mape' not in metrics:
                continue
            before = metrics['mape']
            kept = list(filter(_keep, before))
            removed += len(before) - len(kept)
            metrics['mape'] = kept
    return removed

# Apply cleaning to both datasets
# The cut-off is named once so the filter and the log messages below always
# report the same value.
MAPE_OUTLIER_THRESHOLD = 200

cleaned_oracle = clean_mape_data(round_data_vs_oracle, threshold=MAPE_OUTLIER_THRESHOLD)
cleaned_self = clean_mape_data(round_data_vs_self, threshold=MAPE_OUTLIER_THRESHOLD)

print(f"Removed {cleaned_oracle} MAPE values > {MAPE_OUTLIER_THRESHOLD} from vs_oracle dataset")
print(f"Removed {cleaned_self} MAPE values > {MAPE_OUTLIER_THRESHOLD} from vs_self dataset")

In [None]:
def plot_results(round_data, target_name, metric='rmse', ylabel='RMSE', show_std=False):
    """Plot the per-round mean of ``metric`` as one line per forecaster mode.

    Parameters
    ----------
    round_data : dict
        Nested mapping ``{mode: {round: {metric_name: [values, ...]}}}``.
    target_name : str
        Name of the comparison target; used in the title and in messages.
    metric : str
        Key of the value list to read from each round (default ``'rmse'``).
    ylabel : str
        Label for the y-axis.
    show_std : bool
        If True, shade a band of +/- one standard deviation around each mean
        line. Off by default.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If there is nothing to plot,
        a message is printed and no figure is created.
    """
    if not round_data:
        print(f"No data for {target_name} ({metric})")
        return

    colors = {
        'Oracle': 'black',
        'ClassicFrequency': 'blue',
        'CUHK': 'red',
        'Bayesian': 'green',
        'WindowedFrequency': 'purple',
        'ConflictBased': 'orange',
        'StepwiseCOMB': 'brown',
        'ExpectationCOMB': 'cyan'
    }

    # Sort modes (Oracle first)
    modes = sorted(round_data)
    if 'Oracle' in modes:
        modes.remove('Oracle')
        modes.insert(0, 'Oracle')

    # Aggregate before touching matplotlib, so that an input without any
    # usable value produces a message instead of an empty figure.
    curves = []
    for mode in modes:
        mode_rounds = round_data[mode]
        valid_rounds, means, stds = [], [], []
        for r in sorted(mode_rounds):
            if metric not in mode_rounds[r]:
                continue
            # Drop missing samples (None / NaN) before averaging
            vals = [v for v in mode_rounds[r][metric] if v is not None and not np.isnan(v)]
            if vals:
                valid_rounds.append(r)
                means.append(np.mean(vals))
                stds.append(np.std(vals))
        if valid_rounds:
            curves.append((mode, valid_rounds, np.array(means), np.array(stds)))

    if not curves:
        print(f"No valid data for {target_name} ({metric})")
        return

    fig, ax = plt.subplots(figsize=(14, 8))

    for mode, valid_rounds, means, stds in curves:
        is_oracle = mode == 'Oracle'
        color = colors.get(mode, 'gray')
        # The Oracle reference is drawn dashed and thicker than the other modes
        ax.plot(
            valid_rounds,
            means,
            label='Oracle (Baseline)' if is_oracle else mode,
            color=color,
            linestyle='--' if is_oracle else '-',
            linewidth=3 if is_oracle else 1.5,
        )
        if show_std:
            ax.fill_between(valid_rounds, means - stds, means + stds, color=color, alpha=0.1)

    ax.set_title(f"Forecaster Performance: {metric.upper()} vs {target_name.upper()}", fontsize=16)
    ax.set_xlabel("Negotiation Round", fontsize=14)
    ax.set_ylabel(ylabel, fontsize=14)
    # Legend sits outside the axes on the right
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
def plot_violinplots(round_data, target_name, metric='rmse', ylabel='RMSE', ylim=None):
    """Draw one violin per forecaster mode from ``metric`` values pooled over all rounds.

    Parameters
    ----------
    round_data : dict
        Nested mapping ``{mode: {round: {metric_name: [values, ...]}}}``.
    target_name : str
        Name of the comparison target; used in the title and in messages.
    metric : str
        Key of the value list to read from each round (default ``'rmse'``).
    ylabel : str
        Label for the y-axis.
    ylim : tuple, optional
        Passed to ``Axes.set_ylim`` when given; values outside the window are
        not visible.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If there is nothing to plot,
        a message is printed and no figure is created.
    """
    if not round_data:
        print(f"No data for {target_name} ({metric})")
        return

    # Aggregate all values across all rounds for each mode
    aggregated_data = {}
    for mode, mode_rounds in round_data.items():
        all_values = []
        for metrics in mode_rounds.values():
            if metric in metrics:
                # Drop missing samples (None / NaN)
                all_values.extend(
                    v for v in metrics[metric] if v is not None and not np.isnan(v)
                )
        if all_values:
            aggregated_data[mode] = all_values

    if not aggregated_data:
        print(f"No valid data for {target_name} ({metric})")
        return

    # Sort modes (Oracle first)
    modes = sorted(aggregated_data)
    if 'Oracle' in modes:
        modes.remove('Oracle')
        modes.insert(0, 'Oracle')

    # Long format: one row per observation, in `modes` order so that the
    # x-axis categories appear in the same order as the palette below.
    df_plot = pd.DataFrame(
        [
            {'Mode': 'Oracle (Baseline)' if mode == 'Oracle' else mode, 'Value': value}
            for mode in modes
            for value in aggregated_data[mode]
        ]
    )

    # Color mapping
    colors = {
        'Oracle': 'black',
        'ClassicFrequency': 'blue',
        'CUHK': 'red',
        'Bayesian': 'green',
        'WindowedFrequency': 'purple',
        'ConflictBased': 'orange',
        'StepwiseCOMB': 'brown',
        'ExpectationCOMB': 'cyan'
    }
    palette = [colors.get(mode, 'gray') for mode in modes]

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.violinplot(data=df_plot, x='Mode', y='Value', palette=palette,
                   inner='box', linewidth=1.5, cut=0, ax=ax)

    # Highlight Oracle with different styling.
    # Violin bodies are picked by type rather than by position: depending on
    # the seaborn version, ax.collections may or may not also contain one
    # scatter collection per violin for the median, so slicing with [::2] can
    # style the wrong violins.
    violin_bodies = [c for c in ax.collections if isinstance(c, PolyCollection)]
    for patch, mode in zip(violin_bodies, modes):
        if mode == 'Oracle':
            patch.set_alpha(0.8)
            patch.set_edgecolor('black')
            patch.set_linewidth(2)
        else:
            patch.set_alpha(0.6)

    ax.set_title(f"Forecaster Performance Distribution: {metric.upper()} vs {target_name.upper()}", fontsize=16)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.set_xlabel("Forecaster Mode", fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

    # Set y-axis limits if provided
    if ylim is not None:
        ax.set_ylim(ylim)

    fig.tight_layout()
    plt.show()

In [None]:
def plot_violinplots_end_of_session(round_data, target_name, metric='rmse', ylabel='RMSE', ylim=None):
    """Draw one violin per forecaster mode using only that mode's last round.

    "Last round" is the highest round key present for the mode in
    ``round_data``; a mode is left out when that round has no ``metric``
    entry or no valid (non-None, non-NaN) value.

    Parameters
    ----------
    round_data : dict
        Nested mapping ``{mode: {round: {metric_name: [values, ...]}}}``.
    target_name : str
        Name of the comparison target; used in the title and in messages.
    metric : str
        Key of the value list to read from the last round (default ``'rmse'``).
    ylabel : str
        Label for the y-axis.
    ylim : tuple, optional
        Passed to ``Axes.set_ylim`` when given; values outside the window are
        not visible.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. If there is nothing to plot,
        a message is printed and no figure is created.
    """
    if not round_data:
        print(f"No data for {target_name} ({metric})")
        return

    # Get only the last round data for each mode
    end_session_data = {}
    for mode, mode_rounds in round_data.items():
        if not mode_rounds:
            continue
        last_round_metrics = mode_rounds[max(mode_rounds)]
        if metric not in last_round_metrics:
            continue
        # Drop missing samples (None / NaN)
        vals = [v for v in last_round_metrics[metric] if v is not None and not np.isnan(v)]
        if vals:
            end_session_data[mode] = vals

    if not end_session_data:
        print(f"No valid end-of-session data for {target_name} ({metric})")
        return

    # Sort modes (Oracle first)
    modes = sorted(end_session_data)
    if 'Oracle' in modes:
        modes.remove('Oracle')
        modes.insert(0, 'Oracle')

    # Long format: one row per observation, in `modes` order so that the
    # x-axis categories appear in the same order as the palette below.
    df_plot = pd.DataFrame(
        [
            {'Mode': 'Oracle (Baseline)' if mode == 'Oracle' else mode, 'Value': value}
            for mode in modes
            for value in end_session_data[mode]
        ]
    )

    # Color mapping
    colors = {
        'Oracle': 'black',
        'ClassicFrequency': 'blue',
        'CUHK': 'red',
        'Bayesian': 'green',
        'WindowedFrequency': 'purple',
        'ConflictBased': 'orange',
        'StepwiseCOMB': 'brown',
        'ExpectationCOMB': 'cyan'
    }
    palette = [colors.get(mode, 'gray') for mode in modes]

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.violinplot(data=df_plot, x='Mode', y='Value', palette=palette,
                   inner='box', linewidth=1.5, cut=0, ax=ax)

    # Highlight Oracle with different styling.
    # Violin bodies are picked by type rather than by position: depending on
    # the seaborn version, ax.collections may or may not also contain one
    # scatter collection per violin for the median, so slicing with [::2] can
    # style the wrong violins.
    violin_bodies = [c for c in ax.collections if isinstance(c, PolyCollection)]
    for patch, mode in zip(violin_bodies, modes):
        if mode == 'Oracle':
            patch.set_alpha(0.8)
            patch.set_edgecolor('black')
            patch.set_linewidth(2)
        else:
            patch.set_alpha(0.6)

    ax.set_title(f"End-of-Session Forecaster Performance: {metric.upper()} vs {target_name.upper()}", fontsize=16)
    ax.set_ylabel(ylabel, fontsize=14)
    ax.set_xlabel("Forecaster Mode", fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

    # Set y-axis limits if provided
    if ylim is not None:
        ax.set_ylim(ylim)

    fig.tight_layout()
    plt.show()

In [None]:
# Line plot: mean RMSE per negotiation round for each forecaster mode,
# on the vs-Oracle dataset (baseline + oracle_target results).
plot_results(round_data_vs_oracle, "Oracle (Actual Future)", metric='rmse', ylabel='RMSE (Utility)')

In [None]:
# Line plot: mean RMSE per negotiation round for each forecaster mode,
# on the vs-Self dataset (baseline + self_target results).
plot_results(round_data_vs_self, "Self (Opponent Model Expectation)", metric='rmse', ylabel='RMSE (Utility)')

In [None]:
# Line plot: mean MAPE per negotiation round for each forecaster mode,
# on the vs-Oracle dataset (MAPE lists were filtered by clean_mape_data above).
plot_results(round_data_vs_oracle, "Oracle (Actual Future)", metric='mape', ylabel='MAPE (%)')

In [None]:
# Line plot: mean MAPE per negotiation round for each forecaster mode,
# on the vs-Self dataset (MAPE lists were filtered by clean_mape_data above).
plot_results(round_data_vs_self, "Self (Opponent Model Expectation)", metric='mape', ylabel='MAPE (%)')

## Violin Plots

The following violin plots show the distribution of forecaster performance across all negotiation rounds. Each violin displays:
- **Width**: Probability density at each value (wider = more data points at that value)
- **Inner box**: Interquartile range, with a marker at the median
- **Thin lines**: Whiskers extending to the most extreme data points within 1.5 × IQR of the quartiles
- **Shape**: Full distribution of the data (shows multimodality, skewness, etc.)

In [None]:
# Violin Plot 1: RMSE vs Oracle
# Values are pooled over all rounds. The y-axis is fixed to (0, 1); anything
# outside that window is not visible.
plot_violinplots(round_data_vs_oracle, "Oracle (Actual Future)", metric='rmse', ylabel='RMSE (Utility)', ylim=(0, 1))

In [None]:
# Violin Plot 2: RMSE vs Self
# Values are pooled over all rounds. The y-axis is fixed to (0, 1); anything
# outside that window is not visible.
plot_violinplots(round_data_vs_self, "Self (Opponent Model Expectation)", metric='rmse', ylabel='RMSE (Utility)', ylim=(0, 1))

In [None]:
# Violin Plot 3: MAPE vs Oracle
# Values are pooled over all rounds. The y-axis is fixed to (0, 100), while
# clean_mape_data keeps values up to its threshold (200 by default), so part
# of the upper tail may lie outside the visible window.
plot_violinplots(round_data_vs_oracle, "Oracle (Actual Future)", metric='mape', ylabel='MAPE (%)', ylim=(0, 100))

In [None]:
# Violin Plot 4: MAPE vs Self
# Values are pooled over all rounds. The y-axis is fixed to (0, 100), while
# clean_mape_data keeps values up to its threshold (200 by default), so part
# of the upper tail may lie outside the visible window.
plot_violinplots(round_data_vs_self, "Self (Opponent Model Expectation)", metric='mape', ylabel='MAPE (%)', ylim=(0, 100))

## Violin Plots - End of Session Only

The following violin plots show the distribution of forecaster performance **only from the last round** recorded for each forecaster mode (the highest round number present in that mode's data). This provides insight into how well each forecaster performs at the end of negotiations, when the most information is available.

In [None]:
# End-of-Session Violin Plot 1: RMSE vs Oracle
# Uses only the highest-numbered round available for each mode; y-axis fixed to (0, 1).
plot_violinplots_end_of_session(round_data_vs_oracle, "Oracle (Actual Future)", metric='rmse', ylabel='RMSE (Utility)', ylim=(0, 1))

In [None]:
# End-of-Session Violin Plot 2: RMSE vs Self
# Uses only the highest-numbered round available for each mode; y-axis fixed to (0, 1).
plot_violinplots_end_of_session(round_data_vs_self, "Self (Opponent Model Expectation)", metric='rmse', ylabel='RMSE (Utility)', ylim=(0, 1))

In [None]:
# End-of-Session Violin Plot 3: MAPE vs Oracle
# Uses only the highest-numbered round available for each mode. The y-axis is
# fixed to (0, 100); MAPE values between 100 and the cleaning threshold are
# not visible.
plot_violinplots_end_of_session(round_data_vs_oracle, "Oracle (Actual Future)", metric='mape', ylabel='MAPE (%)', ylim=(0, 100))

In [None]:
# End-of-Session Violin Plot 4: MAPE vs Self
# Uses only the highest-numbered round available for each mode. The y-axis is
# fixed to (0, 100); MAPE values between 100 and the cleaning threshold are
# not visible.
plot_violinplots_end_of_session(round_data_vs_self, "Self (Opponent Model Expectation)", metric='mape', ylabel='MAPE (%)', ylim=(0, 100))

## Significance Tests

In [None]:
import sys
import os
# Make the `strategy` package importable: append the directory two levels above
# the current working directory ('../..') to sys.path.
# NOTE(review): this depends on the kernel's working directory — presumably the
# notebook is run from strategy/ablations/; confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

from strategy.ablations.significance_tests import iterative_compare_models_per_metric, prepare_data

def run_significance_analysis(name, df):
    """Run the pairwise model comparison for one scenario and export the result.

    ``name`` labels the scenario in the printed messages and in the output
    file name ``significance_{name}.xlsx``. ``df`` is the prepared frame; when
    it is ``None`` or empty the scenario is skipped with a message.

    The ``'rmse'`` and ``'mape'`` columns (each cell a list of values) are
    negated on a copy before the comparison. These are error metrics to be
    minimised; presumably the comparison routine treats larger values as
    better — confirm against significance_tests.
    """
    has_data = df is not None and not df.empty
    if not has_data:
        print(f"Skipping {name}: No data")
        return

    print(f"\nRunning significance tests for: {name}")

    # Work on a copy so the caller's frame keeps its original sign
    negated_df = df.copy()
    for error_metric in ('rmse', 'mape'):
        if error_metric in negated_df.columns:
            negated_df[error_metric] = negated_df[error_metric].apply(
                lambda samples: [-sample for sample in samples]
            )

    results_df = iterative_compare_models_per_metric(negated_df)

    output_file = f"significance_{name}.xlsx"
    results_df.to_excel(output_file, index=False)
    print(f"Saved results to {output_file}")
    display(results_df) # For notebook display

# Prepare all data
# NOTE(review): judging by the variable names, the second argument of
# prepare_data selects end-of-session-only data (True) vs. all rounds (False)
# — confirm in strategy/ablations/significance_tests.
all_rounds_vs_oracle = prepare_data(round_data_vs_oracle, False)
all_rounds_vs_self = prepare_data(round_data_vs_self, False)
end_session_vs_oracle = prepare_data(round_data_vs_oracle, True)
end_session_vs_self = prepare_data(round_data_vs_self, True)

# Run for all scenarios
# The dict key becomes the scenario name, and therefore part of the output
# file name written by run_significance_analysis.
scenarios = {
    "end_session_vs_self": end_session_vs_self,
    "end_session_vs_oracle": end_session_vs_oracle,
    "all_rounds_vs_self": all_rounds_vs_self,
    "all_rounds_vs_oracle": all_rounds_vs_oracle
}

for name, df in scenarios.items():
    run_significance_analysis(name, df)
