# LLM-PEFT-PPM Results Analysis

This notebook analyzes the results of the LLM-PEFT-PPM replication experiments. It covers:

- Performance metric extraction and comparison
- Statistical significance testing
- Comparison with original paper results
- Visualization of findings

---

## 1. Setup and Data Loading

In [None]:
import os
import sys
import json
import re
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set up plotting
# The base style must be applied first: plt.style.use('default') resets
# rcParams (figure.figsize included) to matplotlib's defaults, so any
# customisation made before it would be silently discarded.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_palette("husl")

# Project setup
# The project root is either the current directory (when it already holds
# "replication_results") or its "llm-peft-ppm-replication" sub-directory.
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "replication_results").exists():
    candidate_root = PROJECT_ROOT / "llm-peft-ppm-replication"
    if candidate_root.is_dir():
        PROJECT_ROOT = candidate_root
        os.chdir(PROJECT_ROOT)
    else:
        # Do not chdir into a directory that does not exist; stay where we
        # are so the rest of the notebook can report "no results" cleanly.
        print(f"Project directory not found: {candidate_root} (using {PROJECT_ROOT})")

RESULTS_DIR = PROJECT_ROOT / "replication_results"
PLOTS_DIR = RESULTS_DIR / "plots"
# parents=True because RESULTS_DIR itself may not have been created yet.
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Working directory: {PROJECT_ROOT}")
print(f"Results directory: {RESULTS_DIR}")
print(f"Plots directory: {PLOTS_DIR}")

In [None]:
# Load experiment results
def load_results(filename):
    """Load one JSON results file from RESULTS_DIR.

    Args:
        filename: Name of the file, relative to RESULTS_DIR.

    Returns:
        The decoded JSON content, or an empty dict when the file is
        missing, unreadable, or not valid JSON (a notice is printed in
        each of those cases).
    """
    filepath = RESULTS_DIR / filename
    if not filepath.exists():
        print(f" File not found: {filename}")
        return {}

    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f" Could not read {filename}: {e}")
        return {}

# Load all result files (RNN baseline, LLM-PEFT, competitor), in that order
rnn_results, llm_results, competitor_results = map(
    load_results,
    (
        "rnn_baseline_results.json",
        "llm_peft_results.json",
        "competitor_baseline_results.json",
    ),
)

# Report how many top-level entries each file contributed
print("Loaded results:")
for label, loaded in (
    ("RNN", rnn_results),
    ("LLM", llm_results),
    ("Competitor", competitor_results),
):
    print(f"  - {label} experiments: {len(loaded)} datasets")

## 2. Performance Metrics Extraction

In [None]:
def extract_metrics_from_log(log_file_path):
    """
    Extract performance metrics from an experiment log file.

    The log text is scanned with regular expressions and the LAST match of
    each pattern is used:

    - "Test accuracy: <num>"          -> 'next_activity_accuracy'
    - "Test MSE: <num>"               -> 'remaining_time_mse'
    - "Training time: <num> seconds"  -> 'training_time'
    - "Epoch <int>"                   -> 'epochs_completed' (largest value)
    - "Final ... accuracy: <num> ... mse: <num>" overrides the accuracy
      and MSE found above.

    Args:
        log_file_path: Path (str or Path) of the log file. A falsy value
            (None, "") is treated like a missing file.

    Returns:
        dict with whichever of the keys above could be extracted; empty
        when the file is missing or unreadable.
    """
    metrics = {}

    # Guard first: Path(None) would raise TypeError.
    if not log_file_path:
        return metrics
    log_path = Path(log_file_path)
    if not log_path.is_file():
        return metrics

    try:
        # errors="replace": stray bytes in a log must not abort parsing.
        log_content = log_path.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        print(f"Error extracting metrics from {log_file_path}: {e}")
        return metrics

    # Unsigned decimal with optional fraction and exponent. Unlike a bare
    # "[0-9.]+" this cannot swallow a sentence-ending period ("0.95.") and
    # keeps scientific notation intact ("1.5e-05").
    number = r"[0-9]+(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?"

    def last_match(pattern, flags=re.IGNORECASE):
        """Return the last match of *pattern* in the log, or None."""
        found = re.findall(pattern, log_content, flags)
        return found[-1] if found else None

    try:
        # Accuracy for next activity prediction (last reported value)
        accuracy = last_match(rf"Test accuracy[:\s]+({number})")
        if accuracy is not None:
            metrics['next_activity_accuracy'] = float(accuracy)

        # MSE for remaining time prediction (last reported value)
        mse = last_match(rf"Test MSE[:\s]+({number})")
        if mse is not None:
            metrics['remaining_time_mse'] = float(mse)

        # Training time
        train_time = last_match(rf"Training time[:\s]+({number})\s*(?:seconds|s)")
        if train_time is not None:
            metrics['training_time'] = float(train_time)

        # Epoch information (case-sensitive)
        epoch_matches = re.findall(r"Epoch\s+([0-9]+)", log_content)
        if epoch_matches:
            metrics['epochs_completed'] = max(int(e) for e in epoch_matches)

        # Final evaluation metrics take precedence over the values above
        final = last_match(
            rf"Final\s+(?:test\s+)?(?:results?[:\s]+)?.*accuracy[:\s]+({number}).*mse[:\s]+({number})",
            re.IGNORECASE | re.DOTALL,
        )
        if final is not None:
            metrics['next_activity_accuracy'] = float(final[0])
            metrics['remaining_time_mse'] = float(final[1])

    except ValueError as e:
        print(f"Error extracting metrics from {log_file_path}: {e}")

    return metrics


def _result_row(exp_type, dataset, model, configuration, result, peft_strategy=None):
    """Build one DataFrame row (a dict) from a single result record."""
    row = {
        'experiment_type': exp_type,
        'dataset': dataset,
        'model': model,
    }
    if peft_strategy is not None:
        row['peft_strategy'] = peft_strategy
    row['configuration'] = configuration
    row['status'] = result['status']
    row['runtime'] = result.get('runtime', 0)

    # Extract metrics from the log file if one is recorded; a null/empty
    # 'log_file' entry is treated as "no log".
    log_file = result.get('log_file')
    if log_file:
        row.update(extract_metrics_from_log(log_file))
    return row


def build_results_dataframe(all_results):
    """
    Build a comprehensive DataFrame from all experiment results.

    Args:
        all_results: Mapping of experiment type -> results, where results
            maps dataset -> configuration -> record. A record containing a
            "status" key is a direct result; otherwise it is treated as a
            mapping of strategy name -> record (nested LLM results).
            Entries that are not dicts are skipped.

    Returns:
        DataFrame with one row per result record. Columns are
        experiment_type, dataset, model, configuration, status, runtime,
        plus peft_strategy for nested results and any metrics extracted
        from the record's log file. With no records at all the frame is
        empty but still carries the six base columns, so column lookups
        such as df['status'] keep working.
    """
    rows = []

    for exp_type, results in all_results.items():
        if not isinstance(results, dict):
            continue
        for dataset_name, dataset_results in results.items():
            if not isinstance(dataset_results, dict):
                continue
            for config_name, config_results in dataset_results.items():
                if not isinstance(config_results, dict):
                    continue

                if "status" in config_results:  # Direct result
                    model = 'RNN' if exp_type == 'RNN' else config_name
                    rows.append(_result_row(
                        exp_type, dataset_name, model, config_name, config_results
                    ))
                    continue

                # Nested results (LLM experiments): one record per strategy
                for strategy_name, strategy_results in config_results.items():
                    if isinstance(strategy_results, dict) and "status" in strategy_results:
                        rows.append(_result_row(
                            exp_type,
                            dataset_name,
                            config_name,
                            f"{config_name}_{strategy_name}",
                            strategy_results,
                            peft_strategy=strategy_name,
                        ))

    if not rows:
        return pd.DataFrame(columns=[
            'experiment_type', 'dataset', 'model',
            'configuration', 'status', 'runtime',
        ])
    return pd.DataFrame(rows)


# Build comprehensive results DataFrame
all_results = {
    'RNN': rnn_results,
    'LLM-PEFT': llm_results,
    'Competitor': competitor_results
}

results_df = build_results_dataframe(all_results)

print(f"\n Results DataFrame created with {len(results_df)} experiments")
print(f"Columns: {list(results_df.columns)}")
print(f"\nExperiment status distribution:")
# When no experiments were loaded the frame may have no 'status' column;
# indexing it unconditionally would raise KeyError.
if 'status' in results_df.columns:
    print(results_df['status'].value_counts())
else:
    print("  (no experiments loaded)")

In [None]:
# Preview the loaded results and list which metrics were actually extracted
if results_df.empty:
    print(" No results found. Please run experiments first.")
else:
    print("\n Sample Results:")
    print(results_df.head(10))

    # Keep only the experiments that finished successfully
    success_mask = results_df['status'] == 'success'
    successful_df = results_df.loc[success_mask]
    print(f"\n Successful experiments: {len(successful_df)}/{len(results_df)}")

    if len(successful_df.index) > 0:
        print("\nMetrics available for successful experiments:")
        metric_columns = ['next_activity_accuracy', 'remaining_time_mse', 'training_time', 'epochs_completed']

        # A metric counts as available when its column exists and holds at
        # least one non-null value among the successful runs
        available_metrics = []
        for col in metric_columns:
            if col in successful_df.columns and successful_df[col].notna().any():
                available_metrics.append(col)
        print(f"Available metrics: {available_metrics}")

        if available_metrics:
            print("\n📈 Sample metrics:")
            preview_columns = ['dataset', 'model', 'configuration'] + available_metrics
            sample_metrics = successful_df[preview_columns].head(5)
            print(sample_metrics)

## 3. Performance Comparison Tables

In [None]:
# Create performance comparison tables
# Summaries are computed over successful experiments only and written to
# CSV files in RESULTS_DIR.
if not results_df.empty and len(results_df[results_df['status'] == 'success']) > 0:
    
    successful_df = results_df[results_df['status'] == 'success'].copy()
    
    # Next Activity Prediction Performance
    if 'next_activity_accuracy' in successful_df.columns:
        print("\n Next Activity Prediction Performance")
        print("=" * 60)
        
        # Best performance by dataset and model type
        na_performance = successful_df.groupby(['dataset', 'experiment_type'])['next_activity_accuracy'].agg(['max', 'mean', 'std']).round(4)
        print(na_performance)
        
        # Save to CSV
        na_performance.to_csv(RESULTS_DIR / "next_activity_performance.csv")
    
    # Remaining Time Prediction Performance
    if 'remaining_time_mse' in successful_df.columns:
        print("\n Remaining Time Prediction Performance (MSE - lower is better)")
        print("=" * 70)
        
        # Best performance by dataset and model type
        rt_performance = successful_df.groupby(['dataset', 'experiment_type'])['remaining_time_mse'].agg(['min', 'mean', 'std']).round(4)
        print(rt_performance)
        
        # Save to CSV
        rt_performance.to_csv(RESULTS_DIR / "remaining_time_performance.csv")
    
    # Best performing configurations per dataset
    print("\n Best Performing Configurations per Dataset")
    print("=" * 60)
    
    for dataset in successful_df['dataset'].unique():
        dataset_df = successful_df[successful_df['dataset'] == dataset]
        
        print(f"\n {dataset}:")
        
        # A metric column exists as soon as ANY experiment reported it, so a
        # given dataset may have only NaN values; idxmax()/idxmin() on such
        # a series does not yield a usable row label, hence the dropna guard.
        if 'next_activity_accuracy' in dataset_df.columns:
            dataset_accuracy = dataset_df['next_activity_accuracy'].dropna()
            if not dataset_accuracy.empty:
                best_na = dataset_df.loc[dataset_accuracy.idxmax()]
                print(f"  Best Next Activity: {best_na['configuration']} (Acc: {best_na['next_activity_accuracy']:.4f})")
        
        if 'remaining_time_mse' in dataset_df.columns:
            dataset_mse = dataset_df['remaining_time_mse'].dropna()
            if not dataset_mse.empty:
                best_rt = dataset_df.loc[dataset_mse.idxmin()]
                print(f"  Best Remaining Time: {best_rt['configuration']} (MSE: {best_rt['remaining_time_mse']:.4f})")
    
    # Save detailed results
    successful_df.to_csv(RESULTS_DIR / "detailed_results.csv", index=False)
    print(f"\n Detailed results saved to: {RESULTS_DIR / 'detailed_results.csv'}")

else:
    print(" No successful experiments found for analysis.")

## 4. Statistical Analysis

In [None]:
# Statistical significance testing
# Compares successful RNN and LLM-PEFT runs (independent two-sample t-test)
# and PEFT strategies against each other (one-way ANOVA).
if not results_df.empty and len(results_df[results_df['status'] == 'success']) > 0:
    
    successful_df = results_df[results_df['status'] == 'success'].copy()
    
    print("\n Statistical Analysis")
    print("=" * 40)
    
    # Compare RNN vs LLM-PEFT performance
    if 'next_activity_accuracy' in successful_df.columns:
        rnn_scores = successful_df[successful_df['experiment_type'] == 'RNN']['next_activity_accuracy'].dropna()
        llm_scores = successful_df[successful_df['experiment_type'] == 'LLM-PEFT']['next_activity_accuracy'].dropna()
        
        # A t-test needs at least two observations per group
        if len(rnn_scores) > 1 and len(llm_scores) > 1:
            # Perform t-test
            t_stat, p_value = stats.ttest_ind(llm_scores, rnn_scores)
            
            print(f"\n🔬 Next Activity Prediction: LLM-PEFT vs RNN")
            print(f"  RNN Mean Accuracy: {rnn_scores.mean():.4f} ± {rnn_scores.std():.4f} (n={len(rnn_scores)})")
            print(f"  LLM Mean Accuracy: {llm_scores.mean():.4f} ± {llm_scores.std():.4f} (n={len(llm_scores)})")
            print(f"  T-statistic: {t_stat:.4f}")
            print(f"  P-value: {p_value:.4f}")
            print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'}")
    
    # PEFT strategy comparison
    if 'peft_strategy' in successful_df.columns and 'next_activity_accuracy' in successful_df.columns:
        llm_df = successful_df[successful_df['experiment_type'] == 'LLM-PEFT']
        
        print(f"\n PEFT Strategy Comparison")
        strategy_performance = llm_df.groupby('peft_strategy')['next_activity_accuracy'].agg(['count', 'mean', 'std']).round(4)
        print(strategy_performance)
        
        # ANOVA test for strategy differences
        strategies = llm_df['peft_strategy'].dropna().unique()
        strategy_groups = [llm_df[llm_df['peft_strategy'] == s]['next_activity_accuracy'].dropna() for s in strategies]
        strategy_groups = [g for g in strategy_groups if len(g) > 0]
        
        # One-way ANOVA is defined for two or more groups, and needs more
        # observations than groups (otherwise the within-group variance is
        # undefined and the result is NaN).
        n_observations = sum(len(g) for g in strategy_groups)
        if len(strategy_groups) >= 2 and n_observations > len(strategy_groups):
            f_stat, p_value = stats.f_oneway(*strategy_groups)
            print(f"\n  ANOVA F-statistic: {f_stat:.4f}")
            print(f"  P-value: {p_value:.4f}")
            print(f"  Significant strategy differences: {'Yes' if p_value < 0.05 else 'No'}")
    
    # Multi-task vs Single-task analysis (if data available)
    # Note: This would require additional information about task configuration
    
    # Training efficiency analysis
    if 'training_time' in successful_df.columns:
        print(f"\n Training Efficiency Analysis")
        efficiency = successful_df.groupby('experiment_type')['training_time'].agg(['count', 'mean', 'std']).round(2)
        print(efficiency)
        
        # Training time comparison
        rnn_times = successful_df[successful_df['experiment_type'] == 'RNN']['training_time'].dropna()
        llm_times = successful_df[successful_df['experiment_type'] == 'LLM-PEFT']['training_time'].dropna()
        
        if len(rnn_times) > 1 and len(llm_times) > 1:
            t_stat, p_value = stats.ttest_ind(llm_times, rnn_times)
            print(f"\n  Training Time Comparison (seconds):")
            print(f"    RNN: {rnn_times.mean():.1f} ± {rnn_times.std():.1f}")
            print(f"    LLM-PEFT: {llm_times.mean():.1f} ± {llm_times.std():.1f}")
            print(f"    P-value: {p_value:.4f}")

else:
    print(" Insufficient data for statistical analysis.")

## 5. Visualization

In [None]:
# Create visualizations
# Each figure is saved as a PNG in PLOTS_DIR and then shown inline.
if not results_df.empty and len(results_df[results_df['status'] == 'success']) > 0:
    
    successful_df = results_df[results_df['status'] == 'success'].copy()
    
    # Performance comparison plots
    if 'next_activity_accuracy' in successful_df.columns:
        
        # 1. Box plot comparison by experiment type
        plt.figure(figsize=(12, 6))
        sns.boxplot(data=successful_df, x='dataset', y='next_activity_accuracy', hue='experiment_type')
        plt.title('Next Activity Prediction Accuracy by Dataset and Method')
        plt.xticks(rotation=45)
        plt.ylabel('Accuracy')
        plt.tight_layout()
        plt.savefig(PLOTS_DIR / 'accuracy_comparison_boxplot.png', dpi=300, bbox_inches='tight')
        plt.show()
        
        # 2. PEFT strategy comparison
        if 'peft_strategy' in successful_df.columns:
            llm_df = successful_df[successful_df['experiment_type'] == 'LLM-PEFT']
            if not llm_df.empty:
                plt.figure(figsize=(12, 6))
                # No explicit `ci=95`: that keyword is deprecated in newer
                # seaborn releases, and a 95% confidence interval is the
                # default error bar, so the plot is unchanged.
                sns.barplot(data=llm_df, x='peft_strategy', y='next_activity_accuracy')
                plt.title('PEFT Strategy Performance Comparison')
                plt.xticks(rotation=45)
                plt.ylabel('Next Activity Accuracy')
                plt.tight_layout()
                plt.savefig(PLOTS_DIR / 'peft_strategy_comparison.png', dpi=300, bbox_inches='tight')
                plt.show()
    
    # Training time comparison
    if 'training_time' in successful_df.columns:
        plt.figure(figsize=(10, 6))
        sns.boxplot(data=successful_df, x='experiment_type', y='training_time')
        plt.title('Training Time Comparison')
        plt.ylabel('Training Time (seconds)')
        plt.yscale('log')  # Log scale for better visualization
        plt.tight_layout()
        plt.savefig(PLOTS_DIR / 'training_time_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    # Performance vs runtime scatter plot (one series per experiment type)
    if 'next_activity_accuracy' in successful_df.columns and 'runtime' in successful_df.columns:
        plt.figure(figsize=(10, 6))
        for exp_type in successful_df['experiment_type'].unique():
            type_df = successful_df[successful_df['experiment_type'] == exp_type]
            plt.scatter(type_df['runtime'], type_df['next_activity_accuracy'], 
                       label=exp_type, alpha=0.7, s=60)
        
        plt.xlabel('Runtime (seconds)')
        plt.ylabel('Next Activity Accuracy')
        plt.title('Performance vs Runtime')
        plt.legend()
        plt.xscale('log')
        plt.tight_layout()
        plt.savefig(PLOTS_DIR / 'performance_vs_runtime.png', dpi=300, bbox_inches='tight')
        plt.show()
    
    # Dataset performance heatmap
    if 'next_activity_accuracy' in successful_df.columns:
        # Create pivot table for heatmap: best accuracy per dataset/method
        pivot_data = successful_df.pivot_table(
            values='next_activity_accuracy', 
            index='dataset', 
            columns='experiment_type', 
            aggfunc='max'
        )
        
        if not pivot_data.empty:
            plt.figure(figsize=(10, 6))
            sns.heatmap(pivot_data, annot=True, fmt='.3f', cmap='viridis')
            plt.title('Best Accuracy by Dataset and Method')
            plt.tight_layout()
            plt.savefig(PLOTS_DIR / 'performance_heatmap.png', dpi=300, bbox_inches='tight')
            plt.show()
    
    print(f"\n Plots saved to: {PLOTS_DIR}")

else:
    print(" No successful experiments found for visualization.")

## 6. Comparison with Original Paper

In [None]:
# Original paper results (manually entered based on paper)
# Note: These would need to be updated with actual values from the paper

original_results = {
    "BPI12": {
        "RNN_baseline": {"accuracy": 0.85, "mse": 2.1},  # Example values
        "LLM_PEFT_best": {"accuracy": 0.92, "mse": 1.8},
        "S_NAP": {"accuracy": 0.88, "mse": 2.0}
    },
    "BPI17": {
        "RNN_baseline": {"accuracy": 0.82, "mse": 3.2},
        "LLM_PEFT_best": {"accuracy": 0.89, "mse": 2.9},
        "S_NAP": {"accuracy": 0.85, "mse": 3.1}
    }
    # Add more datasets as needed
}

print("\n Comparison with Original Paper Results")
print("=" * 50)
print("Note: Original paper values are examples and should be updated with actual results.")

if not results_df.empty and len(results_df[results_df['status'] == 'success']) > 0:
    successful_df = results_df[results_df['status'] == 'success'].copy()
    
    comparison_data = []
    
    for dataset in original_results.keys():
        if dataset in successful_df['dataset'].values:
            dataset_df = successful_df[successful_df['dataset'] == dataset]
            
            # Get our best results
            if 'next_activity_accuracy' in dataset_df.columns:
                # dropna so "no usable value" is always represented by an
                # empty series rather than by a NaN maximum
                rnn_results_ours = dataset_df[dataset_df['experiment_type'] == 'RNN']['next_activity_accuracy'].dropna()
                llm_results_ours = dataset_df[dataset_df['experiment_type'] == 'LLM-PEFT']['next_activity_accuracy'].dropna()
                
                our_best_rnn = rnn_results_ours.max() if not rnn_results_ours.empty else None
                our_best_llm = llm_results_ours.max() if not llm_results_ours.empty else None
                
                # Compare with original
                orig_rnn = original_results[dataset]["RNN_baseline"]["accuracy"]
                orig_llm = original_results[dataset]["LLM_PEFT_best"]["accuracy"]
                
                for method, original_value, our_value in (
                    ('RNN', orig_rnn, our_best_rnn),
                    ('LLM-PEFT', orig_llm, our_best_llm),
                ):
                    comparison_data.append({
                        'Dataset': dataset,
                        'Method': method,
                        'Original_Paper': original_value,
                        'Our_Replication': our_value,
                        # Explicit None check: an accuracy of 0.0 is falsy
                        # but still a valid value to compare.
                        'Difference': (our_value - original_value) if our_value is not None else None
                    })
    
    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)
        print("\n Replication Accuracy Comparison:")
        print(comparison_df.round(4))
        
        # Save comparison
        comparison_df.to_csv(RESULTS_DIR / "replication_comparison.csv", index=False)
        
        # Calculate replication fidelity over rows that have a difference
        valid_comparisons = comparison_df.dropna(subset=['Difference'])
        if not valid_comparisons.empty:
            mean_diff = valid_comparisons['Difference'].mean()
            std_diff = valid_comparisons['Difference'].std()
            
            print(f"\n Replication Fidelity:")
            print(f"  Mean difference: {mean_diff:.4f} ± {std_diff:.4f}")
            print(f"  Close replication (±0.05): {(abs(valid_comparisons['Difference']) <= 0.05).sum()}/{len(valid_comparisons)}")
    
else:
    print(" No replication results available for comparison.")

## 7. Key Findings Summary

In [None]:
# Generate key findings summary
# Findings are collected as plain strings, printed as a numbered list and
# written to key_findings.txt in RESULTS_DIR.
print("\n Key Findings Summary")
print("=" * 40)

findings = []

if not results_df.empty and len(results_df[results_df['status'] == 'success']) > 0:
    successful_df = results_df[results_df['status'] == 'success'].copy()
    
    # Overall success rate
    success_rate = len(successful_df) / len(results_df) * 100
    findings.append(f"Experiment success rate: {success_rate:.1f}% ({len(successful_df)}/{len(results_df)})")
    
    # Accuracy-based findings need the column AND at least one real value:
    # the column exists as soon as any experiment (successful or not)
    # reported it, so it can be all-NaN here, which breaks idxmax().
    has_accuracy = (
        'next_activity_accuracy' in successful_df.columns
        and successful_df['next_activity_accuracy'].notna().any()
    )
    
    # Performance comparison
    if has_accuracy:
        exp_performance = successful_df.groupby('experiment_type')['next_activity_accuracy'].agg(['mean', 'max']).round(4)
        
        for exp_type in exp_performance.index:
            findings.append(f"{exp_type} - Mean accuracy: {exp_performance.loc[exp_type, 'mean']:.4f}, Best: {exp_performance.loc[exp_type, 'max']:.4f}")
    
    # Best performing configurations
    if has_accuracy:
        overall_best = successful_df.loc[successful_df['next_activity_accuracy'].dropna().idxmax()]
        findings.append(f"Best overall performance: {overall_best['configuration']} ({overall_best['next_activity_accuracy']:.4f} accuracy on {overall_best['dataset']})")
    
    # PEFT strategy insights
    if has_accuracy and 'peft_strategy' in successful_df.columns:
        llm_df = successful_df[successful_df['experiment_type'] == 'LLM-PEFT']
        # Strategies without any accuracy value average to NaN; drop them
        strategy_means = llm_df.groupby('peft_strategy')['next_activity_accuracy'].mean().dropna()
        if not strategy_means.empty:
            best_strategy = strategy_means.idxmax()
            best_strategy_score = strategy_means.max()
            findings.append(f"Best PEFT strategy: {best_strategy} (mean accuracy: {best_strategy_score:.4f})")
    
    # Training efficiency
    if 'training_time' in successful_df.columns:
        efficiency = successful_df.groupby('experiment_type')['training_time'].mean().round(1)
        findings.append(f"Training time comparison: {dict(efficiency)}")
    
    # Dataset insights (best accuracy reached on each dataset)
    if has_accuracy:
        dataset_performance = successful_df.groupby('dataset')['next_activity_accuracy'].max().dropna().round(4)
        if not dataset_performance.empty:
            easiest_dataset = dataset_performance.idxmax()
            hardest_dataset = dataset_performance.idxmin()
            findings.append(f"Dataset difficulty: Easiest - {easiest_dataset} ({dataset_performance.max():.4f}), Hardest - {hardest_dataset} ({dataset_performance.min():.4f})")

else:
    findings.append("No successful experiments to analyze")

# Print findings
for i, finding in enumerate(findings, 1):
    print(f"{i}. {finding}")

# Save findings to file (explicit UTF-8: configuration and dataset names
# come from the result files and need not be ASCII)
findings_file = RESULTS_DIR / "key_findings.txt"
with open(findings_file, 'w', encoding='utf-8') as f:
    f.write("LLM-PEFT-PPM Replication Study - Key Findings\n")
    f.write("=" * 50 + "\n\n")
    for i, finding in enumerate(findings, 1):
        f.write(f"{i}. {finding}\n")

print(f"\n Key findings saved to: {findings_file}")

The replication result files are generated and saved in the `replication_results/` directory.
<!-- ## 8. Files Generated

This analysis notebook has generated the following files in `replication_results/`:

### 📊 Data Files:
- `detailed_results.csv` - Complete results with all metrics
- `next_activity_performance.csv` - Next activity prediction performance summary
- `remaining_time_performance.csv` - Remaining time prediction performance summary
- `replication_comparison.csv` - Comparison with original paper results
- `key_findings.txt` - Summary of key findings

### 📈 Visualizations:
- `accuracy_comparison_boxplot.png` - Performance comparison by method
- `peft_strategy_comparison.png` - PEFT strategy effectiveness
- `training_time_comparison.png` - Training efficiency comparison
- `performance_vs_runtime.png` - Performance vs computational cost
- `performance_heatmap.png` - Dataset-method performance matrix

### 🔬 Research Questions Addressed:

**RQ1: Do PEFT-adapted LLMs outperform traditional approaches?**
- Statistical comparison between RNN and LLM-PEFT methods
- Performance metrics across all datasets

**RQ2: Multi-task vs single-task learning effectiveness?**
- Analysis of multi-task configurations (requires additional experiment data)

**RQ3: Optimal PEFT strategy identification?**
- Comparison of LoRA vs layer freezing strategies
- Best performing configurations per dataset

### 📝 Next Steps:
1. Update original paper comparison values with actual results
2. Conduct deeper analysis on specific PEFT configurations
3. Analyze Traffic Fines dataset results (when available)
4. Create final replication report with conclusions

---

**Analysis Complete!** 🎉

Use these results to write your replication report and compare findings with the original Oyamada et al. paper. -->