In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project's src package) are re-imported before each cell runs;
# edits to those modules then take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import pandas as pd
import os
from itertools import product
from joblib import Parallel, delayed
from src.utils import prepare_df, linear_prob_combiner
from src.incremental_softmax_recovery import incremental_softmax_recovery
from src.evaluation import compute_sktr_vs_argmax_metrics


In [None]:
# Configure logging: INFO at the root, DEBUG only for this project's modules.
logging.basicConfig(
    level=logging.INFO,  # root level; keeps third-party output down
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,  # drop handlers already attached to the root logger (e.g. from a previous run of this cell)
)

# Project loggers that should emit DEBUG records.
our_modules = [
    'src.classes',
    'src.incremental_softmax_recovery',
    'src.utils',
    'src.conformance_checking',
    'src.data_processing',
    'src.petri_model',
    'src.calibration',
]

for module_name in our_modules:
    logging.getLogger(module_name).setLevel(logging.DEBUG)

# Third-party loggers that are capped at WARNING (harmless if the library is absent).
for noisy_logger in ('graphviz', 'matplotlib', 'PIL'):
    logging.getLogger(noisy_logger).setLevel(logging.WARNING)

# The status glyph was previously mojibake (UTF-8 bytes decoded as Mac Roman).
print("✅ Logging configured: DEBUG for our modules, INFO+ for third-party libraries")


In [None]:
# Load data
# The result is handled as either a 2-tuple (df, softmax_lst) or a 3-tuple whose
# last element is not needed here.
result = prepare_df('50salads')

if len(result) == 2:
    df, softmax_lst = result
else:
    # Bind the unused third element to a named throwaway instead of `_`:
    # assigning `_` in an IPython session stops IPython from updating its
    # "last output" variables (`_`, `__`, `___`) for the rest of the session.
    df, softmax_lst, _unused_extra = result


In [None]:
# Hyperparameter Grid Search for Conditioning Parameters (Parallelized)
# Search grid: every alpha is combined with every interpolation-weight preset.
alphas = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
interpolation_strategies = {
    'unigram_heavy': [0.6, 0.25, 0.15],
    'balanced': [0.4, 0.35, 0.25],
    'trigram_heavy': [0.15, 0.25, 0.6]
}

# ============================================================================
# PARALLELIZATION CONFIGURATION
# ============================================================================
# Choose the parallelization used *inside* each hyperparameter run:
#   - None: Sequential processing (no parallelization within each run)
#   - 'trace': Trace-level parallelization (parallel_processing=True)
#              Each test trace processed independently in parallel
#   - 'dataset': Dataset-level parallelization (dataset_parallelization=True)
#                Test dataset split into chunks, processed in parallel
#                Model discovered once and shared across workers
#   - 'both': Described as hyperparameter-level (joblib) AND dataset-level
#             parallelization.
#
# NOTE: the grid search further down always dispatches the hyperparameter
# combinations through joblib, whatever value is chosen here, and the flag
# setup below gives 'dataset' and 'both' identical settings. In this notebook
# the two options therefore behave the same.
#
# Recommendation:
#   - 'dataset': Best for large test sets where each trace is relatively fast
#   - 'trace': Best when traces are long/complex and benefit from independent processing
#   - None: Best for debugging or when overhead outweighs benefits
# ============================================================================
PARALLELIZATION_STRATEGY = 'dataset'  # Options: None, 'trace', 'dataset', 'both'
MAX_WORKERS = None  # None = auto-detect (use all CPUs), or specify number like 4

# The status glyphs were previously mojibake (UTF-8 bytes decoded as Mac Roman).
print(f"🔧 Parallelization strategy: {PARALLELIZATION_STRATEGY}")
print(f"🔧 Max workers: {MAX_WORKERS if MAX_WORKERS else 'auto (all CPUs)'}")

# Results storage - ensure it's created next to the notebook file
# Get notebook's directory (works in both local and server Jupyter environments)
# This ensures results are always saved next to the notebook file, regardless of working directory

def get_notebook_directory():
    """
    Return the directory assumed to contain the current notebook file.

    Three strategies are tried in order; the first that succeeds wins:

    1. ``ipynbname.path()`` (optional third-party package).
    2. IPython's ``%pwd`` magic, accepted only if that directory contains
       at least one ``*.ipynb`` file.
    3. ``os.getcwd()`` as an unverified last resort.

    Failures of the first two strategies never propagate. They are logged at
    DEBUG level instead of being silently discarded, and the unverified
    fallback is logged at WARNING level.

    Returns:
        str: Directory path (not necessarily normalised; callers may apply
        ``os.path.abspath``).
    """
    log = logging.getLogger(__name__)

    # Method 1: ipynbname (requires installation and a running notebook server).
    try:
        import ipynbname
        return os.path.dirname(os.path.abspath(ipynbname.path()))
    except Exception as exc:
        # Covers both "package not installed" (ImportError) and lookup failures.
        log.debug("ipynbname lookup unavailable (%s: %s)", type(exc).__name__, exc)

    # Method 2: IPython's %pwd magic, sanity-checked for notebook files.
    try:
        import glob
        from IPython import get_ipython

        ipython = get_ipython()
        if ipython:
            notebook_dir = ipython.run_line_magic('pwd', '')
            if glob.glob(os.path.join(notebook_dir, '*.ipynb')):
                return notebook_dir
    except Exception as exc:
        log.debug("IPython %%pwd lookup failed (%s: %s)", type(exc).__name__, exc)

    # Method 3: current working directory, with no evidence that it is the
    # notebook's directory.
    fallback_dir = os.getcwd()
    log.warning(
        "Could not verify the notebook directory; falling back to the current "
        "working directory: %s",
        fallback_dir,
    )
    return fallback_dir

# Resolve the notebook directory as an absolute path.
notebook_dir = os.path.abspath(get_notebook_directory())

# Create (if missing) a 'results' folder inside that directory; all per-run
# CSVs, saved models and the summary are written below it.
results_dir = os.path.join(notebook_dir, 'results')
os.makedirs(results_dir, exist_ok=True)
results_abs_path = os.path.abspath(results_dir)

# Report the resolved locations. The status glyphs were previously mojibake
# (UTF-8 bytes decoded as Mac Roman).
print(f"📁 Notebook directory: {notebook_dir}")
print(f"📁 Results folder location: {results_abs_path}")
print(f"📁 Results folder exists: {os.path.exists(results_dir)}")
print(f"✅ Results will ALWAYS be saved in: {results_abs_path}")
print("   (This folder is next to the notebook file, regardless of where you run the code)")
print("💡 For maximum reliability, install: pip install ipynbname")

# Shared keyword arguments for every run. Each grid-search run copies this
# dict and overrides the conditioning hyperparameters and the model save path.
base_config = {
    # --- Data splitting ---
    'n_train_traces': 10,
    'n_test_traces': None,  # None = use all available cases (including training cases if allow_train_cases_in_test=True)
    'train_cases': None,
    'test_cases': None,
    'ensure_train_variant_diversity': True,
    'ensure_test_variant_diversity': True,
    'use_same_traces_for_train_test': False,
    'allow_train_cases_in_test': True,  # True = training cases may also appear in the test set
    'compute_marking_transition_map': True,

    # --- Sampling ---
    'sequential_sampling': True,
    'n_indices': None,
    'n_per_run': 10000,
    'independent_sampling': True,

    # --- Conformance ---
    'prob_threshold': 1e-6,
    'chunk_size': 11,
    'conformance_switch_penalty_weight': 1.0,
    'merge_mismatched_boundaries': False,

    # --- Conditioned probability adjustment ---
    'conditioning_combine_fn': linear_prob_combiner,

    # --- Multi-label conditioning ---
    'max_hist_len': 3,
    'conditioning_n_prev_labels': 3,
    'use_collapsed_runs': True,

    # --- Cost function ---
    'cost_function': "linear",
    'model_move_cost': 1.0,
    'log_move_cost': 1.0,
    'tau_move_cost': 0.0,
    'non_sync_penalty': 1.0,

    # --- Temperature calibration ---
    'use_calibration': True,
    'temp_bounds': (1.0, 10.0),
    'temperature': None,

    # --- Logging ---
    'verbose': True,
    'log_level': logging.INFO,

    # --- Miscellaneous ---
    'round_precision': 2,
    'random_seed': 101,
    # Absolute default save path under the results folder; each grid-search
    # run replaces it with a run-specific path.
    'save_model_path': os.path.join(results_dir, 'discovered_petri_net'),
    'save_model': True,

    # --- Parallelization (both flags are set from PARALLELIZATION_STRATEGY below) ---
    'parallel_processing': False,
    'dataset_parallelization': False,
    'max_workers': MAX_WORKERS,
}

# Translate PARALLELIZATION_STRATEGY into the two flags stored in base_config.
# Each entry is (parallel_processing, dataset_parallelization). 'dataset' and
# 'both' map to the same pair, exactly as in the earlier if/elif chain.
_PARALLEL_FLAGS_BY_STRATEGY = {
    None: (False, False),
    'trace': (True, False),
    'dataset': (False, True),
    'both': (False, True),  # dataset-level is used for the internal parallelization
}

try:
    _parallel_flags = _PARALLEL_FLAGS_BY_STRATEGY[PARALLELIZATION_STRATEGY]
except (KeyError, TypeError):
    # KeyError: unknown value; TypeError: unhashable value. Both are reported
    # as the same ValueError so the failure mode stays uniform.
    raise ValueError(f"Invalid PARALLELIZATION_STRATEGY: {PARALLELIZATION_STRATEGY}. Must be None, 'trace', 'dataset', or 'both'") from None

base_config['parallel_processing'], base_config['dataset_parallelization'] = _parallel_flags

# The check mark was previously mojibake (UTF-8 bytes decoded as Mac Roman).
print(f"✓ Configured parallelization: parallel_processing={base_config['parallel_processing']}, dataset_parallelization={base_config['dataset_parallelization']}")

def run_single_hyperparameter(alpha, strategy_name, weights, idx, total_runs, df, softmax_lst, base_config, results_dir):
    """Run one hyperparameter combination and return its summary row.

    The run config is a copy of ``base_config`` with the conditioning
    hyperparameters and a run-specific ``save_model_path`` applied. The
    caller's ``base_config`` is not modified. The recovery output is written
    to a per-run CSV, which is then passed to the metrics function.

    Args:
        alpha: Value stored under ``conditioning_alpha`` for this run.
        strategy_name: Label of the interpolation-weight preset; used in the
            output file names and in the summary row.
        weights: Value stored under ``conditioning_interpolation_weights``.
        idx: Position of this run, used only in progress messages.
        total_runs: Total number of runs, used only in progress messages.
        df: Forwarded unchanged to ``incremental_softmax_recovery``.
        softmax_lst: Forwarded unchanged to ``incremental_softmax_recovery``.
        base_config: Shared keyword arguments for the recovery call.
        results_dir: Directory receiving the per-run CSV and the model path.

    Returns:
        dict: The hyperparameters, the SKTR and argmax metrics
        (``acc_micro``, ``edit``, ``f1@10``, ``f1@25``, ``f1@50``), and the
        path of the per-run CSV.
    """
    progress = f"[{idx}/{total_runs}]"
    print(f"{progress} Running: alpha={alpha}, strategy={strategy_name}, weights={weights}")

    # Per-run configuration. The save path is unique per combination so that
    # concurrent runs do not overwrite each other's model files.
    run_config = {
        **base_config,
        'conditioning_alpha': alpha,
        'conditioning_interpolation_weights': weights,
        'save_model_path': os.path.join(results_dir, f'discovered_petri_net_alpha_{alpha}_weights_{strategy_name}'),
    }

    # Only the results frame is used; the other two outputs are discarded.
    results_df, _accuracy_dict, _prob_dict = incremental_softmax_recovery(
        df=df,
        softmax_lst=softmax_lst,
        **run_config
    )

    # Persist the per-run results; the metrics function reads them back from disk.
    csv_path = os.path.join(results_dir, f'hyperparam_search_alpha_{alpha}_weights_{strategy_name}.csv')
    results_df.to_csv(csv_path, index=False)
    # The check mark was previously mojibake (UTF-8 bytes decoded as Mac Roman).
    print(f"  {progress} ✓ Saved results to: {csv_path}")

    metrics = compute_sktr_vs_argmax_metrics(
        csv_path,
        case_col='case:concept:name',
        sktr_pred_col='sktr_activity',
        argmax_pred_col='argmax_activity',
        gt_col='ground_truth',
        background=0
    )

    summary_row = {
        'conditioning_alpha': alpha,
        'interpolation_weights_strategy': strategy_name,
        'interpolation_weights': str(weights),  # stringified for CSV compatibility
    }
    # Flatten metrics[method][metric] into '<method>_<metric>' columns,
    # all SKTR columns first, then all argmax columns.
    for method in ('sktr', 'argmax'):
        for metric_name in ('acc_micro', 'edit', 'f1@10', 'f1@25', 'f1@50'):
            summary_row[f'{method}_{metric_name}'] = metrics[method][metric_name]
    summary_row['results_csv_path'] = csv_path

    print(f"  {progress} ✓ SKTR metrics: edit={metrics['sktr']['edit']:.2f}, f1@10={metrics['sktr']['f1@10']:.2f}")
    return summary_row

# Build every (alpha, (strategy_name, weights)) combination of the grid.
param_combinations = list(product(alphas, interpolation_strategies.items()))
total_runs = len(param_combinations)

# The multiplication sign was previously mojibake (UTF-8 bytes decoded as Mac Roman).
print(f"Starting hyperparameter grid search: {len(alphas)} alphas × {len(interpolation_strategies)} strategies = {total_runs} total runs")
print("Hyperparameter-level parallelization: joblib (across hyperparameter combinations)")
if PARALLELIZATION_STRATEGY:
    print(f"Run-level parallelization: {PARALLELIZATION_STRATEGY} (within each hyperparameter run)")
else:
    print("Run-level parallelization: None (sequential)")
print("=" * 80)

# Run the hyperparameter combinations in parallel with joblib. This outer level
# is always active; the run-level flags in base_config add an inner level:
#   - Outer: joblib parallelizes across hyperparameter combinations
#   - Inner: whatever base_config enables within each run
# NOTE(review): with n_jobs=-1 here and max_workers=None inside each run, the
# two levels may oversubscribe the CPUs. Confirm this is acceptable on the
# target machine, or cap one of the two.
n_jobs = -1  # Use all available CPUs for hyperparameter-level parallelization
summary_rows = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(run_single_hyperparameter)(
        alpha, strategy_name, weights, idx, total_runs, df, softmax_lst, base_config, results_dir
    )
    for idx, (alpha, (strategy_name, weights)) in enumerate(param_combinations, 1)
)

# One summary row per run, in submission order.
summary_df = pd.DataFrame(summary_rows)

# Sort once (higher SKTR accuracy first) and reuse the result for both views.
summary_sorted = summary_df.sort_values('sktr_acc_micro', ascending=False)

print("\n" + "=" * 80)
print("HYPERPARAMETER SEARCH SUMMARY")
print("=" * 80)
print("\nTop 10 combinations by SKTR Accuracy (higher is better):")
display(summary_sorted.head(10))

print("\nFull Results Summary (sorted by SKTR Accuracy, descending):")
display(summary_sorted)

# Save the summary (unsorted, i.e. in run order) next to the per-run CSVs.
summary_path = os.path.join(results_dir, 'hyperparameter_search_summary.csv')
summary_df.to_csv(summary_path, index=False)
print(f"\n✓ Summary saved to: {summary_path}")
print(f"✓ Total runs completed: {len(summary_rows)}")
