In [None]:
%load_ext autoreload
%autoreload 2


In [None]:
import logging
import pandas as pd
import os
from itertools import product
from src.utils import prepare_df, linear_prob_combiner
from src.incremental_softmax_recovery import incremental_softmax_recovery
from src.evaluation import compute_sktr_vs_argmax_metrics


In [None]:
# Notebook logging setup: the root logger stays at INFO while the project's
# own loggers are opened up to DEBUG.
log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# force=True drops handlers left over from a previous run of this cell (or
# installed by Jupyter) so the format below actually takes effect.
logging.basicConfig(level=logging.INFO, format=log_format, force=True)

# Project loggers that should emit DEBUG records
our_modules = [
    'src.classes',
    'src.incremental_softmax_recovery',
    'src.utils',
    'src.conformance_checking',
    'src.data_processing',
    'src.petri_model',
    'src.calibration',
]
for module_name in our_modules:
    project_logger = logging.getLogger(module_name)
    project_logger.setLevel(logging.DEBUG)

# Chatty third-party loggers are capped at WARNING (harmless if the library
# is never imported: getLogger just creates the named logger).
for noisy_name in ('graphviz', 'matplotlib', 'PIL'):
    logging.getLogger(noisy_name).setLevel(logging.WARNING)

print("✅ Logging configured: DEBUG for our modules, INFO+ for third-party libraries")


In [None]:
# Load the '50salads' dataset through the project helper.
result = prepare_df('50salads')

# The first two returned items are the dataframe and the softmax list; any
# further items are not used in this notebook. Star-unpacking accepts every
# length >= 2, so no branching on len(result) is needed.
# The extras are bound to a named variable instead of `_`: assigning `_` in a
# notebook stops IPython from updating its "last output" shortcut.
df, softmax_lst, *_unused_outputs = result


In [None]:
# Hyperparameter grid search over the conditioning parameters.

# Output location for the CSV files written by this cell; created up front so
# later writes cannot fail on a missing directory.
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)

# One dict per completed run is collected here.
summary_rows = []

# Search space: every alpha value is combined with every named weight triple.
alphas = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
interpolation_strategies = {
    'unigram_heavy': [0.6, 0.25, 0.15],
    'balanced': [0.4, 0.35, 0.25],
    'trigram_heavy': [0.15, 0.25, 0.6],
}

# Base config shared by every grid-search run. Each run copies this dict,
# adds 'conditioning_alpha' and 'conditioning_interpolation_weights', and
# passes the result as keyword arguments to incremental_softmax_recovery.
# NOTE(review): the meaning of each key is defined by that function, which is
# not visible here — confirm against its signature before changing values.
base_config = {
    # === Data Splitting ===
    'n_train_traces': 10,
    'n_test_traces': 10,
    'train_cases': None,
    'test_cases': None,
    'ensure_train_variant_diversity': True,
    'ensure_test_variant_diversity': True,
    'use_same_traces_for_train_test': False,
    'compute_marking_transition_map': True,

    # === Sampling Configuration ===
    'sequential_sampling': True,
    'n_indices': None,
    'n_per_run': 10000,
    'independent_sampling': True,

    # === Conformance Parameters ===
    'prob_threshold': 1e-6,
    'chunk_size': 11,
    'conformance_switch_penalty_weight': 1.0,
    'merge_mismatched_boundaries': False,

    # === Conditioned Probability Adjustment ===
    # Callable imported from src.utils; passed through as-is.
    'conditioning_combine_fn': linear_prob_combiner,
    
    # === Multi-Label Conditioning ===
    # NOTE(review): both values are 3, matching the length of the weight
    # triples searched over — presumably one weight per history length; confirm.
    'max_hist_len': 3,
    'conditioning_n_prev_labels': 3,
    'use_collapsed_runs': True,
    
    # === Cost Function ===
    'cost_function': "linear",
    'model_move_cost': 1.0,
    'log_move_cost': 1.0,
    'tau_move_cost': 0.0,
    'non_sync_penalty': 1.0,

    # === Temperature Calibration ===
    'use_calibration': True,
    'temp_bounds': (1.0, 10.0),
    'temperature': None,

    # === Logging ===
    'verbose': True,
    'log_level': logging.INFO,

    # === Miscellaneous ===
    'round_precision': 2,
    # Same seed for every run of the grid.
    'random_seed': 101,
    # Same path for every run; NOTE(review): presumably the saved model is
    # overwritten on each run — confirm if per-run models are needed.
    'save_model_path': "./results/discovered_petri_net",
    'save_model': True,
}

# Size of the grid, computed once and reused in every progress message.
total_runs = len(alphas) * len(interpolation_strategies)

print(f"Starting hyperparameter grid search: {len(alphas)} alphas × {len(interpolation_strategies)} strategies = {total_runs} total runs")
print("=" * 80)

# Visit every (alpha, weight strategy) pair; alphas vary slowest.
grid = product(alphas, interpolation_strategies.items())
for run_idx, (alpha, (strategy_name, weights)) in enumerate(grid, start=1):
    print(f"\n[{run_idx}/{total_runs}] Running: alpha={alpha}, strategy={strategy_name}, weights={weights}")

    # Per-run settings: the shared base plus the two values being searched.
    run_config = {
        **base_config,
        'conditioning_alpha': alpha,
        'conditioning_interpolation_weights': weights,
    }

    # Run the recovery; it returns three items, of which only the results
    # frame is used below.
    output = incremental_softmax_recovery(df=df, softmax_lst=softmax_lst, **run_config)
    results_df, accuracy_dict, prob_dict = output

    # Persist this run's results; the metrics helper reads them back from disk.
    csv_filename = f'hyperparam_search_alpha_{alpha}_weights_{strategy_name}.csv'
    csv_path = os.path.join(results_dir, csv_filename)
    results_df.to_csv(csv_path, index=False)
    print(f"  ✓ Saved results to: {csv_path}")

    # Score SKTR against the argmax baseline on the saved predictions.
    metrics = compute_sktr_vs_argmax_metrics(
        csv_path,
        case_col='case:concept:name',
        sktr_pred_col='sktr_activity',
        argmax_pred_col='argmax_activity',
        gt_col='ground_truth',
        background=0,
    )
    sktr_scores = metrics['sktr']
    argmax_scores = metrics['argmax']

    # One flat row per run for the summary table.
    summary_rows.append({
        'conditioning_alpha': alpha,
        'interpolation_weights_strategy': strategy_name,
        'interpolation_weights': str(weights),  # stringified so the list fits in one CSV field
        'sktr_acc_micro': sktr_scores['acc_micro'],
        'sktr_edit': sktr_scores['edit'],
        'sktr_f1@10': sktr_scores['f1@10'],
        'sktr_f1@25': sktr_scores['f1@25'],
        'sktr_f1@50': sktr_scores['f1@50'],
        'argmax_acc_micro': argmax_scores['acc_micro'],
        'argmax_edit': argmax_scores['edit'],
        'argmax_f1@10': argmax_scores['f1@10'],
        'argmax_f1@25': argmax_scores['f1@25'],
        'argmax_f1@50': argmax_scores['f1@50'],
        'results_csv_path': csv_path,
    })

    print(f"  ✓ SKTR metrics: edit={sktr_scores['edit']:.2f}, f1@10={sktr_scores['f1@10']:.2f}")

# Turn the collected per-run rows into a single table.
summary_df = pd.DataFrame(summary_rows)

# Report header
banner = "=" * 80
print("\n" + banner)
print("HYPERPARAMETER SEARCH SUMMARY")
print(banner)

# Rank once by SKTR micro accuracy (best first); both views reuse the ranking.
print("\nTop 10 combinations by SKTR Accuracy (higher is better):")
ranked_summary = summary_df.sort_values('sktr_acc_micro', ascending=False)
display(ranked_summary.head(10))

print("\nFull Results Summary (sorted by SKTR Accuracy, descending):")
display(ranked_summary)

# The CSV keeps the rows in execution order (the unsorted frame is written).
summary_path = os.path.join(results_dir, 'hyperparameter_search_summary.csv')
summary_df.to_csv(summary_path, index=False)
print(f"\n✓ Summary saved to: {summary_path}")
print(f"✓ Total runs completed: {len(summary_rows)}")
