In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging
import pandas as pd
import os
from itertools import product
from joblib import Parallel, delayed
from src.utils import prepare_df, linear_prob_combiner
from src.incremental_softmax_recovery import incremental_softmax_recovery
from src.evaluation import compute_sktr_vs_argmax_metrics


In [None]:
# Logging policy for this notebook:
#   * root logger at INFO, so third-party libraries stay reasonably quiet
#   * project modules at DEBUG
#   * a few known-chatty libraries capped at WARNING
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,  # replace handlers left over from earlier cell executions
)

# Project loggers that should emit DEBUG records.
our_modules = [
    'src.classes',
    'src.incremental_softmax_recovery',
    'src.utils',
    'src.conformance_checking',
    'src.data_processing',
    'src.petri_model',
    'src.calibration',
]
for module_name in our_modules:
    logging.getLogger(module_name).setLevel(logging.DEBUG)

# Third-party loggers restricted to warnings and errors.
for noisy_library in ('graphviz', 'matplotlib', 'PIL'):
    logging.getLogger(noisy_library).setLevel(logging.WARNING)

print("✅ Logging configured: DEBUG for our modules, INFO+ for third-party libraries")


In [None]:
# Dataset to run the sweep on.
dataset_name = '50salads'

# prepare_df is handled here as returning either a 2-item or a 3-item result;
# only the first two items (dataframe and softmax list) are kept.
result = prepare_df(dataset_name)

if len(result) != 2:
    df, softmax_lst, _ = result
else:
    df, softmax_lst = result


In [None]:
# Experiment configuration and helpers
from pathlib import Path

# Values of `conditioning_alpha` explored by the grid.
alphas = [
    0.05,
    0.1, 0.2, 0.3,
    0.4, 0.5, 0.6,
    0.7, 0.8, 0.9,
]

# Named weight triples passed as `conditioning_interpolation_weights`.
interpolation_strategies = dict(
    unigram_super_heavy=[0.75, 0.15, 0.1],
    unigram_heavy=[0.6, 0.25, 0.15],
    balanced=[0.4, 0.35, 0.25],
    trigram_heavy=[0.15, 0.25, 0.6],
)

# Per-dataset list of `n_train_traces` values to sweep (keys are lowercase).
TRAIN_TRACE_SWEEP = {
    '50salads': [*range(1, 11)],
    'gtea': [*range(1, 8)],
}

# Parallelism knobs.
PARALLELIZATION_STRATEGY = 'dataset'  # 'trace', 'dataset', 'both', or None
MAX_WORKERS = None
HYPERPARAM_N_JOBS = 40

# When True, only the first grid run (idx == 1) is asked to save the model.
SAVE_PROCESS_MODEL_ONCE = True


def get_notebook_directory() -> Path:
    '''Return the directory of this notebook, falling back to the working directory.

    Strategies are tried in order and any failure (missing package, not
    running under IPython, ...) silently moves on to the next one:

    1. the parent directory of ``ipynbname.path()``;
    2. the result of the IPython ``%pwd`` line magic;
    3. ``Path.cwd()``.
    '''

    def _from_ipynbname():
        import ipynbname
        return Path(ipynbname.path()).parent.resolve()

    def _from_ipython_pwd():
        from IPython import get_ipython
        shell = get_ipython()
        if not shell:
            # No active IPython shell: let the caller fall through.
            return None
        return Path(shell.run_line_magic('pwd', '')).resolve()

    for locate in (_from_ipynbname, _from_ipython_pwd):
        try:
            located = locate()
        except Exception:
            # Best-effort lookup: ignore the failure and try the next strategy.
            continue
        if located is not None:
            return located
    return Path.cwd()


def resolve_results_dir(dataset: str) -> Path:
    '''Create (if needed) and return ``<notebook dir>/results/<dataset>``.

    Both the notebook directory and the results directory are printed so the
    output location is visible in the cell output.
    '''
    notebook_dir = get_notebook_directory()
    target_dir = notebook_dir.joinpath('results', dataset)
    # exist_ok=True keeps re-running the cell harmless.
    target_dir.mkdir(parents=True, exist_ok=True)
    print(f"📁 Notebook directory: {notebook_dir}")
    print(f"📁 Results directory: {target_dir}")
    return target_dir


def build_base_config(dataset: str, results_dir: Path) -> dict:
    '''Build the configuration dictionary shared by every run of the sweep.

    The returned dict is later copied per run and splatted as keyword
    arguments into ``incremental_softmax_recovery`` (after the bookkeeping
    key ``'dataset_name'`` is removed by the caller).

    Args:
        dataset: Dataset identifier; stored under ``'dataset_name'`` and used
            to name the saved model file.
        results_dir: Directory under which ``save_model_path`` is placed.

    Returns:
        A new dict with all run settings. ``'parallel_processing'`` and
        ``'dataset_parallelization'`` are derived from the module-level
        ``PARALLELIZATION_STRATEGY``; ``'max_workers'`` from ``MAX_WORKERS``.

    Notes:
        An unrecognised ``PARALLELIZATION_STRATEGY`` does not raise: a warning
        is printed and both parallel flags are set to ``False``.
    '''
    # Strategy -> (parallel_processing, dataset_parallelization).
    # NOTE(review): 'both' maps to the same flags as 'dataset' (trace-level
    # parallelism stays off) — confirm this is intentional.
    parallel_modes = {
        'trace': (True, False),
        'dataset': (False, True),
        'both': (False, True),
        None: (False, False),
    }
    if PARALLELIZATION_STRATEGY in parallel_modes:
        parallel_processing, dataset_parallelization = parallel_modes[PARALLELIZATION_STRATEGY]
        effective_strategy = PARALLELIZATION_STRATEGY or 'none'
    else:
        # A typo used to fall back silently while the status line below still
        # echoed the misspelled strategy as if it were active.
        print(
            f"⚠️ Unknown PARALLELIZATION_STRATEGY {PARALLELIZATION_STRATEGY!r}; "
            f"expected one of {list(parallel_modes)}. Falling back to no parallelization."
        )
        parallel_processing, dataset_parallelization = False, False
        effective_strategy = 'none'

    base = {
        'n_train_traces': 7,
        'n_test_traces': None,
        'train_cases': None,
        'test_cases': None,
        'ensure_train_variant_diversity': True,
        'ensure_test_variant_diversity': True,
        'use_same_traces_for_train_test': False,
        'allow_train_cases_in_test': True,
        'compute_marking_transition_map': True,
        'sequential_sampling': True,
        'n_indices': None,
        'n_per_run': 10000,
        'independent_sampling': True,
        'prob_threshold': 1e-6,
        'chunk_size': 11,
        'conformance_switch_penalty_weight': 1.0,
        'merge_mismatched_boundaries': False,
        'conditioning_combine_fn': linear_prob_combiner,
        'max_hist_len': 3,
        'conditioning_n_prev_labels': 3,
        'use_collapsed_runs': True,
        'cost_function': "linear",
        'model_move_cost': 1.0,
        'log_move_cost': 1.0,
        'tau_move_cost': 0.0,
        'non_sync_penalty': 1.0,
        'use_calibration': True,
        'temp_bounds': (1.0, 10.0),
        'temperature': None,
        'verbose': True,
        'log_level': logging.INFO,
        'round_precision': 2,
        'random_seed': 101,
        'save_model_path': str(results_dir / f'discovered_petri_net_{dataset}'),
        'save_model': True,
        'dataset_name': dataset,
        'parallel_processing': parallel_processing,
        'dataset_parallelization': dataset_parallelization,
        'max_workers': MAX_WORKERS,
    }
    print(
        f"🔧 Parallelization: {effective_strategy} "
        f"(max_workers={MAX_WORKERS or 'auto'})"
    )
    return base


# Output location and shared run configuration for the chosen dataset.
results_dir = resolve_results_dir(dataset_name)
base_config = build_base_config(dataset_name, results_dir)

# Sweep values are looked up by lowercase dataset name; a missing (or empty)
# entry falls back to the single default from the base configuration.
train_trace_values = TRAIN_TRACE_SWEEP.get(dataset_name.lower())
if train_trace_values:
    print(f"🧪 Training-trace sweep: {train_trace_values}")
else:
    train_trace_values = [base_config['n_train_traces']]
    print(
        f"⚠️ Dataset '{dataset_name}' not in TRAIN_TRACE_SWEEP; "
        f"using n_train_traces={train_trace_values[-1]}."
    )

# The base config carries the last sweep value; each run overrides it anyway.
base_config['n_train_traces'] = train_trace_values[-1]



In [None]:
# Run hyperparameter search

def run_single_hyperparameter(
    n_train_traces,
    alpha,
    strategy_name,
    weights,
    idx,
    total_runs,
    df,
    softmax_lst,
    base_config,
    results_dir,
):
    '''Execute one grid point and return its summary row.

    The function copies ``base_config``, overrides the swept settings, calls
    ``incremental_softmax_recovery``, writes the per-run results to a CSV in
    ``results_dir`` and computes SKTR-vs-argmax metrics from that CSV.

    Args:
        n_train_traces: Value stored under ``'n_train_traces'`` for this run.
        alpha: Value stored under ``'conditioning_alpha'``.
        strategy_name: Label of the weight strategy (used in file names and
            in the summary row).
        weights: Value stored under ``'conditioning_interpolation_weights'``.
        idx: 1-based position of this run in the grid; when
            ``SAVE_PROCESS_MODEL_ONCE`` is true only ``idx == 1`` saves the model.
        total_runs: Grid size, used only in the progress message.
        df: Passed through to ``incremental_softmax_recovery``.
        softmax_lst: Passed through to ``incremental_softmax_recovery``.
        base_config: Shared settings; not mutated.
        results_dir: Directory that receives the per-run CSV.

    Returns:
        dict with the run's hyperparameters, the flattened ``sktr_*`` and
        ``argmax_*`` metrics, and the path of the written CSV.
    '''
    dataset_name = base_config.get('dataset_name', 'unknown_dataset')
    print(
        f"[{idx}/{total_runs}] Running on {dataset_name}: "
        f"train_traces={n_train_traces}, alpha={alpha}, strategy={strategy_name}"
    )

    # Decide whether this particular run should persist the model.
    if SAVE_PROCESS_MODEL_ONCE:
        save_model_this_run = (idx == 1)
    else:
        save_model_this_run = base_config.get('save_model', False)

    # Per-run copy of the shared settings with the swept values applied.
    run_config = {
        **base_config,
        'conditioning_alpha': alpha,
        'conditioning_interpolation_weights': weights,
        'n_train_traces': n_train_traces,
        'save_model': save_model_this_run,
        'save_model_path': base_config['save_model_path'],
    }
    # 'dataset_name' is notebook bookkeeping and is not forwarded.
    run_config.pop('dataset_name', None)

    # Only the results frame is used here; the other three outputs are ignored.
    results_df, _accuracy, _probs_uncollapsed, _probs_collapsed = incremental_softmax_recovery(
        df=df,
        softmax_lst=softmax_lst,
        **run_config,
    )

    csv_filename = (
        f"{dataset_name}_train_{n_train_traces}_hyperparam_search_alpha_{alpha}_"
        f"weights_{strategy_name}.csv"
    )
    csv_path = os.path.join(results_dir, csv_filename)
    results_df.to_csv(csv_path, index=False)

    # Metrics are computed from the file that was just written.
    metrics = compute_sktr_vs_argmax_metrics(
        csv_path,
        case_col='case:concept:name',
        sktr_pred_col='sktr_activity',
        argmax_pred_col='argmax_activity',
        gt_col='ground_truth',
        background=0,
    )

    summary_row = {
        'dataset_name': dataset_name,
        'n_train_traces': n_train_traces,
        'conditioning_alpha': alpha,
        'interpolation_weights_strategy': strategy_name,
        'interpolation_weights': str(weights),
    }
    # Flatten metrics[method][metric] into '<method>_<metric>' columns.
    for method in ('sktr', 'argmax'):
        for metric_name in ('acc_micro', 'edit', 'f1@10', 'f1@25', 'f1@50'):
            summary_row[f'{method}_{metric_name}'] = metrics[method][metric_name]
    summary_row['results_csv_path'] = csv_path
    return summary_row


# Enumerate every (n_train_traces, alpha, (strategy_name, weights)) combination,
# with train-trace settings varying slowest and strategies fastest.
param_combinations = [
    (trace_count, alpha_value, strategy_item)
    for trace_count in train_trace_values
    for alpha_value in alphas
    for strategy_item in interpolation_strategies.items()
]
total_runs = len(param_combinations)

print(
    f"Starting hyperparameter grid: {len(train_trace_values)} train-trace settings × "
    f"{len(alphas)} alphas × {len(interpolation_strategies)} strategies = {total_runs} runs"
)
print(f"Hyperparameter-level parallelization: joblib (n_jobs={HYPERPARAM_N_JOBS})")
print(f"Run-level parallelization: {PARALLELIZATION_STRATEGY or 'none'}")
print("=" * 80)

# One joblib task per grid point; idx is 1-based so the first run has idx == 1.
summary_rows = Parallel(n_jobs=HYPERPARAM_N_JOBS, verbose=10)(
    delayed(run_single_hyperparameter)(
        n_train_traces=n_train_traces,
        alpha=alpha,
        strategy_name=strategy_name,
        weights=weights,
        idx=idx,
        total_runs=total_runs,
        df=df,
        softmax_lst=softmax_lst,
        base_config=base_config,
        results_dir=results_dir,
    )
    for idx, (n_train_traces, alpha, (strategy_name, weights)) in enumerate(
        param_combinations, start=1
    )
)

# One row per grid run.
summary_df = pd.DataFrame(summary_rows)

# Report: header, top-10 table, then the full ranking.
print()
print("=" * 80)
print("HYPERPARAMETER SEARCH SUMMARY")
print("=" * 80)
print()
print("Top 10 combinations by SKTR Accuracy:")
# Sort once (best SKTR accuracy first) and reuse the result for both tables.
ranked_summary = summary_df.sort_values('sktr_acc_micro', ascending=False)
display(ranked_summary.head(10))
print()
print("Full results (sorted by SKTR Accuracy):")
display(ranked_summary)

# Persist the summary (in run order, not ranked) next to the per-run CSVs.
summary_filename = f"{base_config['dataset_name']}_hyperparameter_search_summary.csv"
summary_path = os.path.join(results_dir, summary_filename)
summary_df.to_csv(summary_path, index=False)
print()
print(f"✓ Summary saved to: {summary_path}")
print(f"✓ Total runs completed: {len(summary_df)}")
