# Unified Variant Experiment

This notebook runs experiments on video activity segmentation datasets by controlling which trace variants are used for training and testing.

**Supported Datasets:** 50salads, gtea, breakfast

**Model Sources:** ASFormer, MS-TCN2, Original (pickle files)

## Notebook Structure

1. **Setup & Data Loading** - Load dataset and analyze variants
2. **Hyperparameter Search** - Fixed train/test split, sweep over alpha and interpolation strategies
3. **Final Experiment** - Fixed hyperparameters, sweep over number of training variants

## 1. Setup

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import os
import math
import time
import logging
from pathlib import Path
from typing import List

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import Parallel, delayed

# Setup workspace path.
# The checkout location is machine-specific, so it can be overridden through the
# SKTR_WORKSPACE_ROOT environment variable; the previous hard-coded path remains the
# default, so existing setups keep working unchanged.
workspace_root = os.environ.get('SKTR_WORKSPACE_ROOT') or '/home/dsi/eli-bogdanov/sktr_for_long_traces'
# Put the workspace at the front of sys.path so the `src.*` imports below can be resolved.
if workspace_root not in sys.path:
    sys.path.insert(0, workspace_root)

from src.utils import (
    prepare_df, prepare_df_from_model, linear_prob_combiner,
    get_variant_info, get_cases_for_variants
)
from src.incremental_softmax_recovery import incremental_softmax_recovery
from src.evaluation import compute_sktr_vs_argmax_metrics

# Configure logging: INFO at the root, DEBUG for the project's own modules and
# WARNING for noisy third-party libraries.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)
for logger_level, logger_names in (
    (logging.DEBUG, ('src.classes', 'src.incremental_softmax_recovery', 'src.utils',
                     'src.conformance_checking', 'src.data_processing', 'src.petri_model',
                     'src.calibration')),
    (logging.WARNING, ('graphviz', 'matplotlib', 'PIL')),
):
    for logger_name in logger_names:
        logging.getLogger(logger_name).setLevel(logger_level)

# Report the active workspace and whether the kernel runs inside a tmux session
# (detected by the presence of the TMUX environment variable).
print(f"Workspace: {workspace_root}")
print(f"Running in tmux: {'TMUX' in os.environ}")

## 2. Configuration

**Edit this cell to select your dataset and model source.**

In [None]:
# =============================================================================
# CONFIGURATION - Edit these values
# =============================================================================

# --- Dataset selection: '50salads', 'gtea' or 'breakfast' ---
DATASET_NAME = '50salads'

# --- Model source: 'asformer', 'mstcn2' or 'original' (pickle files) ---
MODEL_SOURCE = 'asformer'

# --- General ---
RANDOM_SEED = 42

# --- Parallelization ---
N_PARALLEL_RUNS = 1             # joblib n_jobs for the experiment loops (1 = sequential)
DATASET_PARALLELIZATION = True  # forwarded to the recovery call as 'dataset_parallelization'
N_DATASET_WORKERS = 10          # forwarded to the recovery call as 'max_workers'

# --- Output ---
SAVE_PROCESS_MODELS = True      # forwarded to the recovery call as 'save_model'

# Echo the chosen settings so they are recorded in the cell output.
print("\n".join([
    "Configuration:",
    f"  Dataset: {DATASET_NAME}",
    f"  Model Source: {MODEL_SOURCE}",
    f"  Random Seed: {RANDOM_SEED}",
    f"  Parallelization: {N_DATASET_WORKERS} workers",
]))

## 3. Load Dataset

In [None]:
def load_dataset(dataset_name: str, model_source: str):
    """Load the event DataFrame and the softmax list for one dataset.

    Args:
        dataset_name: Dataset identifier forwarded to the loader.
        model_source: ``'original'`` loads through ``prepare_df``; any other value
            is forwarded to ``prepare_df_from_model`` together with the dataset name.

    Returns:
        Tuple ``(df, softmax_lst)``.
    """
    print(f"Loading {dataset_name} from {model_source}...")

    if model_source != 'original':
        df, softmax_lst = prepare_df_from_model(dataset_name, model_source)
    else:
        # Only the first two items of prepare_df's result are used here.
        df, softmax_lst = prepare_df(dataset_name)[:2]

    n_cases, n_events = len(softmax_lst), len(df)
    print(f"  Loaded {n_cases} cases, {n_events} events")
    return df, softmax_lst

df, softmax_lst = load_dataset(DATASET_NAME, MODEL_SOURCE)

# Summarize the loaded event log: row count, distinct cases and distinct activity labels.
print(
    f"\nDataset: {DATASET_NAME}\n"
    f"  Events: {len(df):,}\n"
    f"  Cases: {df['case:concept:name'].nunique()}\n"
    f"  Activities: {df['concept:name'].nunique()}"
)

## 4. Analyze Variants

In [None]:
# Collect variant information for the loaded event log; one row per unique variant.
variant_df = get_variant_info(df)
n_unique_variants = len(variant_df)
print(f"Total unique variants in dataset: {n_unique_variants}")

# Show the first 10 rows of the variant table.
print(f"\nTop 10 most frequent variants:")
display(variant_df.loc[:, ['variant_id', 'frequency', 'trace_length']].head(10))

# All outputs of this notebook go below <workspace>/results/<dataset>/variant_experiment/<model>.
results_dir = Path(workspace_root).joinpath('results', DATASET_NAME, 'variant_experiment', MODEL_SOURCE)
results_dir.mkdir(parents=True, exist_ok=True)
print(f"\nResults directory: {results_dir}")

## 5. Helper Functions

In [None]:
def build_base_config() -> dict:
    """Return the default keyword arguments shared by every experiment.

    Each call builds a fresh dict. ``run_single_experiment`` copies it and overrides
    the per-experiment entries before calling ``incremental_softmax_recovery``.
    """
    return dict(
        # Train/test selection
        n_train_traces=None,
        n_test_traces=None,
        train_cases=None,
        test_cases=None,
        ensure_train_variant_diversity=True,
        ensure_test_variant_diversity=True,
        use_same_traces_for_train_test=False,
        allow_train_cases_in_test=True,
        # Sampling
        compute_marking_transition_map=False,
        sequential_sampling=True,
        n_indices=None,
        n_per_run=10000,
        independent_sampling=True,
        # Recovery / conformance settings
        prob_threshold=1e-6,
        chunk_size=11,
        conformance_switch_penalty_weight=1.0,
        merge_mismatched_boundaries=False,
        conditioning_combine_fn=linear_prob_combiner,
        max_hist_len=3,
        conditioning_n_prev_labels=3,
        use_collapsed_runs=True,
        # Move costs
        cost_function='linear',
        model_move_cost=1.0,
        log_move_cost=1.0,
        tau_move_cost=0.0,
        non_sync_penalty=1.0,
        # Calibration
        use_calibration=True,
        temp_bounds=(1.0, 10.0),
        temperature=None,
        # Reporting
        verbose=True,
        log_level=logging.INFO,
        round_precision=2,
        random_seed=RANDOM_SEED,
        # Model saving (overridden per experiment)
        save_model_path=None,
        save_model=False,
        # Parallelism
        parallel_processing=False,
        dataset_parallelization=DATASET_PARALLELIZATION,
        max_workers=N_DATASET_WORKERS,
        # Conditioning state
        conditioning_state_mode='topm',
        conditioning_top_m=3,
        # Candidate selection
        candidate_top_p=0.9,
        candidate_top_k=15,
        candidate_min_k=1,
        candidate_source='observed',
        candidate_apply_to_sync=True,
    )


def run_single_experiment(n_train_traces, test_cases, alpha, strategy, weights,
                          idx, total, df, softmax_lst, base_cfg, results_dir, prefix):
    """Run one recovery experiment, save its results as CSV, and return summary metrics.

    Args:
        n_train_traces: Forwarded to the recovery call as ``n_train_traces``.
        test_cases: Explicit test cases, or ``None``/empty to leave the test set
            unrestricted (reported as ``'all'`` in the returned row).
        alpha: Forwarded as ``conditioning_alpha``.
        strategy: Name of the interpolation strategy; used for progress output, the
            CSV file name and the returned row only.
        weights: Forwarded as ``conditioning_interpolation_weights``.
        idx: 1-based position of this experiment (progress output only).
        total: Total number of experiments (progress output only).
        df: Event DataFrame forwarded to ``incremental_softmax_recovery``.
        softmax_lst: Softmax outputs forwarded to ``incremental_softmax_recovery``.
        base_cfg: Base keyword arguments; a copy is updated, the dict itself is not modified.
        results_dir: ``pathlib.Path`` directory that receives the CSV and the saved model.
        prefix: Tag embedded in the CSV and model file names.

    Returns:
        dict with the experiment parameters, the SKTR and argmax scores
        (``acc``, ``edit``, ``f1@10``, ``f1@25``, ``f1@50``) and timing information
        (``total_time_sec``, ``avg_time_per_trace_sec``).
    """
    print(f"[{idx}/{total}] n_train_traces={n_train_traces}, alpha={alpha}, strategy={strategy}")

    # Make sure the output location exists even if the caller did not create it.
    results_dir.mkdir(parents=True, exist_ok=True)

    cfg = base_cfg.copy()
    cfg.update({
        'conditioning_alpha': alpha,
        'conditioning_interpolation_weights': weights,
        'n_train_traces': n_train_traces,
        'test_cases': test_cases,
        'n_test_traces': len(test_cases) if test_cases else None,
        'save_model': SAVE_PROCESS_MODELS,
        # NOTE(review): the model path depends only on `prefix`, so experiments that share
        # a prefix (e.g. the whole hyperparameter search) write to the same path - confirm
        # this is intended, especially when experiments run in parallel.
        'save_model_path': str(results_dir / f'petri_net_{prefix}'),
    })

    # Time the recovery process with a monotonic clock: time.time() follows the system
    # wall clock, which can be adjusted mid-run and distort the measured duration.
    start_time = time.perf_counter()
    results_df, _, _ = incremental_softmax_recovery(df=df, softmax_lst=softmax_lst, **cfg)
    total_time = time.perf_counter() - start_time

    n_test = results_df['case:concept:name'].nunique()
    avg_time_per_trace = total_time / n_test if n_test > 0 else 0

    # The metrics helper reads its input from disk, so the CSV is written first.
    csv_path = results_dir / f"{DATASET_NAME}_{MODEL_SOURCE}_{prefix}_alpha_{alpha}_weights_{strategy}.csv"
    results_df.to_csv(csv_path, index=False)

    metrics = compute_sktr_vs_argmax_metrics(
        str(csv_path),
        case_col='case:concept:name',
        sktr_pred_col='sktr_activity',
        argmax_pred_col='argmax_activity',
        gt_col='ground_truth',
        background=None,
        dataset_name=DATASET_NAME,
    )

    print(f"  -> Time: {total_time:.1f}s total, {avg_time_per_trace:.2f}s/trace ({n_test} traces)")

    row = {
        'n_train_traces': n_train_traces,
        'n_test_cases': len(test_cases) if test_cases else 'all',
        'alpha': alpha,
        'strategy': strategy,
    }
    # Same column order as before: all SKTR scores first, then all argmax scores.
    for method in ('sktr', 'argmax'):
        scores = metrics[method]
        row[f'{method}_acc'] = scores['acc_micro']
        for score_name in ('edit', 'f1@10', 'f1@25', 'f1@50'):
            row[f'{method}_{score_name}'] = scores[score_name]
    row['total_time_sec'] = round(total_time, 2)
    row['avg_time_per_trace_sec'] = round(avg_time_per_trace, 3)
    return row

# Build the shared defaults once; this dict is passed as base_cfg to every experiment below.
base_config = build_base_config()
print("Base config ready.")

---

# Part A: Hyperparameter Search

Use all unique variants for training and sweep over alpha and the interpolation strategies.

In [None]:
# Hyperparameter search configuration
HP_N_TRAIN_TRACES = n_unique_variants  # Use all unique variants

# Values swept for 'conditioning_alpha'.
HP_ALPHAS = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95]
# Named weight triples swept for 'conditioning_interpolation_weights'.
HP_STRATEGIES = dict(
    unigram_super_heavy=[0.75, 0.15, 0.1],
    balanced=[0.33, 0.33, 0.34],
    bigram_heavy=[0.2, 0.6, 0.2],
    trigram_heavy=[0.1, 0.15, 0.75],
)

print("\n".join([
    "Hyperparameter Search Setup:",
    f"  Training traces (unique variants): {HP_N_TRAIN_TRACES}",
    "  Test cases: all",
    f"  Alphas: {HP_ALPHAS}",
    f"  Strategies: {list(HP_STRATEGIES.keys())}",
    f"  Total experiments: {len(HP_ALPHAS) * len(HP_STRATEGIES)}",
]))

In [None]:
# Run hyperparameter search
hp_results_dir = results_dir / 'hyperparameter_search'
hp_results_dir.mkdir(parents=True, exist_ok=True)

# One (n_train_traces, test_cases, alpha, strategy, weights) tuple per alpha x strategy pair;
# test_cases=None leaves the test set unrestricted.
hp_params = [
    (HP_N_TRAIN_TRACES, None, alpha_value, strategy_name, strategy_weights)
    for alpha_value in HP_ALPHAS
    for strategy_name, strategy_weights in HP_STRATEGIES.items()
]

# None means "use all cores" in joblib terms (-1).
n_jobs = -1 if N_PARALLEL_RUNS is None else N_PARALLEL_RUNS

print(f"Running {len(hp_params)} hyperparameter experiments...")
print("=" * 60)

hp_results = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(run_single_experiment)(
        *params,
        position, len(hp_params), df, softmax_lst, base_config, hp_results_dir, "hp_search"
    )
    for position, params in enumerate(hp_params, start=1)
)

# Best configuration first, so later cells can take row 0 as the winner.
hp_summary_df = pd.DataFrame(hp_results).sort_values('sktr_acc', ascending=False)
hp_summary_path = hp_results_dir / f"{DATASET_NAME}_{MODEL_SOURCE}_hp_search_summary.csv"
hp_summary_df.to_csv(hp_summary_path, index=False)
print(f"\nSaved: {hp_summary_path}")

### Hyperparameter Search Results

In [None]:
# Display hyperparameter search results
print("Hyperparameter Search Results (sorted by SKTR accuracy):\n")
display(hp_summary_df.loc[:, [
    'alpha', 'strategy',
    'sktr_acc', 'argmax_acc',
    'sktr_edit', 'argmax_edit',
    'sktr_f1@25', 'argmax_f1@25',
]])

# Best hyperparameters: the summary is sorted by SKTR accuracy (descending), so row 0 wins.
best_hp = hp_summary_df.iloc[0]
print(
    f"\nBest hyperparameters:\n"
    f"  Alpha: {best_hp['alpha']}\n"
    f"  Strategy: {best_hp['strategy']}\n"
    f"  SKTR Accuracy: {best_hp['sktr_acc']:.4f}\n"
    f"  SKTR Edit: {best_hp['sktr_edit']:.4f}"
)

In [None]:
# Visualize hyperparameter search results: one grouped bar chart per SKTR metric.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

plot_cols = ['sktr_acc', 'sktr_edit', 'sktr_f1@25']

# Shared y-axis for all panels: snap the range outwards to multiples of 10 and
# always extend the upper end to at least 80.
plotted_values = hp_summary_df[plot_cols]
y_min = plotted_values.min().min()
y_max = max(plotted_values.max().max(), 80)
tick_start = math.floor(y_min / 10) * 10
tick_end = math.ceil(y_max / 10) * 10
y_limits = (tick_start, tick_end)
y_ticks = list(range(int(tick_start), int(tick_end) + 1, 10))

for ax, metric in zip(axes, plot_cols):
    metric_label = metric.replace('sktr_', '').replace('_', ' ').title()
    # Rows = alpha, columns = strategy -> one bar group per alpha value.
    per_alpha = hp_summary_df.pivot(index='alpha', columns='strategy', values=metric)
    per_alpha.plot(kind='bar', ax=ax, rot=0)
    ax.set_xlabel('Alpha')
    ax.set_ylabel(metric_label)
    ax.set_title(f'SKTR {metric_label}')
    ax.legend(title='Strategy', fontsize=8)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(*y_limits)
    ax.set_yticks(y_ticks)

fig.suptitle(f'Hyperparameter Search Results ({DATASET_NAME} - {MODEL_SOURCE})', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig(hp_results_dir / f'{DATASET_NAME}_{MODEL_SOURCE}_hp_search_plots.png', dpi=150)
plt.show()

---

# Part B: Final Experiment (Training Sweep)

Use the best hyperparameters from Part A and sweep over the number of training variants, from 1 up to the N unique variants, using a predefined grid of values per dataset.

In [None]:
# Final experiment configuration - use best hyperparameters from search
FINAL_ALPHA = best_hp['alpha']
FINAL_STRATEGY = best_hp['strategy']
FINAL_WEIGHTS = HP_STRATEGIES[FINAL_STRATEGY]

# Predefined sweep ranges per dataset (based on unique variants in ground truth)
#   GTEA:      28 videos,   28 unique variants
#   50salads:  50 videos,   50 unique variants
#   Breakfast: 1712 videos, 267 unique variants (TBD)
# NOTE(review): these grids are not checked against n_unique_variants of the loaded
# data - confirm they never ask for more training variants than exist.
TRAIN_TRACE_SWEEP = dict([
    ('50salads', [1, 5, 10, 20, 30, 40, 50]),   # 50 unique variants
    ('gtea', [1, 5, 10, 15, 20, 28]),           # 28 unique variants
    ('breakfast', list(range(1, 15))),          # TBD - waiting for ASFormer predictions
])

# Datasets without a predefined grid fall back to every count from 1 to n_unique_variants.
if DATASET_NAME in TRAIN_TRACE_SWEEP:
    sweep_values = TRAIN_TRACE_SWEEP[DATASET_NAME]
else:
    sweep_values = list(range(1, n_unique_variants + 1))

print("\n".join([
    "Final Experiment Configuration:",
    f"  Alpha: {FINAL_ALPHA}",
    f"  Strategy: {FINAL_STRATEGY}",
    f"  Weights: {FINAL_WEIGHTS}",
    "  Test cases: all",
    f"  Training sweep: {sweep_values}",
    f"  Total experiments: {len(sweep_values)}",
]))

In [None]:
# Run final experiment sweep
final_results_dir = results_dir / 'final_experiment'
final_results_dir.mkdir(parents=True, exist_ok=True)

# Hyperparameters are fixed; only the number of training traces changes between runs.
sweep_params = [
    (n_train_value, None, FINAL_ALPHA, FINAL_STRATEGY, FINAL_WEIGHTS)
    for n_train_value in sweep_values
]

print(f"Running {len(sweep_params)} sweep experiments...")
print("=" * 60)

# n_jobs was set in the hyperparameter-search cell.
sweep_results = Parallel(n_jobs=n_jobs, verbose=10)(
    delayed(run_single_experiment)(
        *params,
        position, len(sweep_params), df, softmax_lst, base_config, final_results_dir,
        f"sweep_n{params[0]}"  # per-run prefix keeps the CSV/model file names distinct
    )
    for position, params in enumerate(sweep_params, start=1)
)

sweep_summary_df = pd.DataFrame(sweep_results).sort_values('n_train_traces')
sweep_summary_path = final_results_dir / f"{DATASET_NAME}_{MODEL_SOURCE}_sweep_summary.csv"
sweep_summary_df.to_csv(sweep_summary_path, index=False)
print(f"\nSaved: {sweep_summary_path}")

### Final Experiment Results

In [None]:
# Display sweep results
print("Training Sweep Results:\n")
display_cols = [
    'n_train_traces',
    'sktr_acc', 'argmax_acc',
    'sktr_edit', 'argmax_edit',
    'sktr_f1@25', 'argmax_f1@25',
]
# Show the timing column only when the summary has it.
display_cols += [col for col in ['avg_time_per_trace_sec'] if col in sweep_summary_df.columns]
display(sweep_summary_df[display_cols])

In [None]:
# Improvement analysis: per-run difference between the SKTR and argmax scores.
analysis = sweep_summary_df.assign(**{
    'acc_gain': sweep_summary_df['sktr_acc'] - sweep_summary_df['argmax_acc'],
    'edit_gain': sweep_summary_df['sktr_edit'] - sweep_summary_df['argmax_edit'],
    'f1@25_gain': sweep_summary_df['sktr_f1@25'] - sweep_summary_df['argmax_f1@25'],
})

print("SKTR Improvement over Argmax:")
# Labels carry their own padding so the printed columns line up.
for gain_label, gain_col in (
    ("Accuracy:  ", 'acc_gain'),
    ("Edit:      ", 'edit_gain'),
    ("F1@25:     ", 'f1@25_gain'),
):
    gain = analysis[gain_col]
    print(f"  {gain_label}mean={gain.mean():+.4f}, max={gain.max():+.4f}")

# Timing summary (only when the summary has timing columns with values)
if 'avg_time_per_trace_sec' in sweep_summary_df.columns:
    valid_times = sweep_summary_df['avg_time_per_trace_sec'].dropna()
    if not valid_times.empty:
        print(f"\nTiming Summary:")
        print(f"  Avg time per trace: {valid_times.mean():.3f}s (min={valid_times.min():.3f}s, max={valid_times.max():.3f}s)")
        total_times = sweep_summary_df['total_time_sec'].dropna()
        if not total_times.empty:
            print(f"  Total experiment time: {total_times.sum():.1f}s ({total_times.sum()/60:.1f} min)")

# Row label of the run with the highest SKTR accuracy.
best_idx = analysis['sktr_acc'].idxmax()
best_acc = analysis.loc[best_idx, 'sktr_acc']
best_n_train = analysis.loc[best_idx, 'n_train_traces']
print(f"\nBest SKTR accuracy: {best_acc:.4f} at n_train_traces={best_n_train}")

In [None]:
# Visualization of sweep results: one panel per metric, SKTR vs. argmax as a function
# of the number of training variants; the sixth panel only hosts the shared legend.
sns.set_theme(style='whitegrid', context='notebook', palette='deep')

fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# (column suffix, panel title)
metrics_config = [
    ('acc', 'Accuracy'),
    ('edit', 'Edit Score'),
    ('f1@10', 'F1@10'),
    ('f1@25', 'F1@25'),
    ('f1@50', 'F1@50'),
]

# Keyword arguments passed straight to sns.lineplot for each method.
method_styles = {
    'sktr': {'color': '#1f77b4', 'marker': 'o', 'label': 'SKTR', 'linestyle': '-'},
    'argmax': {'color': '#ff7f0e', 'marker': 's', 'label': 'Argmax', 'linestyle': '--'},
}

# Collect all plot columns for y-axis scaling (only those present in the summary)
plot_cols = [
    f'{method}_{metric_suffix}'
    for metric_suffix, _ in metrics_config
    for method in method_styles
    if f'{method}_{metric_suffix}' in sweep_summary_df.columns
]

# Compute y-axis limits shared by all panels: snap the range outwards to multiples
# of 10 and always extend the upper end to at least 80.
y_limits = None
y_ticks = None
if plot_cols:
    y_min = sweep_summary_df[plot_cols].min().min()
    y_max = sweep_summary_df[plot_cols].max().max()
    y_lower = math.floor(y_min / 10) * 10
    y_max = max(y_max, 80)
    tick_start = y_lower
    tick_end = math.ceil(y_max / 10) * 10
    y_limits = (tick_start, tick_end)
    y_ticks = list(range(int(tick_start), int(tick_end) + 1, 10))

for idx, (metric_suffix, title) in enumerate(metrics_config):
    ax = axes[idx]
    for method, style in method_styles.items():
        col_name = f'{method}_{metric_suffix}'
        if col_name in sweep_summary_df.columns:
            sns.lineplot(
                x=sweep_summary_df['n_train_traces'],
                y=sweep_summary_df[col_name],
                ax=ax,
                linewidth=2.5,
                markersize=9,
                **style,
            )
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Training Variants')
    ax.set_ylabel('Score')
    ax.set_xticks(sweep_summary_df['n_train_traces'].unique())
    if y_limits is not None:
        ax.set_ylim(*y_limits)
    if y_ticks is not None:
        ax.set_yticks(y_ticks)
    # Drop the per-panel legend if one was created; the labelled lines remain available
    # through get_legend_handles_labels(). Checking first avoids creating a legend
    # (and a "no labelled artists" warning on empty panels) just to remove it.
    panel_legend = ax.get_legend()
    if panel_legend is not None:
        panel_legend.remove()

# Legend in last subplot
ax_legend = axes[5]
ax_legend.axis('off')
handles, labels = axes[0].get_legend_handles_labels()
ax_legend.legend(
    handles, labels,
    loc='center', title='Method',
    fontsize=14, title_fontsize=16,
    frameon=True, fancybox=True, shadow=True,
)

# The title sits slightly above the canvas (y=1.02) to clear the panel titles.
fig.suptitle(f'Performance vs. Training Variants ({DATASET_NAME} - {MODEL_SOURCE})', fontsize=18, y=1.02)
fig.tight_layout()
# bbox_inches='tight' expands the saved area to include the title; without it the
# part of the suptitle that lies outside the figure canvas is cropped in the PNG.
fig.savefig(
    final_results_dir / f'{DATASET_NAME}_{MODEL_SOURCE}_sweep_plots.png',
    dpi=150,
    bbox_inches='tight',
)
plt.show()

---

## Load Saved Results (Standalone)

This section can be used to reload and visualize previously saved results.

In [None]:
# Load saved results (standalone cell)
# Imports are repeated here so that this cell can run on a fresh kernel without the
# setup cell (it is meant to be standalone).
from pathlib import Path
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

# Configuration - set these to match your saved results
LOAD_DATASET = '50salads'  # or 'gtea', 'breakfast'
LOAD_MODEL = 'asformer'    # or 'mstcn2', 'original'

# Reuse workspace_root when an earlier cell already defined it; otherwise use the default path.
workspace_root = globals().get('workspace_root', '/home/dsi/eli-bogdanov/sktr_for_long_traces')

# NOTE(review): this cell rebinds results_dir, final_results_dir, sweep_summary_path and
# (below) df. In a kernel where the experiment cells have run, df is the event DataFrame
# they use, so re-running those cells after this one would operate on the sweep summary
# instead. Restart the kernel or reload the dataset before re-running experiments.
results_dir = Path(workspace_root) / 'results' / LOAD_DATASET / 'variant_experiment' / LOAD_MODEL
final_results_dir = results_dir / 'final_experiment'
sweep_summary_path = final_results_dir / f"{LOAD_DATASET}_{LOAD_MODEL}_sweep_summary.csv"

if sweep_summary_path.exists():
    # Sort by training size and reset the index so rows are in sweep order.
    df = pd.read_csv(sweep_summary_path).sort_values('n_train_traces').reset_index(drop=True)
    print(f"Loaded: {sweep_summary_path}")
    display(df)
else:
    # Nothing is loaded in this case; df keeps whatever value it had before (if any).
    print(f"File not found: {sweep_summary_path}")
    print(f"Run the experiment first or check the dataset/model configuration.")