$$
\Huge \blue{\textbf{Distribution shift \qquad}} \\
$$

# $\text{Analysis of different models over shifted datasets}$

## $\text{Imports \& settings}$

In [24]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    classification_report, roc_auc_score, accuracy_score, 
    f1_score, roc_curve
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from pygam import s, te, f, LogisticGAM
from sklearn.model_selection import train_test_split
from scipy.stats import ortho_group

import xgboost as xgb

from src.data_generation import *
from src.analysis import compare_adversarial_training, ModelEvaluator
from src.utils import *
from src.plotting import visualize_feature_shifts

np.random.seed(0)


# 1. Data Creation

In [25]:
# Name of the data folder (the saving cell below writes its CSV files under 'data').
DATA_FOLDER = 'data'

# Parameter definition

# Sample size passed to the sample builders (training set and every mixture sample).
num_samples = 1000
# Number of feature columns; the DataFrames below name them X1..X{num_features}.
num_features = 3

# Degree of the polynomial relating the features to the target
# (passed to build_poly_target).
degree = 2

## Training Set

In [26]:
# Training features: a sample built by build_multivariate_sample from the fixed
# mean vector and covariance matrix below (3 features, hard-coded for reproducibility).

mean_train = [0.90920214, 0.81962487, 0.88819135]

covariance_train = np.array([[0.726318, 0.20240102, 0.52472545],
                             [0.20240102, 0.11392557, 0.0264108],
                             [0.52472545, 0.0264108, 1.05107627]])

# Build the feature sample and wrap it in a DataFrame with columns X1..X{num_features}.
# NOTE(review): mean_train / covariance_train are hard-coded for 3 features, so
# num_features must stay 3 unless they are updated too.
sample_train = build_multivariate_sample(num_samples, mean_train, covariance_train)
df_train = pd.DataFrame(sample_train, columns=[f'X{i+1}' for i in range(num_features)])

# Build the target variable y.
# Fixed polynomial coefficients; omit `coef` from the build_poly_target call to have
# them generated randomly instead.
# 10 values = number of monomials of degree <= 2 in 3 variables (constant included);
# presumably the layout build_poly_target expects — TODO confirm against src.data_generation.
coef = [-0.8061577012389105, -0.3621987584904036, -0.16057091147074054, 0.4803476403769713, -0.10624889645240687, 
        0.3182084398201366, 0.6789895126695962, -0.791324832566177, 0.531479159887424, 0.49115959567000167]

# coef_train is what build_poly_target returns as coefficients; it is reused below so
# that every mixture sample is labelled with the same polynomial.
y_train, coef_train = build_poly_target(sample_train, degree, coef)
df_train['Y'] = y_train

# Check the class balance of the generated target.
df_train['Y'].value_counts()

Y
1    523
0    477
Name: count, dtype: int64

## Testing Sets: Shifted Distribution Mixtures

To be as general as possible, we consider statistical mixtures of the training distribution and a shifted distribution, and study the expected progressive degradation in performance as the mixture becomes increasingly dominated by the shifted distribution.

In [27]:
# Shifted distribution: its mean is taken from attributes_quantile(df_train, 0.05)
# (presumably the per-column 5% quantile of the training data — TODO confirm).
# NOTE(review): df_train already contains the 'Y' column at this point; verify that
# attributes_quantile ignores it so that mean_shift has num_features entries.
mean_shift = attributes_quantile(df_train, 0.05)

# Fixed covariance matrix of the shifted component (3 x 3, hard-coded).
covariance_shift = [[ 0.16309729,  0.19325742, -0.12621892],
                    [ 0.19325742,  0.25197638, -0.13972381],
                    [-0.12621892, -0.13972381,  0.19160666]]

# Maps each mixing probability to its labelled DataFrame (columns X1..Xn and Y).
df_dict = {}

# One mixture sample per mixing probability, from 0.0 to 1.0 in steps of 0.1.
for mix_prob in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    # Generate a mixture sample of the train and shifted distributions
    sample_mix = build_mixture_sample(num_samples, mean_train, covariance_train, mean_shift, covariance_shift, mix_prob=mix_prob)

    # Create a DataFrame for the features
    df_mix = pd.DataFrame(sample_mix, columns=[f'X{i+1}' for i in range(num_features)])

    # Build the target with the training coefficients (coef_train), so the
    # feature-to-target relationship is the same as in the training set.
    # coef_mix (the returned coefficients) is not used afterwards.
    y_mix, coef_mix = build_poly_target(sample_mix, degree, coefficients=coef_train)
    df_mix['Y'] = y_mix

    # Store the DataFrame in the dictionary, keyed by its mixing probability
    df_dict[mix_prob] = df_mix

Remark: the 0.0 mixture is a sample from the distribution that generated the training set. Since the `build_mixture_sample` function draws a new sample on every call, the 0.0 sample can be used as the test set.

## Saving Data to Files

In [28]:
# Persist the training set, every mixture sample and the generating parameters
# so that later cells (and other notebooks) can reload them from disk.
folder_name = DATA_FOLDER
os.makedirs(folder_name, exist_ok=True)

# One CSV per mixing probability: mix_<mix_prob>.csv (e.g. mix_0.1.csv).
for mix_prob, df in df_dict.items():
    df.to_csv(os.path.join(folder_name, f'mix_{mix_prob}.csv'), index=False)

# Training set.
df_train.to_csv(os.path.join(folder_name, 'train.csv'), index=False)

# Plain-text dump of the parameters that generated the data.
file_name = 'Parameters.txt'
file_path = os.path.join(folder_name, file_name)

# The handle is deliberately NOT called `f`: the import cell binds `f` to pygam's
# factor term (`from pygam import s, te, f, LogisticGAM`), and `with ... as f`
# would silently replace it with a closed file object for the rest of the session.
with open(file_path, 'w') as params_file:
    params_file.write('Polinomial coefficients\n')
    params_file.write(f'{coef_train}\n')
    params_file.write('Mean train\n')
    params_file.write(f'{mean_train}\n')
    params_file.write('Covariance train\n')
    params_file.write(f'{covariance_train}\n')
    params_file.write('Mean shift\n')
    params_file.write(f'{mean_shift}\n')
    params_file.write('Covariance shift\n')
    params_file.write(f'{covariance_shift}\n')

 # 2. Data Visualization


 For higher dimensional data (n > 2), we can either:

 - Visualize a pairwise scatter matrix (e.g., `sns.pairplot`) for a subset of features.

 - Or just visualize a specified pair of features for a quick glimpse.

In [29]:
# visualize_feature_shifts(df_dict=df_dict, features_to_plot= ['X1', 'X2', 'X3'])

# 3. Models Training

In [30]:
# # Load train data

# X_train = df_train.drop('Y', axis=1)
# y_train = df_train['Y']

## GAM Model

In [31]:
# lgam_params = {
#     "terms": s(0) + s(1) + s(2) + te(0, 1) + te(0, 2) + te(1, 2),
#     "max_iter": 100
# }

# lgam_model = LogisticGAM(**lgam_params).fit(X_train, y_train)

In [32]:
# lgam_model.summary()

## Decision Tree Classifier

In [33]:
# # Decision Tree Parameters

# dtc_params = {
#     "max_depth": 4,
#     "min_samples_leaf": 13
# }

# dtc_model = DecisionTreeClassifier(**dtc_params)
# dtc_model.fit(X_train, y_train)

## Gradient Boosting

In [34]:
# # Gradient Boosting Parameters
# gbc_params = {
#     "learning_rate": 0.05,
#     "max_depth": 4,
#     "max_features": 'log2',
#     "min_samples_leaf": 13,
#     "n_estimators": 100,
#     "subsample": 0.7
# }

# gbc_model = GradientBoostingClassifier(**gbc_params)
# gbc_model.fit(X_train, y_train)

## Extreme Gradient Boosting

In [35]:
# XGBoost Parameters
# xgb_params = {
#     "learning_rate":0.025,
#     "max_depth":5,
#     "n_estimators":100,
#     "subsample":0.7
# }

# xgb_model = xgb.XGBClassifier().fit(X_train, y_train)

 # 4. Model Evaluation After the Distribution Shift

In [36]:
# # Define the models to evaluate

# models = {
#     "LogisticGAM" : lgam_model,
#     "DecisionTreeClassifier" : dtc_model,
#     "GradientBoostingClassifier" : gbc_model,
#     # "XGBoost" : xgb_model
#     }

In [37]:
# # Assuming df_dict is a dictionary with keys from 0.0 to 1.0
# test_datasets = [(key, df.drop('Y', axis=1), df['Y']) for key, df in df_dict.items() if 0.0 <= key <= 1.0]

# evaluator = ModelEvaluator(models, test_datasets)
# evaluator.evaluate_models(show_metrics=True)
# evaluator.plot_roc_curves()
# evaluator.plot_roc_curves_per_dataset()
# evaluator.plot_accuracy()

 # 5. Adversarial Training for GradientBoosting (General n_features)

 ### Compare Normal vs. Adversarial Training on Shifted Sets

In [38]:
# for name, info in models.items():
#   compare_adversarial_training(
#     DATA_FOLDER, 
#     model_class=info["class"],
#     model_params=info["params"],
#   )

 # 6. Mechanistic-Interpretability-Guided Robust Boosting

In [39]:
from typing import Optional, Tuple, Union

from sklearn.base import BaseEstimator
from src.robust_training.mechanistic import MechanisticTrainer
# Optional-dependency guard for pyGAM: PYGAM_AVAILABLE gates the 'gam' branch of
# run_mechanistic_robust_training_and_eval below.
# NOTE(review): the first import cell of this notebook already runs an unconditional
# `from pygam import s, te, f, LogisticGAM`, so on a fresh "Run All" a missing pyGAM
# fails there before this guard is ever reached.
try:
    from pygam import LogisticGAM
    PYGAM_AVAILABLE = True
except ImportError:
    PYGAM_AVAILABLE = False


def _positive_class_scores(model, X):
    """Return the positive-class scores of `model` on `X`, or None without `predict_proba`.

    scikit-learn classifiers return an (n_samples, n_classes) array, from which
    column 1 is taken. pyGAM's LogisticGAM returns a 1-D array of probabilities,
    which is used as-is (indexing it with `[:, 1]` would raise IndexError).
    """
    if not hasattr(model, "predict_proba"):
        return None
    proba = np.asarray(model.predict_proba(X))
    if proba.ndim == 1:
        return proba
    return proba[:, 1]


def _evaluate_classifier(model, X_test, y_test):
    """Return (accuracy, weighted F1, AUC) of `model` on one test set.

    AUC is a float when it can be computed, otherwise an explanatory "N/A" string.
    """
    y_pred = model.predict(X_test)
    scores = _positive_class_scores(model, X_test)

    if scores is None:
        auc = "N/A"
    else:
        try:
            auc = roc_auc_score(y_test, scores)
        except ValueError:
            # roc_auc_score raises ValueError when y_test holds a single class.
            auc = "N/A (only one class present)"

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multiclass
    return acc, f1, auc


def run_mechanistic_robust_training_and_eval(
    folder: str = "dat",
    target: str = 'Y',
    n_rounds: int = 2,
    model_type: str = 'gbc',  # Options: 'gbc', 'tree', 'gam'
    base_shift_factor: float = 0.1,
    fraction_to_shift: float = 0.7,
    final_train_size: Optional[int] = None,
    random_state: int = 42,
    noise_scale: float = 0.001,
    n_grad_steps: int = 1,
    top_k: int = 5
) -> Tuple[BaseEstimator, BaseEstimator]:
    """
    Trains both a baseline model and a robust model using MechanisticTrainer,
    then evaluates both models on all 'mix_*.csv' test files in the specified folder
    and prints accuracy, weighted F1 and AUC for each file.

    Parameters
    ----------
    folder : str
        Directory containing 'train.csv' for training and 'mix_<p>.csv' files for testing.
        NOTE(review): the default "dat" looks like a typo for "data" (the notebook's
        DATA_FOLDER); it is kept unchanged for backward compatibility, so pass
        `folder` explicitly.
    target : str
        Name of the target variable in the datasets.
    n_rounds : int
        Number of augmentation rounds for MechanisticTrainer.
    model_type : str
        Type of model to use for both the baseline and the robust training.
        Options: 'gbc', 'tree', 'gam'.
    base_shift_factor : float
        Magnitude by which to shift selected features during augmentation.
    fraction_to_shift : float
        Fraction of the dataset to select for augmentation each round
        (forwarded to MechanisticTrainer as `subset_size_fraction`).
    final_train_size : int or None
        If specified and smaller than the trainer's final training set, the robust
        model is refit on a random subsample of this size.
    random_state : int
        Seed for reproducibility.
    noise_scale : float
        Standard deviation of Gaussian noise added to augmented samples.
    n_grad_steps : int
        Number of gradient-based steps per sample during augmentation.
    top_k : int
        Number of top features (by gradient magnitude) to shift per sample.

    Returns
    -------
    baseline_model : BaseEstimator
        The baseline model trained on the original data.
    robust_model : BaseEstimator
        The robustly trained model using MechanisticTrainer.

    Raises
    ------
    FileNotFoundError
        If 'train.csv' does not exist in `folder`.
    ValueError
        If `target` is not a column of the training file, or `model_type` is unsupported.
    ImportError
        If `model_type` is 'gam' and pyGAM is not available.
    """

    # ----------------------------------------------------------------------
    # 1) Load original training data from "train.csv"
    # ----------------------------------------------------------------------
    train_file = os.path.join(folder, "train.csv")
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"Training file '{train_file}' not found in folder '{folder}'.")

    df_orig = pd.read_csv(train_file)
    if target not in df_orig.columns:
        raise ValueError(f"Target column '{target}' not found in '{train_file}'.")

    X_train = df_orig.drop(columns=[target])
    y_train = df_orig[target]

    print(f"Loaded training data from '{train_file}' with shape = {X_train.shape}")

    # ----------------------------------------------------------------------
    # 2) Train Baseline Model
    #    Ensure baseline uses the same model_type for fair comparison.
    # ----------------------------------------------------------------------
    print("\n=== Training Baseline Model ===")
    if model_type == 'gbc':
        baseline_model = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.05,
            max_depth=3,
            random_state=random_state
        )
    elif model_type == 'tree':
        baseline_model = DecisionTreeClassifier(
            max_depth=5,
            random_state=random_state
        )
    elif model_type == 'gam':
        if not PYGAM_AVAILABLE:
            raise ImportError("pyGAM is not installed. Install it via `pip install pygam` or choose another model type.")
        baseline_model = LogisticGAM(verbose=False)
    else:
        raise ValueError(f"Unsupported model_type '{model_type}'. Choose from ['gbc', 'tree', 'gam'].")

    baseline_model.fit(X_train, y_train)
    print("Baseline model trained.")

    # ----------------------------------------------------------------------
    # 3) Train Mechanistic-Interpretability-Guided Robust Model
    # ----------------------------------------------------------------------
    print("\n=== Training Mechanistic-Interpretability-Guided Robust Model ===")
    trainer = MechanisticTrainer(
        model_type=model_type,         # 'gbc', 'tree', 'gam'
        base_shift_factor=base_shift_factor,
        n_rounds=n_rounds,
        subset_size_fraction=fraction_to_shift,
        n_grad_steps=n_grad_steps,
        top_k=top_k,
        random_state=random_state,
        noise_scale=noise_scale,
        val_fraction=0.1,               # Fraction for validation split
        eps=0.1
    )

    # Optionally, initialize the model externally if needed
    # e.g., trainer.model = some_pretrained_model

    # Fit the robust model
    trainer.fit(X_train, y_train)
    robust_model = trainer.model
    print("Robust model trained.")

    # If final_train_size is specified, downsample & refit.
    # Relies on trainer.X_final / trainer.y_final supporting `.iloc`
    # (i.e. being pandas objects).
    if final_train_size is not None and final_train_size < len(trainer.X_final):
        rng = np.random.RandomState(random_state)
        idx_down = rng.choice(len(trainer.X_final), size=final_train_size, replace=False)
        X_down = trainer.X_final.iloc[idx_down].reset_index(drop=True)
        y_down = trainer.y_final.iloc[idx_down].reset_index(drop=True)
        robust_model.fit(X_down, y_down)
        print(f"Final training set downsampled to {final_train_size} samples.")

    # ----------------------------------------------------------------------
    # 4) Evaluate on all test files "mix_<p>.csv"
    #    ("mix_0.0.csv" is deliberately included as the unshifted reference.)
    # ----------------------------------------------------------------------
    test_files = sorted(
        name for name in os.listdir(folder)
        if name.startswith("mix_") and name.endswith(".csv")
    )

    if not test_files:
        print(f"\nNo shifted test files found in '{folder}' for evaluation.")
        return baseline_model, robust_model

    print("\n=== Evaluation on Shifted Test Files ===")
    for test_file in test_files:
        test_path = os.path.join(folder, test_file)
        df_test = pd.read_csv(test_path)
        if target not in df_test.columns:
            print(f"Skipping '{test_file}': missing target '{target}'.")
            continue

        X_test = df_test.drop(columns=[target])
        y_test = df_test[target]

        # Same metrics for both models so the two lines are directly comparable.
        acc_b, f1_b, auc_b = _evaluate_classifier(baseline_model, X_test, y_test)
        acc_r, f1_r, auc_r = _evaluate_classifier(robust_model, X_test, y_test)

        print(f"\nTest File: {test_file}")
        print(f"  Baseline Model => Accuracy: {acc_b:.3f}, F1 Score: {f1_b:.3f}, AUC: {auc_b}")
        print(f"  Robust Model   => Accuracy: {acc_r:.3f}, F1 Score: {f1_r:.3f}, AUC: {auc_r}")
        print("-" * 50)

    return baseline_model, robust_model



In [None]:
# Folder holding train.csv and the mix_*.csv files, and the name of the label column.
DATA_FOLDER = 'data'
TARGET_COLUMN = 'Y'

# Run the robust training and evaluation: trains a baseline and a MechanisticTrainer
# model of the same type, then prints per-file metrics for every mix_*.csv in DATA_FOLDER.
baseline_model, robust_model = run_mechanistic_robust_training_and_eval(
    folder=DATA_FOLDER,
    target=TARGET_COLUMN,
    n_rounds=1,
    model_type='tree',          # Options: 'gbc', 'tree', 'gam'
    base_shift_factor=100,      # NOTE(review): far above the function default of 0.1 — confirm intentional
    fraction_to_shift=0.9,
    final_train_size=1000,     # Refit on a 1000-row subsample if the trainer's final set is larger (original size)
    random_state=42,
    noise_scale=0.0001,
    n_grad_steps=5,
    top_k=3                     # equals the number of feature columns in train.csv (3)
)


Loaded training data from 'data\train.csv' with shape = (1000, 3)

=== Training Baseline Model ===
Baseline model trained.

=== Training Mechanistic-Interpretability-Guided Robust Model ===
[MechanisticTrainer] Initial fit on training set.
[MechanisticTrainer] Augmentation Round 1/1


In [18]:
# def train_baseline_gb(
#     X_train: pd.DataFrame,
#     y_train: pd.Series,
#     final_train_size: int = None,
#     random_state: int = 42
# ):
#     """
#     Train a standard GradientBoosting on exactly 'final_train_size' points 
#     (if provided). If final_train_size < len(X_train), downsample first.
#     """
#     import numpy as np
#     from sklearn.ensemble import GradientBoostingClassifier

#     rng = np.random.RandomState(random_state)
    
#     if final_train_size is not None and final_train_size < len(X_train):
#         indices = rng.choice(len(X_train), size=final_train_size, replace=False)
#         X_train = X_train.iloc[indices].reset_index(drop=True)
#         y_train = y_train.iloc[indices].reset_index(drop=True)
    
#     model = GradientBoostingClassifier(
#         learning_rate=0.05,
#         max_depth=4,
#         max_features='log2',
#         min_samples_leaf=13,
#         n_estimators=100,
#         subsample=0.7,
#         random_state=random_state
#     )
#     model.fit(X_train, y_train)
#     return model


 # 7. Compare All Models: Original, Adversarial, Mechanistic

In [19]:
# def compare_models_performance(folder: str = "data", target = 'Y'):
#     """
#     Compare performance between:
#       - Original Gradient Boosting
#       - Adversarially Trained Gradient Boosting
#       - Mechanistically-Interpreted (MI) Model
      
#     Plots side-by-side bar charts of Accuracy, F1, AUC for each shifted dataset.
#     """
#     df_orig = pd.read_csv(os.path.join(folder, "mix_0.0.csv"))
#     X_train = df_orig.drop(target, axis=1)
#     y_train = df_orig[target]

#     final_size = len(X_train)
#     gbc_original = train_baseline_gb(X_train, y_train, final_train_size=final_size)
    
#     gbc_adversarial, X_adv, y_adv = adversarial_training(
#         X_train, y_train,
#         epsilon=0.1,
#         max_rounds=3,
#         fraction_to_perturb=0.5,
#         final_train_size=final_size
#     )
    
#     gbc_mech, X_mi, y_mi = mech_interp_boosting(
#         X_train, y_train,
#         base_shift_factor=0.1,
#         n_rounds=3,
#         fraction_to_shift=0.7,
#         use_weighted_fit=True,
#         final_train_size=final_size
#     )

#     # Evaluate these three models on all shifted CSVs
#     test_files = [f for f in os.listdir(folder) if f.startswith("mix_")]
#     test_files = sorted([f for f in os.listdir(folder) if f.startswith("mix_")], 
#                key=lambda x: int(x.split('_')[1].split('.')[0]))
#     results = []
#     model_dict = {
#         "OriginalGB": gbc_original,
#         "AdversarialGB": gbc_adversarial,
#         "MechInterpGB": gbc_mech
#     }

#     for model_name, model_obj in model_dict.items():
#         for test_file in sorted(test_files):
#             df_test = pd.read_csv(os.path.join(folder, sorted(test_files)[test_files.index(test_file)]))
#             X_test = df_test[X_train.columns]
#             y_test = df_test[target]

#             y_pred = model_obj.predict(X_test)
#             y_pred_proba = model_obj.predict_proba(X_test)[:, 1]

#             acc = accuracy_score(y_test, y_pred)
#             f1_ = f1_score(y_test, y_pred)
#             auc_ = roc_auc_score(y_test, y_pred_proba)

#             results.append({
#                 "Model": model_name,
#                 "ShiftedFile": test_file,
#                 "Accuracy": acc,
#                 "F1": f1_,
#                 "AUC": auc_
#             })

#     df_results = pd.DataFrame(results)
#     metrics_to_plot = ["Accuracy", "F1", "AUC"]
    
#     # Updated plotting code with fixed legends
#     fig1, axes1 = plt.subplots(3, 1, figsize=(10, 15))
    
#     for idx, metric in enumerate(metrics_to_plot):
#         sns.barplot(
#             data=df_results, 
#             x="ShiftedFile", 
#             y=metric, 
#             hue="Model",
#             ax=axes1[idx]
#         )
#         axes1[idx].set_title(f"{metric} Comparison")
#         #axes1[idx].set_xticklabels(
#         #    axes1[idx].get_xticklabels(), 
#         #    rotation=45, 
#         #    ha='right'
#         #)
#         #axes1[idx].set_ylim(0, 1.05)
#         axes1[idx].legend(title="Models")
        
#     plt.tight_layout()
#     plt.show()

#     fig2, axes2 = plt.subplots(3, 1, figsize=(10, 15))
    
#     for idx, metric in enumerate(metrics_to_plot):
#         pivot_data = df_results.pivot(
#             index='ShiftedFile', 
#             columns='Model', 
#             values=metric
#         )
        
#         rel_improvement = 100 * (
#             pivot_data - pivot_data['OriginalGB'].values.reshape(-1,1)
#         ) / pivot_data['OriginalGB'].values.reshape(-1,1)
        
#         # Explicitly set labels for legend
#         rel_improvement_plot = rel_improvement[['AdversarialGB', 'MechInterpGB']].reset_index()
#         rel_improvement_plot = pd.melt(
#             rel_improvement_plot, 
#             id_vars=['ShiftedFile'],
#             var_name='Model',
#             value_name='Improvement'
#         )
        
#         sns.barplot(
#             data=rel_improvement_plot,
#             x='ShiftedFile',
#             y='Improvement',
#             hue='Model',
#             ax=axes2[idx]
#         )
        
#         axes2[idx].set_title(f'Relative {metric} Improvement (%)')
#         axes2[idx].axhline(y=0, color='k', linestyle='--', alpha=0.3)
#         #axes2[idx].set_xticklabels(
#         #    axes2[idx].get_xticklabels(), 
#         #    rotation=45,
#         #    ha='right'
#         #)
#         axes2[idx].set_ylabel('Improvement %')
#         axes2[idx].legend(title="Models")
        
#     plt.tight_layout()
#     plt.show()


In [20]:
# compare_models_performance(DATA_FOLDER)

# ONE CELL TO RUN THEM ALL

In [21]:
# 1) Generate data for n_features
# create_shifted_datasets(n_features=4, output_folder="data", base_n_samples=1000, cov_scale=3.5, mean_shift_scale=2.5)

# 2) Visualize shifts
# visualize_feature_shifts(DATA_FOLDER, features_to_plot=['X1', 'X2', 'X3'])

# 3) Evaluate baseline models
# evaluate_models_on_shifts(DATA_FOLDER)

# 4) Compare adv vs normal
# compare_adversarial_training(DATA_FOLDER)

# 5) Mechanistic robust boosting
# demo_mechanistic_robust_boosting(DATA_FOLDER, n_rounds=2)

# 6) Compare all
# compare_models_performance(DATA_FOLDER)


In [22]:
# def tune_mechinterp_hparams(
#     folder: str = "data",
#     base_shift_candidates = [0.05, 0.1, 0.2],
#     fraction_candidates = [0.3, 0.5, 0.7],
#     target = 'Y',
#     n_rounds: int = 3,
#     final_train_size: int = None,
#     random_state: int = 42
# ):
#     """
#     1. Load the original dataset from `folder`.
#     2. Train baseline GB + Adversarial GB for reference.
#     3. For each (base_shift_factor, fraction_to_shift) in the grid:
#        - Train a MechInterp model.
#        - Evaluate on the same shifted CSVs.
#     4. Compare performance + Return a table of all results + Identify best combo.
#     Parameters
#     ----------
#     folder : str
#         Folder containing mix_*.csv.
#     base_shift_candidates : list
#         Values to try for `base_shift_factor`.
#     fraction_candidates : list
#         Values to try for `fraction_to_shift`.
#     n_rounds : int
#         Rounds of MechInterp training.
#     final_train_size : int or None
#         If not None, downsample final training set to keep dataset size fair.
#     random_state : int
#         For reproducibility.
#     Returns
#     -------
#     pd.DataFrame
#         A DataFrame summarizing all runs (Model, base_shift, fraction, ShiftFile, Accuracy, F1, AUC).
#     (float, float)
#         The best (base_shift_factor, fraction_to_shift) combo (based on chosen metric).
#     """
#     # 1) Load original => train data
#     df_orig = pd.read_csv(os.path.join(folder, "mix_0.0.csv"))
#     X_train = df_orig.drop(target, axis=1)
#     y_train = df_orig[target]
#     if final_train_size is None:
#         final_train_size = len(X_train)  # default to full training size
#     # 2) Train baseline
#     gbc_baseline = train_baseline_gb(
#         X_train, y_train, 
#         final_train_size=final_train_size, 
#         random_state=random_state
#     )
#     # 2b) Train adversarial
#     gbc_adv, X_adv, y_adv = adversarial_training_gbc(
#         X_train, y_train,
#         epsilon=0.1,
#         max_rounds=3,
#         fraction_to_perturb=0.5,
#         final_train_size=final_train_size,
#         random_state=random_state
#     )
#     # We'll evaluate on each shifted file
#     test_files = [f for f in os.listdir(folder) if f.startswith("shifted_")]
  
#     # Collect all results in a list of dicts
#     all_results = []
  
#     # Evaluate Baseline + Adversarial once here
#     model_dict = {
#         "OriginalGB": gbc_baseline,
#         "AdversarialGB": gbc_adv
#     }
  
#     for model_name, model_obj in model_dict.items():
#         for test_file in test_files:
#             df_test = pd.read_csv(os.path.join(folder, test_file))
#             X_test = df_test[X_train.columns]
#             y_test = df_test[target]
          
#             y_pred = model_obj.predict(X_test)
#             y_proba = model_obj.predict_proba(X_test)[:, 1]
          
#             all_results.append({
#                 "Model": model_name,
#                 "BaseShiftFactor": None,
#                 "FractionShift": None,
#                 "ShiftedFile": test_file,
#                 "Accuracy": accuracy_score(y_test, y_pred),
#                 "F1": f1_score(y_test, y_pred),
#                 "AUC": roc_auc_score(y_test, y_proba)
#             })
#     # 3) For each combo of base_shift_factor, fraction_to_shift => train MechInterp
#     for base_shift in base_shift_candidates:
#         for fraction_shift in fraction_candidates:
#             model_name_combo = f"MechInterp_s{base_shift}_f{fraction_shift}"
          
#             # Train MechInterp
#             gbc_mech, X_mech, y_mech = mech_interp_boosting(
#                 X_train, y_train,
#                 base_model=None,
#                 base_shift_factor=base_shift,
#                 n_rounds=n_rounds,
#                 fraction_to_shift=fraction_shift,
#                 min_improvement=0.001,
#                 noise_scale=0.01,
#                 n_features_to_shift=2,  # or your choice
#                 final_train_size=final_train_size,
#                 use_weighted_fit=True,
#                 random_state=random_state
#             )
          
#             # Evaluate on each shifted file
#             for test_file in sorted(test_files):
#                 df_test = pd.read_csv(os.path.join(folder, test_file))
#                 X_test = df_test[X_train.columns]
#                 y_test = df_test[target]
              
#                 y_pred = gbc_mech.predict(X_test)
#                 y_proba = gbc_mech.predict_proba(X_test)[:, 1]
              
#                 all_results.append({
#                     "Model": model_name_combo,
#                     "BaseShiftFactor": base_shift,
#                     "FractionShift": fraction_shift,
#                     "ShiftedFile": test_file,
#                     "Accuracy": accuracy_score(y_test, y_pred),
#                     "F1": f1_score(y_test, y_pred),
#                     "AUC": roc_auc_score(y_test, y_proba)
#                 })
#     df_all = pd.DataFrame(all_results)
#     # 4) Identify best combo:
#     # Let's pick "best" as highest average AUC across all shifted files
#     # (You can change to F1 or average of all metrics.)
#     # Filter only rows with Model starting in "MechInterp_"
#     df_mech = df_all[df_all["Model"].str.startswith("MechInterp_s")]
  
#     # Group by (BaseShiftFactor, FractionShift), average the AUC
#     grouped = df_mech.groupby(["BaseShiftFactor", "FractionShift"])["AUC"].mean().reset_index()
#     best_row = grouped.loc[grouped["AUC"].idxmax()]  # row with highest mean AUC
#     best_base_shift = best_row["BaseShiftFactor"]
#     best_fraction = best_row["FractionShift"]
#     best_auc = best_row["AUC"]
  
#     print("\n======== MechInterp Hyperparam Tuning Results ========")
#     print(df_mech.groupby(["BaseShiftFactor", "FractionShift"])[["Accuracy","F1","AUC"]].mean())
#     print("======================================================\n")
#     print(f"Best Combination (by avg AUC): base_shift={best_base_shift}, fraction={best_fraction}, AUC={best_auc:.3f}")
  
#     return df_all, (best_base_shift, best_fraction)

In [23]:
# # 1) Ensure data_folder has train.csv + mix_*.csv
# df_results, (best_bs, best_fr) = tune_mechinterp_hparams(
#     folder="data",
#     base_shift_candidates=[0.05, 0.1, 0.2],
#     fraction_candidates=[0.3, 0.5, 0.7],
#     n_rounds=3,
#     final_train_size=None  # or a specific integer
# )
# print("\nFull Results:\n", df_results.head(20))
# print(f"\nBest MechInterp Hyperparameters found: base_shift_factor={best_bs}, fraction_to_shift={best_fr}")