In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

from sklearn.metrics import (
    classification_report, roc_auc_score, accuracy_score, 
    f1_score, roc_curve
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from pygam import s, te, f, LogisticGAM
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from scipy.stats import ortho_group

import xgboost as xgb

from src.data_generation import *
from src.analysis import ModelEvaluator
from src.utils import *
from src.plotting import visualize_feature_shifts

from src.robust_training.mechanistic import MechanisticTrainer
from src.robust_training.mechanistic import run_mechanistic_robust_training_and_eval_in_memory

# Seed NumPy's global RNG once, up front, so repeated Restart & Run All
# executions start from the same random state.
np.random.seed(0)

# Notebook-wide switches.
# NOTE(review): GRID_SEARCH and PLOT are not read by any cell visible here —
# presumably they gate later sections; confirm they are still needed.
GRID_SEARCH = False
PLOT = False
# When True, the experiment cell generates the training set and the shifted
# test sets itself.
GEN_DATA = True



In [3]:
from typing import Dict


def evaluate_model(model, df_dict: Dict[float, pd.DataFrame], target: str) -> float:
    """Return the mean ROC AUC of ``model`` over every dataset in ``df_dict``.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba``; column 1 of its output
        is used as the score passed to ``roc_auc_score``.
    df_dict
        Mapping from a shift level to a DataFrame that holds the feature
        columns plus the ``target`` column.
    target
        Name of the label column in each DataFrame.

    Returns
    -------
    float
        Mean AUC over the datasets that could be scored, or ``nan`` when no
        dataset could be scored (including an empty ``df_dict``).

    Raises
    ------
    KeyError
        If ``target`` is not a column of one of the DataFrames.

    Notes
    -----
    Scoring stays best-effort: a dataset whose prediction or AUC computation
    fails is skipped. Unlike a bare ``except``, the failure is reported with
    a ``RuntimeWarning`` and ``KeyboardInterrupt`` / ``SystemExit`` are no
    longer swallowed.
    """
    import warnings  # function-scope import keeps this edit self-contained

    aucs = []
    for shift, df in df_dict.items():
        X_test = df.drop(columns=[target])
        y_test = df[target]
        try:
            y_proba = model.predict_proba(X_test)[:, 1]
            aucs.append(roc_auc_score(y_test, y_proba))
        except Exception as exc:
            # Skip this shift level, but say so: a silently dropped level
            # would bias the mean without anyone noticing.
            warnings.warn(
                f"evaluate_model: skipping shift level {shift!r} "
                f"({type(exc).__name__}: {exc})",
                RuntimeWarning,
                stacklevel=2,
            )

    if not aucs:
        # np.mean([]) would also give nan, but with an unrelated
        # "Mean of empty slice" warning; return it explicitly instead.
        return float("nan")
    return float(np.mean(aucs))

def grid_search_parameters(
    df_train: pd.DataFrame,
    df_dict: Dict[float, pd.DataFrame],
    target: str = 'Y',
    shift_factors: list = [0.001, 0.01, 0.1],
    fractions: list = [0.3, 0.5, 0.7, 0.9],
    model_types: list = ['tree', 'gbc', 'rfc'],
    random_state: int = 42,
    **kwargs
) -> Dict:
    """
    Grid search over:
      - base_shift_factor in `shift_factors`
      - fraction_to_shift in `fractions`
      - model_type in `model_types`

    For every combination a baseline and a robust model are obtained from
    `run_mechanistic_robust_training_and_eval_in_memory`, and both are scored
    with `evaluate_model` on `df_dict`.

    Parameters
    ----------
    df_train : training data, forwarded unchanged to the training routine.
    df_dict : mapping of shift level -> evaluation DataFrame.
    target : name of the label column.
    shift_factors, fractions, model_types : the grid axes. The list defaults
        are only iterated, never mutated.
    random_state : forwarded to the training routine.
    **kwargs : forwarded unchanged to the training routine.

    Returns
    -------
    dict with two keys:
      - 'results_df': DataFrame with one row per combination and columns
        model_type, shift_factor, fraction, robust_score, baseline_score,
        improvement (robust_score - baseline_score).
      - 'best_per_model': for each entry of `model_types`, either a dict with
        'best_robust_score', 'best_baseline_score' and 'best_params' for the
        row with the highest robust_score, or None when that model has no
        row with a usable (non-NaN) robust_score.
    """
    result_columns = [
        'model_type', 'shift_factor', 'fraction',
        'robust_score', 'baseline_score', 'improvement',
    ]
    results = []

    # Train across all combos
    for model_type in model_types:
        for shift_factor in shift_factors:
            for fraction in fractions:
                print(f"\nTesting shift_factor={shift_factor}, fraction={fraction}, model_type={model_type}")

                baseline_model, robust_model = run_mechanistic_robust_training_and_eval_in_memory(
                    df_train=df_train,
                    df_dict=df_dict,
                    target=target,
                    base_shift_factor=shift_factor,
                    fraction_to_shift=fraction,
                    model_type=model_type,
                    random_state=random_state,
                    **kwargs
                )

                robust_score = evaluate_model(robust_model, df_dict, target)
                baseline_score = evaluate_model(baseline_model, df_dict, target)

                results.append({
                    'model_type': model_type,
                    'shift_factor': shift_factor,
                    'fraction': fraction,
                    'robust_score': robust_score,
                    'baseline_score': baseline_score,
                    'improvement': robust_score - baseline_score
                })

    # Explicit columns keep the frame well-formed even when the grid is
    # empty; otherwise the 'model_type' lookup below raises KeyError and the
    # "no results" branch can never be reached.
    results_df = pd.DataFrame(results, columns=result_columns)

    # Find best (robust) score per model
    best_per_model = {}
    for mt in model_types:
        # evaluate_model may return NaN; such rows cannot be ranked, and an
        # all-NaN column makes idxmax / .loc fail, so drop them up front.
        scored = results_df[
            (results_df['model_type'] == mt) & results_df['robust_score'].notna()
        ]
        if scored.empty:
            # No usable result for this model_type
            best_per_model[mt] = None
            continue

        best_row = scored.loc[scored['robust_score'].idxmax()]
        best_per_model[mt] = {
            'best_robust_score': best_row['robust_score'],
            'best_baseline_score': best_row['baseline_score'],
            'best_params': {
                'model_type': best_row['model_type'],
                'shift_factor': best_row['shift_factor'],
                'fraction': best_row['fraction']
            }
        }

    return {
        'results_df': results_df,
        'best_per_model': best_per_model
    }



In [10]:
N = 1  # Number of times to generate data and run the grid search

# We'll store the best results for each iteration in a list of dicts:
all_best_per_model = []

# Data generation parameters
num_samples = 10000
# Drives the X1..Xn column names; presumably must equal len(mean_train) and
# the dimension of covariance_train below — TODO confirm.
num_features = 3
degree = 2  # degree for polynomial
# One shift covariance matrix is appended per iteration of the main loop.
all_covariance_shifts = [] 

# ------------------------------------------------------------
# Generate TRAIN data once (outside the loop)
# NOTE(review): when GEN_DATA is False nothing in this cell defines
# df_train, mean_train, covariance_train or coef_train, yet the main loop
# still passes df_train to the grid search — confirm where they are
# expected to come from in that case.
# ------------------------------------------------------------
if GEN_DATA:
    # Fixed mean vector and covariance matrix of the training distribution.
    mean_train = [0.90920214, 0.81962487, 0.88819135]
    covariance_train = np.array([
        [0.726318,     0.20240102,  0.52472545],
        [0.20240102,   0.11392557,  0.0264108],
        [0.52472545,   0.0264108,   1.05107627]
    ])
    
    sample_train = build_multivariate_sample(num_samples, mean_train, covariance_train)
    df_train = pd.DataFrame(sample_train, columns=[f'X{i+1}' for i in range(num_features)])
    
    # Polynomial coefficients for target generation
    # (10 values — presumably one per monomial of degree <= 2 in 3 features;
    # verify against build_poly_target.)
    coef = [
        -0.8061577012389105, -0.3621987584904036, -0.16057091147074054, 
         0.4803476403769713, -0.10624889645240687,  0.3182084398201366, 
         0.6789895126695962, -0.791324832566177,    0.531479159887424, 
         0.49115959567000167
    ]
    # The second return value is kept as coef_train and reused when labelling
    # the shifted test sets in the main loop.
    y_train,  coef_train = build_poly_target(sample_train, degree, coef)
    df_train['Y'] = y_train
    


# ------------------------------------------------------------
# Main loop - run N times
# ------------------------------------------------------------
# Each iteration draws a new shift covariance, rebuilds the shifted test sets,
# runs the grid search and records the best configuration per model type.
for iteration in range(N):
    print(f"\n{'='*35}")
    print(f"   Iteration {iteration+1}/{N}")
    print(f"{'='*35}")
    
    # --------------------------------------------------------
    # Generate SHIFTED TEST data in each iteration (optional)
    # NOTE(review): df_dict is only assigned inside this branch; with
    # GEN_DATA False the grid search below relies on a df_dict defined
    # elsewhere — confirm.
    # --------------------------------------------------------
    if GEN_DATA:
        # NOTE(review): df_train already contains the 'Y' column at this
        # point — confirm attributes_quantile ignores it, otherwise
        # mean_shift would not line up with the feature columns.
        mean_shift = attributes_quantile(df_train, 0.05)
        covariance_shift = random_cov(num_features)
        all_covariance_shifts.append(covariance_shift)
        
        # One test set per mixing probability, keyed by that probability.
        df_dict = {}
        for mix_prob in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 
                         0.6, 0.7, 0.8, 0.9, 1.0]:
            sample_mix = build_mixture_sample(
                num_samples, 
                mean_train, covariance_train, 
                mean_shift, covariance_shift, 
                mix_prob=mix_prob
            )
            df_mix = pd.DataFrame(sample_mix, columns=[f'X{i+1}' for i in range(num_features)])
            # Label with the coefficients returned for the training target so
            # train and test share the same labelling function.
            y_mix,  _ = build_poly_target(sample_mix, degree, coefficients=coef_train)
            df_mix['Y'] = y_mix
            
            df_dict[mix_prob] = df_mix

    # --------------------------------------------------------
    # Grid search: specify the models and parameters to test
    # n_rounds and noise_scale are not parameters of grid_search_parameters;
    # they travel through its **kwargs to the training routine.
    # --------------------------------------------------------
    grid_results = grid_search_parameters(
        df_train=df_train,
        df_dict=df_dict,
        target='Y',
        shift_factors=[0.01, 0.05], 
        fractions=[ 0.99],
        model_types=['tree','gbc'],  
        n_rounds=1,
        random_state=42,
        noise_scale=0.0
    )
    
    # Extract the best result for each model
    best_per_model = grid_results['best_per_model']
    all_best_per_model.append(best_per_model)
    
    # Print a summary for this iteration
    for model_type, info in best_per_model.items():
        if info is not None:
            print(f"\n[Iteration {iteration+1}] Best for {model_type.upper()}:")
            print(f"  Score:       {info['best_robust_score']:.4f}")
            print(f"  Baseline:    {info['best_baseline_score']:.4f}")
            print(f"  shift_factor={info['best_params']['shift_factor']}, fraction={info['best_params']['fraction']}")
        else:
            print(f"[Iteration {iteration+1}] No result found for model_type={model_type}.")
    # NOTE(review): this loop sits inside the iteration loop, so with N > 1
    # every iteration re-prints the covariances of all earlier iterations —
    # confirm that is intended rather than a one-off dump after the loop.
    for i, cov in enumerate(all_covariance_shifts):
        print(f"\nIteration {i+1}:")
        print(cov)


# ------------------------------------------------------------
# After finishing all iterations
# ------------------------------------------------------------
# Recap: one banner per iteration, then one line per model type.
print("\nFinished all iterations.\n")
for i, best_dict in enumerate(all_best_per_model, start=1):
    print(f"========== Iteration {i} best results per model ===========")
    for model_type, info in best_dict.items():
        # Models without a usable result get a short placeholder line.
        if info is None:
            print(f"  {model_type.upper()}: No result.")
            continue
        # sep="" joins the three fragments into the same single output line.
        print(
            f"  {model_type.upper()}: best_robust_score={info['best_robust_score']:.4f}, ",
            f"best_baseline_score={info['best_baseline_score']:.4f}, ",
            f"Robust Train params={info['best_params']}",
            sep="",
        )
    print("-----------------------------------------------------------")





   Iteration 1/1

Testing shift_factor=0.01, fraction=0.99, model_type=tree
Training set shape = (10000, 3);  Target distribution:
Y
1    5502
0    4498
Name: count, dtype: int64

=== Training Baseline Model ===
=> Baseline model trained.

=== Training Robust Model ===
[MechanisticTrainer] Initial fit on full dataset.
[MechanisticTrainer] Augmentation Round 1/1
  => Augmenting 10000 samples.
  => Augmented pool size: 20000 samples
  => Downsampled combined data to 10000 total samples.
[MechanisticTrainer] Robust model training completed.

=> Robust model trained.

=== Evaluation on Shifted Datasets ===

Shift = 0.0
  Baseline => Accuracy: 0.792, F1: 0.796, AUC: 0.870
  Robust   => Accuracy: 0.790, F1: 0.798, AUC: 0.866
  Delta AUC (Robust - Baseline) = -0.0039

Shift = 0.1
  Baseline => Accuracy: 0.784, F1: 0.776, AUC: 0.854
  Robust   => Accuracy: 0.785, F1: 0.783, AUC: 0.852
  Delta AUC (Robust - Baseline) = -0.0021

Shift = 0.2
  Baseline => Accuracy: 0.762, F1: 0.741, AUC: 0.832
 