### RELIEF

In [None]:
from sklearn.metrics import matthews_corrcoef

def get_scores(y_true, y_pred, hospitals):
    scores = {"mcc_overall": matthews_corrcoef(y_true, y_pred)}

    mcc_per_class = {
        f"mcc_{cls}": matthews_corrcoef((y_true == cls), (y_pred == cls)) for cls in hospitals
    }
    scores.update(mcc_per_class)
    return scores

In [None]:
from src.harmonization.relief import RELIEFHarmonizer
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
harmonization_method = 'relief'
INFO_FILE_PATH = 'data/ELM19/filtered/ELM19_info_filtered_norm.csv'
FEATURES_FILE_PATH = 'data/ELM19/filtered/ELM19_features_filtered_norm.csv'

COVARIATES = ['age', 'gender']
SITE_COLUMN = 'hospital_id'

N_SPLITS = 5
RANDOM_STATE = 42

CATBOOST_PARAMS = {
    'iterations': 2000,
    'learning_rate': 0.2136106733298358,
    'depth': 5.0,
    'l2_leaf_reg': 1.0050061307458207,
    'early_stopping_rounds': 50,

    'loss_function': 'MultiClass',
    'eval_metric': 'MCC',

    'task_type': "GPU",
    'thread_count': 20,
    'random_seed': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False
}

try:
    info_df = pd.read_csv(INFO_FILE_PATH)
    features_df = pd.read_csv(FEATURES_FILE_PATH)
    logger.info("Data files loaded successfully.")
except FileNotFoundError:
    logger.error(f"Error: Data files not found.")
    logger.error(f"Checked: {INFO_FILE_PATH}")
    logger.error(f"Checked: {FEATURES_FILE_PATH}")

info_df = info_df.rename(columns={
    'age_dec': 'age',
    'patient_sex': 'gender',
    'institution_id': 'hospital_id',
    'classification': 'pathology_label'
})

In [None]:
sites= info_df[SITE_COLUMN]
cov = info_df[COVARIATES]
harmonizer = RELIEFHarmonizer(
                scale_features=True,
                eps=1e-3,
                max_iter=100,
                verbose=True,
                n_jobs=30,
                log_every=1
)

y = info_df[SITE_COLUMN]
X = harmonizer.fit_transform(features_df.T, batch=sites, discrete_covariates=cov['gender'], continuous_covariates=cov['age'])
X = X.T
all_hospitals = np.unique(y)
X_df = pd.DataFrame(X, columns=features_df.columns)
X_df.to_csv('data/ELM19/filtered/ELM19_features_filtered_norm_relief.csv')

In [None]:

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
all_results_list = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_df, y)):

    logger.info(f"--- Fold: {fold + 1}/{N_SPLITS} ---")

    X_train, X_test = X_df.iloc[train_idx], X_df.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = CatBoostClassifier(**CATBOOST_PARAMS)
    logger.info("Fitting pipeline on training data...")
    model.fit(X_train, y_train, plot=True, eval_set=(X_test, y_test))

    logger.info("Evaluating on test data...")
    preds = model.predict(X_test)

    scores_fold = get_scores(y_test, preds, all_hospitals)
    scores_fold['method'] = harmonization_method
    scores_fold['fold'] = fold + 1
    all_results_list.append(scores_fold)
    logger.info(f"Fold {fold + 1} Overall MCC: {scores_fold['mcc_overall']:.4f}")

    if fold == 0:
        logger.info("Saving pipeline and test data for Fold 0 (for SHAP analysis)...")

        # 1. Save the pipeline object
        pipeline_filename = f"{harmonization_method}_pipeline_fold0.joblib"
        pipeline_save_path = os.path.join(PIPELINE_SAVE_DIR, pipeline_filename)
        joblib.dump(model, pipeline_save_path)
        logger.info(f"Pipeline saved to: {pipeline_save_path}")

        # 2. Save the corresponding untransformed X_test and y_test
        test_data_filename = f"{harmonization_method}_test_data_fold0.parquet"
        test_data_save_path = os.path.join(SHAP_DATA_SAVE_DIR, test_data_filename)
        X_test_to_save = X_test.copy()
        X_test_to_save['y_true_hospital'] = y_test  # Add labels for context
        X_test_to_save.to_parquet(test_data_save_path)
        logger.info(f"Test data for SHAP saved to: {test_data_save_path}")

# Save results
df_results = pd.DataFrame(all_results_list)
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)
file_exists = os.path.isfile(RESULTS_PATH)
df_results.to_csv(RESULTS_PATH, mode='a', header=not file_exists, index=False)

mean_overall_mcc = df_results['mcc_overall'].mean()
logger.info(f"Results saved for '{harmonization_method}'. Mean Overall MCC: {mean_overall_mcc:.4f}")
