In [1]:
!pip install -q  openml
!pip install -q  catboost
!pip install -q optuna
!pip install -q xgboost
!pip install -q lightgbm
!pip install -q PyGithub
!pip install -q imblearn

# !nvidia-smi  # this should display information about available GPUs
# %load_ext cuml.accel

In [2]:
import openml
import os
import pandas as pd
import requests
from scipy.io import arff

# Load dataset metadata
df = openml.datasets.list_datasets(output_format='dataframe')

# Filter by GEMLeR: binary class, ≥10k features, and 'AP_' or 'OVA_' in name
gemler = df[
    (df['NumberOfClasses'] == 2) &
    (df['NumberOfFeatures'] >= 10000) &
    (df['name'].str.startswith(('AP_', 'OVA_')))
]
gemler['ImbalanceRatio']=gemler['MajorityClassSize']/gemler['MinorityClassSize']
gemler_sorted = gemler.sort_values(by='ImbalanceRatio',ascending=False).head(3)

# Save to CSV
gemler_sorted[['did', 'name', 'NumberOfInstances', 'NumberOfFeatures', 'MajorityClassSize', 'MinorityClassSize', 'ImbalanceRatio']].to_csv("GEMLeR_Datasets.csv", index=False)

print(gemler_sorted[['did', 'name', 'NumberOfInstances', 'NumberOfFeatures', 'MajorityClassSize', 'MinorityClassSize', 'ImbalanceRatio']])


       did             name  NumberOfInstances  NumberOfFeatures  \
1142  1142  OVA_Endometrium             1545.0           10936.0   
1146  1146     OVA_Prostate             1545.0           10937.0   
1139  1139      OVA_Omentum             1545.0           10936.0   

      MajorityClassSize  MinorityClassSize  ImbalanceRatio  
1142             1484.0               61.0       24.327869  
1146             1476.0               69.0       21.391304  
1139             1468.0               77.0       19.064935  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gemler['ImbalanceRatio']=gemler['MajorityClassSize']/gemler['MinorityClassSize']


In [3]:
def encode_target(df, dataset_name):
    target_col = df['Tissue']

    if dataset_name.startswith("AP_"):
        # Format: AP_Word1_Word2
        _, word1, word2 = dataset_name.split("_", 2)
        label_map = {word1: 1, word2: 0}
        y = target_col.map(label_map)

    elif dataset_name.startswith("OVA_"):
        # Format: OVA_Word1
        _, word1 = dataset_name.split("_", 1)
        y = target_col.apply(lambda x: 1 if x == word1 else 0)

    else:
        raise ValueError("Unknown dataset format")

    return y

# Output directory for ARFF files
arff_dir = "datasets_arff"
os.makedirs(arff_dir, exist_ok=True)

# Dictionary to store loaded DataFrames
datasets = {}

for _, row in gemler_sorted.iterrows():
    did = int(row['did'])
    name = row['name']

    # Create safe filename
    safe_name = name.replace(' ', '_').replace('/', '_')
    arff_path = os.path.join(arff_dir, f"{safe_name}.arff")

    # Download dataset metadata
    dataset = openml.datasets.get_dataset(did)

    # Download only if file does not already exist
    if not os.path.exists(arff_path):
        file_url = dataset.url
        response = requests.get(file_url)
        if response.status_code != 200:
            print(f"❌ Failed to download dataset '{name}' (ID {did})")
            continue
        with open(arff_path, 'wb') as f:
            f.write(response.content)
        print(f"✅ Downloaded and saved ARFF file: {arff_path}")
    else:
        print(f"🔁 Skipped download (already exists): {arff_path}")

    # Load the ARFF file into a DataFrame
    data, meta = arff.loadarff(arff_path)
    df = pd.DataFrame(data)
    # Decode byte columns to strings if needed
    for col in df.select_dtypes([object]).columns:
        df[col] = df[col].apply(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
    df['Tissue']=encode_target(df, safe_name)
    # Store DataFrame
    datasets[safe_name] = df
    print(f"📊 Loaded DataFrame: {safe_name} with shape {df.shape}")

🔁 Skipped download (already exists): datasets_arff/OVA_Endometrium.arff
📊 Loaded DataFrame: OVA_Endometrium with shape (1545, 10937)
🔁 Skipped download (already exists): datasets_arff/OVA_Prostate.arff
📊 Loaded DataFrame: OVA_Prostate with shape (1545, 10937)
🔁 Skipped download (already exists): datasets_arff/OVA_Omentum.arff
📊 Loaded DataFrame: OVA_Omentum with shape (1545, 10937)


In [4]:


for safe_name, df in datasets.items():
    # Retrieve the original OpenML dataset using the name
    # (optional: you could cache the target name in earlier code)
    matching_row = gemler_sorted[gemler_sorted['name'].str.replace(' ', '_') == safe_name]

    if matching_row.empty:
        print(f"⚠️  Could not find metadata for: {safe_name}")
        continue

    did = int(matching_row['did'].values[0])
    dataset = openml.datasets.get_dataset(did)

    target_col = dataset.default_target_attribute
    if target_col not in df.columns:
        print(f"⚠️  Target column '{target_col}' not found in: {safe_name}")
        continue
    print(f"\n📊 Dataset: {safe_name} (Target: {target_col})")
    #print(df['Tissue'],encode_target(df, safe_name))
    print(df[target_col].value_counts())



📊 Dataset: OVA_Endometrium (Target: Tissue)
Tissue
0    1484
1      61
Name: count, dtype: int64

📊 Dataset: OVA_Prostate (Target: Tissue)
Tissue
0    1476
1      69
Name: count, dtype: int64

📊 Dataset: OVA_Omentum (Target: Tissue)
Tissue
0    1468
1      77
Name: count, dtype: int64


In [5]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import check_random_state
from collections import defaultdict
from imblearn.base import BaseSampler
from imblearn.pipeline import Pipeline

class RF_SMOTE_TopPairs(BaseSampler):
    """Random Forest-based SMOTE using top minority pairs with min/max leaf purity."""

    _parameter_constraints = {
        "n_estimators": [int],
        "max_depth": [int, type(None)],
        "pair_score": [str],
        "min_leaf_purity": [float],
        "max_leaf_purity": [float],
        "top_pair_fraction": [float],
        "max_per_sample": [int, type(None)],
        "lambda_low": [float],
        "lambda_high": [float],
        "random_state": [int, type(None)]
    }

    def __init__(self, n_estimators=500, max_depth=10, min_leaf_purity=0.8,
                 max_leaf_purity=1.0, top_pair_fraction=0.1, max_per_sample=None,
                 lambda_low=0.0, lambda_high=1.0, pair_score="l1", random_state=None):
        super().__init__()
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_leaf_purity = min_leaf_purity
        self.max_leaf_purity = max_leaf_purity
        self.top_pair_fraction = top_pair_fraction
        self.pair_score = pair_score
        self.max_per_sample = max_per_sample
        self.lambda_low = lambda_low
        self.lambda_high = lambda_high
        self.random_state = random_state

    _sampling_type = "over-sampling"  # <-- indicates this is an oversampler

    def _fit_resample(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)
        rng = check_random_state(self.random_state)

        classes, counts = np.unique(y, return_counts=True)
        min_class = classes[np.argmin(counts)]
        max_count = counts.max()

        X_min = X[y == min_class]
        y_min = y[y == min_class]
        n_min, n_features = X_min.shape

        n_new = max_count - n_min
        if n_new <= 0:
            return X.copy(), y.copy()

        # Train Random Forest
        rf = RandomForestClassifier(
            n_estimators=self.n_estimators,
            max_depth=self.max_depth,
            random_state=rng,
            class_weight="balanced"
        )
        rf.fit(X, y)
        leaves = rf.apply(X)

        # Compute feature weights
        W = self._compute_feature_weights(X_min, y, min_class, rf, leaves)

        # Compute pair scores
        S = np.sqrt((W ** 2).sum(axis=2)) if self.pair_score == "l2" else W.sum(axis=2)
        np.fill_diagonal(S, 0.0)
        i_idx, j_idx = np.triu_indices(n_min, k=1)
        scores = S[i_idx, j_idx]

        # Keep top fraction of pairs
        m_pairs = len(scores)
        keep = max(1, int(np.ceil(self.top_pair_fraction * m_pairs)))
        if keep < m_pairs:
            kth = np.argpartition(scores, -keep)[-keep:]
            sel = kth[np.argsort(scores[kth])][::-1]
        else:
            sel = np.argsort(scores)[::-1]

        i_sel, j_sel, s_sel = i_idx[sel], j_idx[sel], scores[sel]

        if s_sel.sum() <= 0:
            raise ValueError("All selected minority pairs have zero score. "
                             "Cannot generate synthetic samples.")

        # Allocate synthetic samples proportionally
        alloc = n_new * (s_sel / s_sel.sum())
        base = np.floor(alloc).astype(int)
        remainder = n_new - base.sum()
        frac_order = np.argsort((alloc - base))[::-1]
        base[frac_order[:remainder]] += 1

        # Optional max per sample
        if self.max_per_sample is not None:
            use_count = np.zeros(n_min, dtype=int)
            order = np.argsort(s_sel)[::-1]
            for k in order:
                i, j = i_sel[k], j_sel[k]
                cap_left = max(0, min(self.max_per_sample - use_count[i],
                                       self.max_per_sample - use_count[j]))
                base[k] = min(base[k], cap_left)
                use_count[i] += base[k]
                use_count[j] += base[k]
            # redistribute deficit if undershoot
            deficit = n_new - base.sum()
            if deficit > 0:
                for k in order:
                    if deficit == 0:
                        break
                    i, j = i_sel[k], j_sel[k]
                    if use_count[i] < self.max_per_sample and use_count[j] < self.max_per_sample:
                        base[k] += 1
                        use_count[i] += 1
                        use_count[j] += 1
                        deficit -= 1

        # Generate synthetic samples
        X_syn = []
        for k, n_k in enumerate(base):
            if n_k <= 0:
                continue
            a, b = i_sel[k], j_sel[k]
            w_feat = W[a, b]
            for _ in range(n_k):
                lam = self.lambda_low + (self.lambda_high - self.lambda_low) * rng.rand(n_features)
                x_new = np.array([
                    X_min[a, f] + lam[f] * (X_min[b, f] - X_min[a, f]) if w_feat[f] > 0
                    else X_min[a, f] if rng.rand() < 0.5 else X_min[b, f]
                    for f in range(n_features)
                ])
                X_syn.append(x_new)

        X_syn = np.array(X_syn)
        y_syn = np.full(len(X_syn), min_class)

        X_res = np.vstack([X, X_syn])
        y_res = np.hstack([y, y_syn])

        return X_res, y_res

    # -------- internal helpers --------
    def _compute_feature_weights(self, X_min, y, min_class, rf, leaves):
        n_min, n_features = X_min.shape
        n_samples, n_trees = leaves.shape
        W = np.zeros((n_min, n_min, n_features), dtype=float)
        minority_global = np.flatnonzero(y == min_class)
        g2l = {g: i for i, g in enumerate(minority_global)}

        # valid leaves within min/max purity
        leaf_valid = [{} for _ in range(n_trees)]
        for t in range(n_trees):
            for leaf_id in np.unique(leaves[:, t]):
                in_leaf = (leaves[:, t] == leaf_id)
                p_minority = np.mean(y[in_leaf] == min_class)
                if self.min_leaf_purity <= p_minority <= self.max_leaf_purity:
                    leaf_valid[t][leaf_id] = True

        for t, est in enumerate(rf.estimators_):
            tree = est.tree_
            leaf_ids = leaves[:, t]
            leaf_to_local = defaultdict(list)
            for g in minority_global:
                leaf_id = leaf_ids[g]
                if leaf_id in leaf_valid[t]:
                    leaf_to_local[leaf_id].append(g2l[g])
            for leaf_id, local_idxs in leaf_to_local.items():
                if len(local_idxs) < 2:
                    continue
                path_feats = self._path_features(tree, leaf_id)
                if not path_feats:
                    continue
                f_mask = np.zeros(n_features, dtype=float)
                f_mask[list(path_feats)] = 1.0
                for a in local_idxs:
                    W[a, local_idxs] += f_mask
        return W

    def _path_features(self, tree, leaf_id):
        children_left = tree.children_left
        children_right = tree.children_right
        feature = tree.feature
        parents = {children_left[i]: i for i in range(len(children_left)) if children_left[i] != -1}
        parents.update({children_right[i]: i for i in range(len(children_right)) if children_right[i] != -1})
        feats = set()
        nid = leaf_id
        while nid in parents:
            p = parents[nid]
            if feature[p] != -2:
                feats.add(feature[p])
            nid = p
        return feats

In [None]:
import optuna
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score, RepeatedStratifiedKFold, train_test_split
#from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, make_scorer, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from github import Github
import joblib
import random
import json
import pickle
import lzma
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN, BorderlineSMOTE, SVMSMOTE

#calibrate classifier as well!!!
#Do brier score!!!?! and what else
#rewrite the CV score function to output several metrics!?


random.seed(42)
cv_results = {}

# Define outer CV (final model evaluation)
outer_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=42)



#study_summaries = optuna.get_all_study_summaries(storage)

# Delete each study
#for summary in study_summaries:
#    optuna.delete_study(study_name=summary.study_name, storage=storage)

# List of models to compare
classifiers = {
     "Logistic Regression": LogisticRegression,
 ##   "LDA" : LinearDiscriminantAnalysis,
 ##   "QDA" : QuadraticDiscriminantAnalysis,
  "Naive Bayes" : GaussianNB,
  "SVM (Linear)" : SVC,
 ##   "SVM (RBF)": SVC,
   "k-NN": KNeighborsClassifier,
   "Random Forest": RandomForestClassifier,
 ##   "Gradient Boosting": GradientBoostingClassifier,
 ##   "CatBoost": CatBoostClassifier,
     "XGBoost": XGBClassifier,
 ##   "LightGBM": LGBMClassifier
}

# Balanced accuracy scorer
auc_scorer = make_scorer(roc_auc_score)

# Store best models and scores
best_models = {}
best_scores = {}

# Objective builders for each model
def build_objective(model_name, X, y, dr_method,OS_method):
    def objective(trial):

        if OS_method == "Random":
            steps = [("oversample", RandomOverSampler(random_state=42))]
        elif OS_method == "SMOTE":
            smote_k = trial.suggest_int("smote_k", 2, 10)
            steps =  [('smote', SMOTE(k_neighbors=smote_k, random_state=42))]
        elif OS_method == "ADASYN":
            smote_k = trial.suggest_int("smote_k", 2, 10)
            steps =  [('adasyn', ADASYN(n_neighbors=smote_k, random_state=42))]
        elif OS_method == "Borderline":
            smote_k = trial.suggest_int("smote_k", 2, 10)
            smote_m = trial.suggest_int("smote_m", 5, 20)
            steps = [('borderline', BorderlineSMOTE(k_neighbors=smote_k,m_neighbors=smote_m,random_state=42))]
        elif OS_method == "SVM":
            smote_k = trial.suggest_int("smote_k", 2, 10)
            smote_m = trial.suggest_int("smote_m", 5, 20)
            steps = [('SVM', SVMSMOTE(k_neighbors=smote_k,m_neighbors=smote_m,random_state=42))]
        elif OS_method == "RFsmote":
            min_leaf_purity = trial.suggest_float("min_leaf_purity", 0.5, 0.8)
            max_leaf_purity = trial.suggest_float("max_leaf_purity", min_leaf_purity+0.1, 1.0)
            top_pair_fraction = trial.suggest_float("top_pair_fraction", 0.05, 0.1)
            os_max_depth = trial.suggest_int("os_max_depth", 4, 10)
            steps = [('RFsmote', RF_SMOTE_TopPairs(min_leaf_purity=min_leaf_purity,max_leaf_purity=max_leaf_purity,top_pair_fraction=top_pair_fraction,max_depth=os_max_depth,random_state=42))]
        # elif OS_method = "CDNN":
    
        steps.append(('scaler', StandardScaler()))

        if dr_method == "pca":
            ncomp = trial.suggest_int("ncomp", 5, min(X.shape[1],0.8*X.shape[0]))
            dr_step = ('dr', PCA(n_components=ncomp, random_state=42))
        elif dr_method == "ica":
            ncomp = trial.suggest_int("ncomp", 5, min(X.shape[1], 0.8*X.shape[0]))
            dr_step = ('dr', FastICA(n_components=ncomp, whiten='unit-variance', max_iter=1000, random_state=42))
        elif dr_method == "svd":
            ncomp = trial.suggest_int("ncomp", 5, min(X.shape[1], 0.8*X.shape[0]))
            dr_step = ('dr', TruncatedSVD(n_components=ncomp, random_state=42))
        elif dr_method:
            raise ValueError("Unknown DR method")
        if dr_method:
            steps.append(dr_step)

        if model_name == "Logistic Regression":
            C = trial.suggest_float("C", 1e-4, 1e2,log=True)
            clf = LogisticRegression(penalty='l1', C=C, solver="liblinear", max_iter=1000)
            steps.append(('clf', clf))

        elif model_name == "LDA":
            solver = trial.suggest_categorical("solver", ["lsqr", "eigen"])
            shrinkage_opt = trial.suggest_categorical("shrinkage", ["None", "auto"])
            if shrinkage_opt == "None":
                shrinkage = None
            elif shrinkage_opt == "auto":
                shrinkage = "auto"
            # Suggest tol (used only with lsqr and eigen, but safe to include)
            tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)
            # Suggest n_components (must be <= min(n_classes-1, n_features), so use fixed upper bound)
            # Build classifier
            clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage,tol=tol)
            steps.append(('clf', clf))

        elif model_name == "QDA":
            reg_param = trial.suggest_float("reg_param", 0.0, 1)
            clf = QuadraticDiscriminantAnalysis(reg_param=reg_param)
            steps.append(('clf', clf))

        elif model_name == "Naive Bayes":
            var_smoothing = trial.suggest_float("var_smoothing", 1e-11, 1e-7,log=True)
            clf = GaussianNB(var_smoothing=var_smoothing)
            steps.append(('clf', clf))

        elif model_name == "SVM (Linear)":
            C = trial.suggest_float("C", 1e-4, 1e2,log=True)
            clf = SVC(kernel='linear', C=C, probability=True,max_iter=100000)
            steps.append(('clf', clf))

        elif model_name == "SVM (RBF)":
            C = trial.suggest_float("C", 1e-4, 1e2,log=True)
            gamma = trial.suggest_float("gamma", 1e-5, 1e-1,log=True)
            clf = SVC(kernel='rbf', C=C, gamma=gamma, probability=True)
            steps.append(('clf', clf))

        elif model_name == "k-NN":
            n_neighbors = trial.suggest_int("n_neighbors", 3, 30)
            weights = trial.suggest_categorical("weights", ["uniform", "distance"])
            clf = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, n_jobs=-1)
            steps.append(('clf', clf))

        elif model_name == "Random Forest":
            n_estimators = trial.suggest_int("n_estimators", 100, 1000)
            max_depth = trial.suggest_int("max_depth", 2, 20)
            #min_samples_split = trial.suggest_int("min_samples_split", 5, 30)
            #min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)
            clf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
            steps.append(('clf', clf))

        elif model_name == "Gradient Boosting":
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            n_estimators = trial.suggest_int("n_estimators", 100, 1000)
            max_depth = trial.suggest_int("max_depth", 2, 10)
            clf = GradientBoostingClassifier(
                learning_rate=learning_rate, n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
            steps.append(('clf', clf))

        elif model_name == "CatBoost":
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            depth = trial.suggest_int("depth", 4, 10)
            iterations = trial.suggest_int("iterations", 100, 1000)
#            l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 1, 10),
#            border_count = trial.suggest_int('border_count', 32, 255),
            clf = CatBoostClassifier(learning_rate=learning_rate,depth=depth,iterations=iterations, early_stopping_rounds=20,verbose=0,thread_count=-1) #l2_leaf_reg=l2_leaf_reg,border_count = border_count,
            steps.append(('clf', clf))

        elif model_name == "XGBoost":
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            max_depth = trial.suggest_int("max_depth", 3, 10)
            n_estimators = trial.suggest_int("n_estimators", 100, 1000)
            clf = XGBClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators, eval_metric='logloss', n_jobs=-1)
            steps.append(('clf', clf))

        elif model_name == "LightGBM":
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
            max_depth = trial.suggest_int("max_depth", 3, 15)
            n_estimators = trial.suggest_int("n_estimators", 100, 1000)
            clf = LGBMClassifier(learning_rate=learning_rate, max_depth=max_depth, n_estimators=n_estimators,force_col_wise=True, deterministic=True, n_jobs=-1)
            steps.append(('clf', clf))

        else:
            raise ValueError(f"Model '{model_name}' not supported.")

        pipeline = Pipeline(steps)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc', n_jobs=-1)
        return np.mean(scores)
    return objective

# Main loop: for each model, tune + evaluate with outer CV
def run_nested_cv(X, y, classifiers,ds_name):
    for OS_method in ["Random", "SMOTE", "ADASYN", "Borderline", "SVM", "RFsmote"]:
        dr_method = None
        for name, model_cls in classifiers.items():
            print(f"\n🔍 Optimizing: {name}")
            outer_scores = []
            best_pipeline = None
            best_score = -np.inf
    #!!!! should swap the data loop with the classifer loop...
    #Will save on dim red computation!!!
            storage = optuna.storages.RDBStorage(url="sqlite:///"+name+"_study.db", engine_kwargs={"connect_args": {"timeout": 100}})


            for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
                print(f"  Fold {fold_idx + 1}")

                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                #X_train, X_test = X[train_idx], X[test_idx]
                #y_train, y_test = y[train_idx], y[test_idx]

                study = optuna.create_study(study_name=ds_name+name+str(dr_method)+str(OS_method)+str(fold_idx)+str(random.randint(0, 1e6)),direction="maximize", storage=storage,load_if_exists=True)

                #"Random Forest",
                #if name in ["Gradient Boosting", "CatBoost","LightGBM"]:
                #    study.optimize(build_objective(name, X_train, y_train, dr_method), n_trials=1000, n_jobs=-1)
                #else:
                study.optimize(build_objective(name, X_train, y_train, dr_method,OS_method), n_trials=300, n_jobs=-1)

                best_params = study.best_params
                best_params_preserve = best_params.copy()

                if OS_method == "Random":
                    steps = [("oversample", RandomOverSampler(random_state=42))]
                elif OS_method == "SMOTE":
                    smote_k = best_params.pop("smote_k")
                    steps =  [('smote', SMOTE(k_neighbors=smote_k, random_state=42))]
                elif OS_method == "ADASYN":
                    smote_k = best_params.pop("smote_k")
                    steps =  [('adasyn', ADASYN(n_neighbors=smote_k, random_state=42))]
                elif OS_method == "Borderline":
                    smote_k = best_params.pop("smote_k")
                    smote_m = best_params.pop("smote_m")
                    steps = [('borderline', BorderlineSMOTE(k_neighbors=smote_k,m_neighbors=smote_m,random_state=42))]
                elif OS_method == "SVM":
                    smote_k = best_params.pop("smote_k")
                    smote_m = best_params.pop("smote_m")
                    steps = [('SVM', SVMSMOTE(k_neighbors=smote_k,m_neighbors=smote_m,random_state=42))]
                elif OS_method == "RFsmote":
                    min_leaf_purity =  best_params.pop("min_leaf_purity")
                    max_leaf_purity =  best_params.pop("max_leaf_purity")
                    top_pair_fraction = best_params.pop("top_pair_fraction") 
                    os_max_depth = best_params.pop("os_max_depth") 
                    steps = [('RFsmote', RF_SMOTE_TopPairs(min_leaf_purity=min_leaf_purity,max_leaf_purity=max_leaf_purity,top_pair_fraction=top_pair_fraction,max_depth=os_max_depth,random_state=42))]
               # elif OS_method = "CDNN":

                
                steps.append(('scaler', StandardScaler()))

                dr_step = None

                if dr_method == "pca":
                    pca_n = best_params.pop("ncomp")
                    dr_step = ('dr', PCA(n_components=pca_n,random_state=42))

                elif dr_method == "ica":
                    ica_n = best_params.pop("ncomp")
                    dr_step = ('dr', FastICA(n_components=ica_n, whiten='unit-variance', max_iter=1000,random_state=42))

                elif dr_method == "svd":
                    svd_n = best_params.pop("ncomp")
                    dr_step = ('dr', TruncatedSVD(n_components=svd_n,random_state=42))

                if dr_step:
                    steps.append(dr_step)

                dr_keys = ["ncomp"]
                # Filtered dict with only classifier params
                clf_params = {k: v for k, v in best_params.items() if k not in dr_keys}
                os_keys = ["smote_k","smote_m","min_leaf_purity","max_leaf_purity","top_pair_fraction","os_max_depth"]
                # Filtered dict with only classifier params
                clf_params = {k: v for k, v in clf_params.items() if k not in os_keys}

                # Refit model with best params on train set
                if name == "Logistic Regression":
                    clf = model_cls(**clf_params, solver="liblinear", penalty='l1', max_iter=1000)
                    steps.append(('clf', clf))
                elif name == "LDA":
                    clf = model_cls(**clf_params)
                    steps.append(('clf', clf))
                elif name == "QDA":
                    clf = model_cls(**clf_params)
                    steps.append(('clf', clf))
                elif name == "Naive Bayes":
                    clf = model_cls(**clf_params)
                    steps.append(('clf', clf))
                elif name == "SVM (Linear)":
                    clf = model_cls(**clf_params,kernel='linear', probability=True,max_iter=100000)
                    steps.append(('clf', clf))
                elif name == "SVM (RBF)":
                    clf = model_cls(**clf_params,kernel='rbf', probability=True)
                    steps.append(('clf', clf))
                elif name == "k-NN":
                    clf = model_cls(**clf_params,n_jobs=-1)
                    steps.append(('clf', clf))
                elif name == "Random Forest":
                    clf = model_cls(**clf_params,n_jobs=-1)
                    steps.append(('clf', clf))
                elif name == "Gradient Boosting":
                    clf = model_cls(**clf_params)
                    steps.append(('clf', clf))
                elif name == "XGBoost":
                    clf = model_cls(**clf_params, eval_metric='logloss', n_jobs=-1)
                    steps.append(('clf', clf))
                elif name == "CatBoost":
                    clf = model_cls(**best_params, early_stopping_rounds=20,verbose=0,thread_count=-1,random_state=42)
                    steps.append(('clf', clf))
                elif name == "XGBoost":
                    clf =model_cls(**best_params, use_label_encoder=False, eval_metric="logloss", n_jobs=-1)
                    steps.append(('clf', clf))
                elif name == "LightGBM":
                    clf = model_cls(**best_params,force_col_wise=True, deterministic=True, n_jobs=-1)
                    steps.append(('clf', clf))

                pipeline = Pipeline(steps)
                pipeline.fit(X_train, y_train)
                y_pred = pipeline.predict_proba(X_test)[:, 1]
                score = roc_auc_score(y_test, y_pred)

                print(f"    AUC: {score:.4f}")
                outer_scores.append(score)

                if score > best_score:
                    best_score = score
                    best_pipeline = pipeline
                    mybest_params = best_params_preserve
                # --- Step 1: Authenticate with a personal access token (create one at https://github.com/settings/tokens)
                #g = Github("XXXX")

                # --- Step 2: Choose the repository
                #repo = g.get_user().get_repo("Data588")  # or use g.get_repo("username/repo")

                # --- Step 3: Prepare the file details
                #file_path = "example_study.db"
                #github_path = "experiments/HPClineardimred"+name+"/example_study.db"  # path in the repo

                #try:
                #    with open(file_path, "r", encoding="utf-8") as file:
                #        content = file.read()
                #except UnicodeDecodeError:
                #    with open(file_path, "r", encoding="latin1") as file:
                #        content = file.read()
                # --- Step 5: Upload (create or update the file)
                #try:
                #    contents = repo.get_contents(github_path)
                #    repo.update_file(contents.path, "Update file via script", content, contents.sha)
                #    print("File updated.")
                #except Exception:
                #    repo.create_file(github_path, "Initial commit via script", content)
                #    print("File uploaded.")

            # Save best model
            best_models[name] = best_pipeline
            best_scores[name] = outer_scores
            
            # Save with maximum compression
            #with lzma.open("best_model_"+ds_name+name.replace(' ', '_')+str(dr_method)+".pkl.xz", "wb", preset=9 | lzma.PRESET_EXTREME) as f:
            #    joblib.dump(best_pipeline, f)
    
            #print(f"✅ Saved best model for {name} with AUC: {np.mean(outer_scores):.4f}")
            #print(mybest_params)
            #print(model_cls)

            # --- Step 1: Authenticate with a personal access token (create one at https://github.com/settings/tokens)
            g = Github("XXXX")

            # --- Step 2: Choose the repository
            repo = g.get_user().get_repo("Data588")  # or use g.get_repo("username/repo")

            # --- Step 3: Prepare the file details
            #file_path = "best_model_"+ds_name+name.replace(' ', '_')+str(dr_method)+".pkl.xz"
            #github_path = "experiments/HPClineardimred"+name+"/best_model_"+ds_name+name.replace(' ', '_')+str(dr_method)+".pkl.xz"  # path in the repo

            #try:
            #    with open(file_path, "r", encoding="utf-8") as file:
            #        content = file.read()
            #except UnicodeDecodeError:
            #    with open(file_path, "r", encoding="latin1") as file:
            #        content = file.read()
            # --- Step 5: Upload (create or update the file)
            #try:
            #    contents = repo.get_contents(github_path)
            #    repo.update_file(contents.path, "Update file via script", content, contents.sha)
            #    print("File updated.")
            #except Exception:
            #    repo.create_file(github_path, "Initial commit via script", content)
            #    print("File uploaded.")


            file_path = name+"_study.db"
            github_path = "experiments/HPClinearOS"+name+"/"+name+"_study.db"  # path in the repo
    
            with open(file_path, "rb") as file:
                content = file.read()

            # --- Step 5: Upload (create or update the file)
            try:
                contents = repo.get_contents(github_path)
                repo.update_file(contents.path, "Update file via script", content, contents.sha)
                print("File updated.")
            except Exception:
                repo.create_file(github_path, "Initial commit via script", content)
                print("File uploaded.")


            json_data = {
                "scores": outer_scores,
                "mean": float(np.mean(outer_scores)),
                "std": float(np.std(outer_scores)),
                "best_params": mybest_params
            }
            
            filename = f"{ds_name}_{name}_{dr_method}_{OS_method}_cv_results.json"
            with open(filename, "w") as f:
                json.dump(json_data, f, indent=2)


            file_path = filename
            github_path = "experiments/HPClinearOS"+name+"/"+filename  # path in the repo

            try:
                with open(file_path, "r", encoding="utf-8") as file:
                    content = file.read()
            except UnicodeDecodeError:
                with open(file_path, "r", encoding="latin1") as file:
                    content = file.read()
            # --- Step 5: Upload (create or update the file)
            try:
                contents = repo.get_contents(github_path)
                repo.update_file(contents.path, "Update file via script", content, contents.sha)
                print("File updated.")
            except Exception:
                repo.create_file(github_path, "Initial commit via script", content)
                print("File uploaded.")

                
# Example usage
#from sklearn.datasets import load_breast_cancer
#X, y = load_breast_cancer(return_X_y=True)

#X, y = datasets["OVA_Endometrium"].drop(columns='Tissue'), datasets["OVA_Endometrium"]['Tissue']

#X = X[np.random.choice(X.columns, size=int(0.01 * X.shape[1]), replace=False)]
#X_reduced, _, y_reduced, _ = train_test_split(
#    X, y,
#    test_size=0.8,      # Keep 10%, drop 90%
#    stratify=y,
#    random_state=42     # For reproducibility)

## Add loop to run over all data sets!!!! (needs to be a parameter to run_nested and also needs to be stored in the study name of optuna!)

for safe_name, df in datasets.items():
    if safe_name == 'OVA_Omentum':
        X, y = df.drop(columns='Tissue'), df['Tissue']
#  run_nested_cv(X_reduced, y_reduced, classifiers, safe_name)
        run_nested_cv(X, y, classifiers, safe_name)






  from .autonotebook import tqdm as notebook_tqdm



🔍 Optimizing: XGBoost


[I 2025-09-06 02:42:10,817] A new study created in RDB with name: OVA_OmentumXGBoostNoneRFsmote0670487


  Fold 1


[I 2025-09-06 02:43:36,416] Trial 16 finished with value: 0.918154069970764 and parameters: {'min_leaf_purity': 0.7825088992567243, 'max_leaf_purity': 0.8969178061746931, 'top_pair_fraction': 0.06811269217167996, 'os_max_depth': 4, 'learning_rate': 0.27655817094133417, 'max_depth': 6, 'n_estimators': 231}. Best is trial 16 with value: 0.918154069970764.
[I 2025-09-06 02:43:50,045] Trial 1 finished with value: 0.9368002573894554 and parameters: {'min_leaf_purity': 0.7311274761942435, 'max_leaf_purity': 0.942104161525402, 'top_pair_fraction': 0.06753968887939749, 'os_max_depth': 10, 'learning_rate': 0.2009381272250482, 'max_depth': 5, 'n_estimators': 292}. Best is trial 1 with value: 0.9368002573894554.
[I 2025-09-06 02:44:04,080] Trial 40 finished with value: 0.9317915145411053 and parameters: {'min_leaf_purity': 0.7745675700273109, 'max_leaf_purity': 0.8947401929864661, 'top_pair_fraction': 0.09896480423287317, 'os_max_depth': 7, 'learning_rate': 0.2767528239098058, 'max_depth': 8, 'n_

    AUC: 0.9757
  Fold 2


[I 2025-09-06 03:22:30,243] Trial 16 finished with value: 0.907774560409585 and parameters: {'min_leaf_purity': 0.5814497247720355, 'max_leaf_purity': 0.8448176262816408, 'top_pair_fraction': 0.07986822690555753, 'os_max_depth': 4, 'learning_rate': 0.28690895480131096, 'max_depth': 9, 'n_estimators': 155}. Best is trial 16 with value: 0.907774560409585.
[I 2025-09-06 03:22:43,700] Trial 14 finished with value: 0.918796704295886 and parameters: {'min_leaf_purity': 0.6518734110795151, 'max_leaf_purity': 0.7723298256813546, 'top_pair_fraction': 0.0753700740680589, 'os_max_depth': 6, 'learning_rate': 0.1779716743636271, 'max_depth': 9, 'n_estimators': 216}. Best is trial 14 with value: 0.918796704295886.
[I 2025-09-06 03:22:46,913] Trial 36 finished with value: 0.9131324576496425 and parameters: {'min_leaf_purity': 0.7329196904876794, 'max_leaf_purity': 0.9498191087759535, 'top_pair_fraction': 0.05688583989537346, 'os_max_depth': 5, 'learning_rate': 0.2450175042799762, 'max_depth': 5, 'n_e

    AUC: 0.9771
  Fold 3


[I 2025-09-06 04:07:44,450] Trial 20 finished with value: 0.9467568928616391 and parameters: {'min_leaf_purity': 0.7518642467391976, 'max_leaf_purity': 0.8845610651346018, 'top_pair_fraction': 0.07391304314396308, 'os_max_depth': 7, 'learning_rate': 0.28138906980250644, 'max_depth': 10, 'n_estimators': 200}. Best is trial 20 with value: 0.9467568928616391.
[I 2025-09-06 04:07:56,667] Trial 39 finished with value: 0.9523288150293061 and parameters: {'min_leaf_purity': 0.6059539986590482, 'max_leaf_purity': 0.969486389324762, 'top_pair_fraction': 0.08128424006635437, 'os_max_depth': 7, 'learning_rate': 0.1656487170126496, 'max_depth': 3, 'n_estimators': 180}. Best is trial 39 with value: 0.9523288150293061.
[I 2025-09-06 04:08:02,925] Trial 23 finished with value: 0.9548442374137955 and parameters: {'min_leaf_purity': 0.5803914865684213, 'max_leaf_purity': 0.7339351165078658, 'top_pair_fraction': 0.07072327112536426, 'os_max_depth': 7, 'learning_rate': 0.24467201689776305, 'max_depth': 3

    AUC: 0.9304
  Fold 4


[I 2025-09-06 04:53:53,451] Trial 9 finished with value: 0.93177304964539 and parameters: {'min_leaf_purity': 0.7088270789449959, 'max_leaf_purity': 0.987631775135059, 'top_pair_fraction': 0.06408982909048873, 'os_max_depth': 4, 'learning_rate': 0.14727732510305086, 'max_depth': 9, 'n_estimators': 125}. Best is trial 9 with value: 0.93177304964539.
[I 2025-09-06 04:54:00,296] Trial 22 finished with value: 0.9414675395526461 and parameters: {'min_leaf_purity': 0.5089924981402086, 'max_leaf_purity': 0.8472826796182867, 'top_pair_fraction': 0.0762026808248183, 'os_max_depth': 7, 'learning_rate': 0.2969561611586669, 'max_depth': 5, 'n_estimators': 346}. Best is trial 22 with value: 0.9414675395526461.
[I 2025-09-06 04:54:01,807] Trial 18 finished with value: 0.940949263502455 and parameters: {'min_leaf_purity': 0.5870023120184045, 'max_leaf_purity': 0.842145281107864, 'top_pair_fraction': 0.06792669211756865, 'os_max_depth': 7, 'learning_rate': 0.11660501121451555, 'max_depth': 8, 'n_estim

    AUC: 0.8908
  Fold 5


[I 2025-09-06 05:31:42,625] Trial 11 finished with value: 0.9139007092198582 and parameters: {'min_leaf_purity': 0.7692792065367009, 'max_leaf_purity': 0.8892378710770608, 'top_pair_fraction': 0.05363689370453225, 'os_max_depth': 6, 'learning_rate': 0.29742444054526906, 'max_depth': 9, 'n_estimators': 165}. Best is trial 11 with value: 0.9139007092198582.
[I 2025-09-06 05:31:49,870] Trial 1 finished with value: 0.9231696672122205 and parameters: {'min_leaf_purity': 0.6939740345386799, 'max_leaf_purity': 0.8801971090169832, 'top_pair_fraction': 0.07421680034417136, 'os_max_depth': 7, 'learning_rate': 0.27041827234417287, 'max_depth': 5, 'n_estimators': 266}. Best is trial 1 with value: 0.9231696672122205.
[I 2025-09-06 05:32:04,890] Trial 26 finished with value: 0.9196617566830334 and parameters: {'min_leaf_purity': 0.7590060340298778, 'max_leaf_purity': 0.9577130914489426, 'top_pair_fraction': 0.09900005644832618, 'os_max_depth': 5, 'learning_rate': 0.2873601737624662, 'max_depth': 6, 

    AUC: 0.9260
File updated.
File uploaded.
