# Installing Libraries

In [23]:
!pip install pandas
!pip install optuna
!pip install optuna-dashboard
!pip install scikit
!pip install imbalanced-learn
!pip install seaborn
!pip install matplotlib
!pip install scipy

from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.metrics import recall_score, accuracy_score,f1_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
import warnings
import optuna
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score
from scipy.stats import ks_2samp
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.spatial import distance

[31mERROR: Could not find a version that satisfies the requirement scikit (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scikit[0m[31m


# Step 1
1. device selection (quite self explanatory ya)
2. init_weights initializes weights of nn.Linear using Xavier uniform distribution and sets biases to zero
3. fold_to_dataloader_tensor converts tabular train and test data (from Pandas) into PyTorch DataLoaders with tensors on the right device.
4. get_feature_count returns the number of input features from a DataLoader batch.
5. criterion_mapping selects the appropriate loss function (BCE, Dice, or Focal Loss) based on user input.

In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using", device)
def init_weights(model): #tested already
    if isinstance(model, nn.Linear):  # Apply only to linear layers
        nn.init.xavier_uniform_(model.weight)
        if model.bias is not None:
            nn.init.zeros_(model.bias)
            
def fold_to_dataloader_tensor(train_x, test_x, train_y, test_y, batch_size=64, device=device):
    train_dataset = TensorDataset(
        torch.tensor(train_x.values,dtype=torch.float32).to(device), 
        torch.tensor(train_y.values,dtype=torch.float32).to(device))
    val_dataset = TensorDataset(
        torch.tensor(test_x.values,dtype=torch.float32).to(device), 
        torch.tensor(test_y.values,dtype=torch.float32).to(device))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False, drop_last=True)
    return train_loader, val_loader 

def get_feature_count(loader):
    """returns the number of features in the dataset"""
    return next(iter(loader))[0].shape[1]

from Criterion_Models import *
def criterion_mapping(criterion_choice:str, pos_weight:float=None, alpha:float=None, gamma:float=None):
    """
    Feel free to add any custom loss functions here.
    returns function for criterion
    """
    if criterion_choice == "FocalLoss":
        return FocalLoss(alpha =alpha, gamma=gamma)
    elif criterion_choice == "DiceLoss":
        return DiceLoss()
    elif criterion_choice == "BCEWithLogitsLoss":
        return nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight])) if pos_weight else nn.BCEWithLogitsLoss()
    return nn.BCEWithLogitsLoss() 

NameError: name 'torch' is not defined

In [25]:
random_state = 42
raw_dataset = pd.read_csv("./data/processed_data.csv") #data has X and Y
X = raw_dataset.drop(columns=["DR"])
Y = pd.DataFrame(raw_dataset["DR"])

# clinical_units = X.copy()

In [26]:
#drop cols here
# numeric_columns = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
# binary_columns = ['Gender', 'DR', 'Community_baihe', 'Community_chonggu', 'Community_huaxin', 'Community_jinze', 'Community_liantang', 'Community_xianghuaqiao', 'Community_xujin', 'Community_yingpu', 'Community_zhaoxian', 'Community_zhujiajiao']

In [27]:
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(X, Y, test_size=0.1, random_state=random_state, stratify=Y)

# Step 2: Tabular data preprocessing and augmentation
(Modified to use P() as sent by lz

The new augment_data_in_place prepares input features for model training by:

1. Identifying numerical columns to transform (e.g. lab values, BMI, etc.).
2. Log-transforming these columns to reduce skew.
3. Adding Gaussian noise to only the negative class (DR=0) if noise is set.
4. Returning transformed copies of X and X_test.

In [28]:
def augment_data_in_place(X, X_test, Y=None, normalisation_method=MinMaxScaler(), noise=None):
    all_numerical_columns = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
    binary_columns = ['Gender', 'DR', 'Community_baihe', 'Community_chonggu', 'Community_huaxin', 'Community_jinze', 'Community_liantang', 'Community_xianghuaqiao', 'Community_xujin', 'Community_yingpu', 'Community_zhaoxian', 'Community_zhujiajiao']
    
    existing_columns = [col for col in all_numerical_columns if col in X.columns and col in X_test.columns]

    if not existing_columns:
        print("No matching columns found for augmentation. Normalised data only.")
        X = normalisation_method.fit_transform(X)
        X_test = normalisation_method.transform(X_test)
        return X, X_test

    X_copy = X.copy()
    X_test_copy = X_test.copy()
    
    # Log-transform
    X_copy.loc[:, existing_columns] = X_copy.loc[:, existing_columns].apply(np.log1p)
    X_test_copy.loc[:, existing_columns] = X_test_copy.loc[:, existing_columns].apply(np.log1p)

    # Add noise ONLY to negatives (class 0) if Y is provided and noise is set
    if noise and noise > 0:
        if Y is None:
            raise ValueError("Y must be provided if noise is being added selectively.")
        # Identify negative class indices (class 0)
        negative_indices = Y[Y.iloc[:, 0] == 0].index
        noise_matrix = np.random.normal(0, noise, X_copy.loc[negative_indices, existing_columns].shape)
        X_copy.loc[negative_indices, existing_columns] += noise_matrix

    # Scale
    # scaler = normalisation_method
    # X_copy.loc[:, existing_columns] = scaler.fit_transform(X_copy.loc[:, existing_columns])
    # X_test_copy.loc[:, existing_columns] = scaler.transform(X_test_copy.loc[:, existing_columns])

    return X_copy, X_test_copy


# def iso_forest(X_train, Y_train, contamination=None, random_state=42):
#     # print("Original\n", X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
#     X_train_cleaned, Y_train_cleaned = X_train.copy(), Y_train.copy()
    
#     X_train_zeros = X_train[Y_train.iloc[:, 0] == 0]
#     X_train_ones = X_train[Y_train.iloc[:, 0] == 1]
#     Y_train_zeros = Y_train[Y_train.iloc[:, 0] == 0]
#     Y_train_ones = Y_train[Y_train.iloc[:, 0] == 1] 
#     # print("Ones and zeros\n", X_train_zeros.shape, Y_train_zeros.shape, X_train_ones.shape, Y_train_ones.shape)
#     #only class 0s
#     if X_train_zeros.isna().any().any():
#         print("got NaN values in the training set")
    
#     # Apply Isolation Forest to majority class only
#     iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
#     try:
#         outliers = iso_forest.fit_predict(X_train_zeros)
#     except UserWarning as e:
#         print("Caught warning during IsolationForest fitting:", e)
#         outliers = np.ones(len(X_train_zeros))  # If warning occurs, keep all data
#     # Keep only non-outlier majority samples
#     X_train_zeros = X_train_zeros[outliers == 1]
#     Y_train_zeros = Y_train_zeros[outliers == 1]
#     # print("After iso:\n", X_train_zeros.shape, Y_train_zeros.shape, X_train_ones.shape, Y_train_ones.shape)
    
#     # Combine the cleaned majority class with the untouched minority class
#     X_train_cleaned = pd.concat([X_train_zeros, X_train_ones])
#     Y_train_cleaned = pd.concat([Y_train_zeros, Y_train_ones])
#     return X_train_cleaned, Y_train_cleaned


# Step 3: Quantitative Check on Augmentation

analyze_augmentation_impact evaluates how data augmentation and preprocessing steps affect:

1. Class Distribution:
   Compares the number of samples in class 0 and 1 before and after augmentation/oversampling.

2. Feature Distribution Shifts:
   For all shared, non-constant numerical columns, it compares:
       - Means and standard deviations
       - Kolmogorov–Smirnov test statistics to detect distributional drift

3. Returns a summary dictionary with class counts and feature-wise stats.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

def analyze_augmentation_impact(original_X, original_y, processed_fold):
    """
    Analyze the effect of augmentation on class distribution and feature statistics.
    
    Parameters:
        original_X: DataFrame of raw features before any processing.
        original_y: Series of raw target labels.
        processed_fold: Tuple (X_train, X_test, y_train, y_test) after processing.
        clinical_units: Optional raw (unnormalized) data for clinical comparison.
        
    Returns:
        Dictionary with summary statistics and drift measures.
    """
    X_train_fold, _, y_train_fold, _ = processed_fold
    
    print("\n=== Class Distribution ===")
    class_counts = {
        "original_0": (original_y == 0).sum(),
        "original_1": (original_y == 1).sum(),
        "processed_0": (y_train_fold == 0).sum(),
        "processed_1": (y_train_fold == 1).sum(),
    }
    print(f"Original: 0={class_counts['original_0']} | 1={class_counts['original_1']}")
    print(f"Processed: 0={class_counts['processed_0']} | 1={class_counts['processed_1']}")
    
    # Compare feature distributions for numerical columns
    # num_cols = ['Age', 'UAlb', 'Ucr', 'TCTG', 'HDLC', 'BUN', 'FPG', 'Weight', 'UACR', 'BMI', 'HbA1c']  # Key numerical features to compare
    # num_cols = original_X.columns.intersection(X_train_fold.columns)

    shared_cols = original_X.columns.intersection(X_train_fold.columns)
    combined = pd.concat([original_X[shared_cols], X_train_fold[shared_cols]])
    non_constant_cols = combined.loc[:, combined.nunique() > 1].columns
    num_cols = non_constant_cols
    
    stats_summary = {}
    print("\n=== Feature Statistics (Normalized Units) ===")
    for col in num_cols:
        orig = original_X[col]
        proc = X_train_fold[col]
        ks_stat, ks_p = ks_2samp(orig, proc)
        print(f"{col}: {orig.mean():.2f}±{orig.std():.2f} → {proc.mean():.2f}±{proc.std():.2f} | KS={ks_stat:.3f}, p={ks_p:.3f}")
        stats_summary[col] = {
            "mean_orig": orig.mean(), "std_orig": orig.std(),
            "mean_proc": proc.mean(), "std_proc": proc.std(),
            "ks_stat": ks_stat, "ks_p": ks_p
        }

    return {
        "class_counts": class_counts,
        "feature_stats": stats_summary
    }

# Step 4: Visual and Quantitative check on synthetic data quality

evaluate_augmentation_quality assesses how similar or different synthetic samples are from original minority samples, focusing on distribution, spread, and outliers.

1. PCA visualization:
   - Projects both original and synthtic minority class into 2D using PCA
   - Plots them to visually inspect overlap or drift
2. Data cleaning:
   - Drops constant columns
   - Imputes missing values (mean or zero-fill) before PCA
3. Outlier detection in PCA space:
   - Uses Mahalanobis distance to identify synthetic samples that are unusually far from the original minority clustrer.
   - Computes the outlier rate among synthethic samples
  
It returns a dictionary with:
- Number of original vs synthethic minority samples
- Total PCA variance explained (how much info PCA retaiend)
- Number and proportion of outlier synthetic samples
- n_neighbors info for SMOTE/ ADASYN

In [30]:
def evaluate_augmentation_quality(original_X, original_y, processed_fold, n_neighbors=None):
    """
    Evaluate how synthetic samples differ from original ones via PCA and metrics.
    
    Parameters:
        original_X: DataFrame of raw features.
        original_y: Series of raw targets.
        processed_fold: Tuple (X_train, X_test, y_train, y_test) from processed data.
        n_neighbors: Optional SMOTE/ADASYN config parameter for logging.
        
    Returns:
        Dictionary of quality metrics.
    """
    X_train_fold, _, y_train_fold, _ = processed_fold
    
    # Extract original vs. synthetic minority samples
    minority_mask = original_y == 1
    original_minority = original_X[minority_mask]
    processed_minority = X_train_fold[y_train_fold == 1]
    
    # PCA projection
    pca = PCA(n_components=2)
    combined = pd.concat([original_minority, processed_minority])

    # Combine both for shared column filtering
    combined_df = pd.concat([original_minority, processed_minority])
    
    # Keep only columns that vary (non-constant across the combined set)
    non_constant_cols = combined_df.loc[:, combined_df.nunique() > 1].columns
    
    # Apply same column filtering to both datasets
    original_minority = original_minority[non_constant_cols]
    processed_minority = processed_minority[non_constant_cols]

    # Using imputer to handle NaN columns
    fully_nan_cols = combined.columns[combined.isna().all()].tolist()
    other_cols = [col for col in combined.columns if col not in fully_nan_cols]
    transformer = ColumnTransformer(transformers=[
        ('mean_imputer', SimpleImputer(strategy='mean'), other_cols),
        ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0), fully_nan_cols)
    ])
    
    combined_imputed_array = transformer.fit_transform(combined)
    combined_imputed = pd.DataFrame(combined_imputed_array, columns=other_cols + fully_nan_cols)
    pca_result = pca.fit_transform(combined_imputed)

    original_pca = pca_result[:len(original_minority)]
    synthetic_pca = pca_result[len(original_minority):]

    # PCA plot
    plt.figure(figsize=(10,6))
    plt.scatter(original_pca[:,0], original_pca[:,1], alpha=0.4, label='Original Minority', color='blue')
    plt.scatter(synthetic_pca[:,0], synthetic_pca[:,1], alpha=0.4, label='Synthetic Minority', color='orange')
    plt.title("PCA of Minority Class Samples")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Outlier detection using Mahalanobis distance in PCA space
    original_mean = original_pca.mean(axis=0)
    original_cov = np.cov(original_pca, rowvar=False)
    inv_covmat = np.linalg.pinv(original_cov)

    def is_outlier(point):
        return distance.mahalanobis(point, original_mean, inv_covmat) > 2.5

    synthetic_outliers = [is_outlier(pt) for pt in synthetic_pca]
    num_outliers = int(np.sum(synthetic_outliers))

    return {
        "original_minority_count": len(original_minority),
        "processed_minority_count": len(processed_minority),
        "pca_variance_ratio": pca.explained_variance_ratio_.sum(),
        "synthetic_outliers": num_outliers,
        "outlier_rate": num_outliers / len(processed_minority),
        "n_neighbors": n_neighbors
    }


# Step 5: Outlier filtering and feature scaling pipeline
P performs two key operations for basic preprocessing:
1. Outlier detection
   - Splits the data by class (DR = 0 vs 1)
   - Optionally removes outliers separately for majority and minority classes using outlier detection model
2. Feature scaling
   - Applies a scaler only on continuous columns
   - Ensures the same scaer is used on both training and test sets for consistency

In [31]:
def P(df, df_test, OD_majority, OD_minority, scaler):
    """
    Preprocess the data by applying optional outlier detection and mandatory scaling.

    Parameters:
    df (DataFrame): Training data.
    df_test (DataFrame): Test data.
    OD_majority (model or None): Outlier detection model for majority class (or None to skip).
    OD_minority (model or None): Outlier detection model for minority class (or None to skip).
    scaler (scaler): A fitted scaler instance (like StandardScaler).

    Returns:
    tuple: Processed (df_train, df_test)
    """
    
    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 
                 'Height', 'Weight', 'BMI', 'Duration']
    
    y_col = 'DR'
    
    # Split into classes
    df_majority = df[df[y_col] == 0].copy()
    df_minority = df[df[y_col] == 1].copy()

    # Apply outlier detection to majority class if model is given
    if OD_majority is not None:
        outliers_majority = OD_majority.fit_predict(df_majority[cont_cols])
        df_majority = df_majority[outliers_majority == 1]

    # Apply outlier detection to minority class if model is given
    if OD_minority is not None:
        outliers_minority = OD_minority.fit_predict(df_minority[cont_cols])
        df_minority = df_minority[outliers_minority == 1]

    # Combine cleaned data
    df_after_OD = pd.concat([df_majority, df_minority], ignore_index=True)

    # Apply scaling
    df_after_OD[cont_cols] = scaler.fit_transform(df_after_OD[cont_cols])
    df_test[cont_cols] = scaler.transform(df_test[cont_cols])

    return df_after_OD, df_test

# Step 6: Folds generation

In [34]:
def FOLDS_GENERATOR(X, Y, normalisation_method=MinMaxScaler(), n_splits=5, random_state=None, oversampler=None, contamination=0.05, noise = None):
    """
    Generates stratified folds with specified normalization.
    normalisation_method should be an instance of a scaler, e.g.,
    - MinMaxScaler()
    Returns a list of tuples, each containing:
    (X_train_scaled, X_test_scaled, Y_train, Y_test), representing data for each fold
    """
    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kFolds_list = []

    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y)):
        # Split the data into training and testing sets for this fold
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]

        # Merge X and Y into DataFrames for use with P()
        df_train = X_train.copy()
        df_train["DR"] = Y_train.values
        df_test = X_test.copy()
        df_test["DR"] = Y_test.values
        # print("Original\n", X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
        
        # if contamination is not None and contamination > 0: #? using contamination = 0.0 works
        #     X_train_cleaned, Y_train_cleaned = iso_forest(X_train, Y_train, contamination=contamination, random_state=random_state)

        # Handle contamination (outlier detection) only if contamination > 0

        # Define outlier detectors only if contamination > 0
        # iso_majority = IsolationForest(contamination=contamination, random_state=random_state) if contamination else None
        # iso_minority = IsolationForest(contamination=contamination, random_state=random_state) if contamination else None

        if contamination:
            if contamination >0:
                OD_majority = IsolationForest(contamination=contamination, random_state=random_state)
                OD_minority = IsolationForest(contamination=contamination, random_state=random_state)
            else:
                OD_majority = None
                OD_minority = None
            X_train_cleaned, Y_train_cleaned = X_train, Y_train
        else:
            OD_majority = None
            OD_minority = None
            X_train_cleaned, Y_train_cleaned = X_train, Y_train
            
        
        #? data augmentation on leftover data
        df_train_processed, df_test_processed = P(df_train, df_test, OD_majority, OD_minority, normalisation_method)

        # Split back into X and Y
        X_train_scaled = df_train_processed.drop(columns=["DR"])
        Y_train_cleaned = df_train_processed[["DR"]]
        X_test_scaled = df_test_processed.drop(columns=["DR"])
        # Note: do NOT clean Y_test — it is untouched ground truth
        Y_test = df_test_processed[["DR"]]

         # Optional: Add Gaussian noise to class 0
        if noise and noise > 0:
            neg_idx = Y_train_cleaned[Y_train_cleaned["DR"] == 0].index
            noise_matrix = np.random.normal(0, noise, X_train_scaled.loc[neg_idx].shape)
            X_train_scaled.loc[neg_idx] += noise_matrix
        
        # Handle oversampling if needed
        #! use X_train_scaled and Y_train_cleaned for oversampling becasue y_train_cleaned no changes after augmentation
        print("Before oversampling class distribution:")
        print(Y_train_cleaned.value_counts())
        if oversampler:
            X_train_scaled, Y_train_cleaned = oversampler.fit_resample(X_train_scaled, Y_train_cleaned)
        print("\nAfter oversampling class distribution:")
        print(Y_train_cleaned.value_counts())
        
        # Convert scaled data back to DataFrame with the correct column names
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_cleaned.columns)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

        # Handle community columns
        community_cols = [col for col in X_train_scaled.columns if col.startswith('Community')]
        if community_cols:
            X_train_scaled[community_cols] = X_train_scaled[community_cols].apply(
                lambda row: pd.Series(np.eye(len(row))[row.argmax()]), axis=1
            ).set_axis(community_cols, axis=1)
        # print(X_train_scaled[community_cols].describe())

        # Ensure 'Gender' is still binary (0 or 1)
        if 'Gender' in X_train_scaled.columns:
            X_train_scaled['Gender'] = (X_train_scaled['Gender'] > 0.5).astype(int)
            X_test_scaled['Gender'] = (X_test_scaled['Gender'] > 0.5).astype(int)

        # Append the processed fold to the list
        kFolds_list.append((X_train_scaled, X_test_scaled, Y_train_cleaned, Y_test))

        print(f"Fold: {fold+1}, Train: {X_train_scaled.shape}, Test: {X_test_scaled.shape}")

    return kFolds_list

In [35]:
oversampler = ADASYN(sampling_strategy='minority', n_neighbors=5, random_state=42)
contamination = 0.05
normalisation_method = MinMaxScaler()
kFolds = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS, 
                         normalisation_method = normalisation_method, 
                         n_splits=5, 
                         oversampler = oversampler, random_state=42, contamination=contamination, noise = None)

Before oversampling class distribution:
DR 
0.0    3922
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3922
1.0    3846
Name: count, dtype: int64
Fold: 1, Train: (7768, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3922
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3922
1.0    3841
Name: count, dtype: int64
Fold: 2, Train: (7763, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3923
1.0    3843
Name: count, dtype: int64
Fold: 3, Train: (7766, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3923
1.0    3866
Name: count, dtype: int64
Fold: 4, Train: (7789, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440


In [36]:
for list in kFolds:
    print(list[0].shape, list[1].shape, list[2].shape, list[3].shape)
    print(list[0].describe())
    a = list[0]
    break

(7768, 28) (1149, 28) (7768, 1) (1149, 1)
               Age       Gender         UAlb          Ucr         UACR  \
count  7768.000000  7768.000000  7768.000000  7768.000000  7768.000000   
mean      0.490829     0.535917     0.037902     0.181086     0.025058   
std       0.125350     0.498740     0.074946     0.248951     0.053499   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.410714     0.000000     0.005620     0.000225     0.003719   
50%       0.500000     1.000000     0.012574     0.000487     0.008170   
75%       0.571429     1.000000     0.035340     0.337940     0.020753   
max       1.000000     1.000000     1.000000     1.000000     1.000000   

                TC           TG         TCTG         LDLC         HDLC  ...  \
count  7768.000000  7768.000000  7768.000000  7768.000000  7768.000000  ...   
mean      0.279667     0.096811     0.185084     0.378918     0.327945  ...   
std       0.086005     0.074324     0.112545     0.113

In [37]:
oversampler = ADASYN(sampling_strategy='minority', n_neighbors=5, random_state=42)
contamination = 0.05
normalisation_method = MinMaxScaler()
kFolds2 = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS, 
                         normalisation_method = normalisation_method, 
                         n_splits=5, 
                         oversampler = oversampler, random_state=42, contamination=contamination, noise = 0.2)

Before oversampling class distribution:
DR 
0.0    3922
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3922
1.0    3921
Name: count, dtype: int64
Fold: 1, Train: (7843, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3922
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3922
1.0    3922
Name: count, dtype: int64
Fold: 2, Train: (7844, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
1.0    3925
0.0    3923
Name: count, dtype: int64
Fold: 3, Train: (7848, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3923
1.0    3920
Name: count, dtype: int64
Fold: 4, Train: (7843, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3923
1.0     440


In [38]:
for list in kFolds2:
    print(list[0].shape, list[1].shape, list[2].shape, list[3].shape)
    print(list[0].describe())
    break

(7843, 28) (1149, 28) (7843, 1) (1149, 1)
               Age       Gender         UAlb          Ucr         UACR  \
count  7843.000000  7843.000000  7843.000000  7843.000000  7843.000000   
mean      0.483317     0.503124     0.087809     0.265081     0.061610   
std       0.186747     0.500022     0.212319     0.300335     0.192249   
min      -0.356629     0.000000    -0.629280    -0.591299    -0.640594   
25%       0.374601     0.000000     0.001902     0.000435     0.001461   
50%       0.481734     1.000000     0.030153     0.249329     0.021141   
75%       0.573698     1.000000     0.194107     0.489366     0.153017   
max       1.300583     1.000000     1.165687     1.508710     1.000000   

                TC           TG         TCTG         LDLC         HDLC  ...  \
count  7843.000000  7843.000000  7843.000000  7843.000000  7843.000000  ...   
mean      0.275731     0.097300     0.174943     0.375491     0.324194  ...   
std       0.164283     0.156821     0.175114     0.177

In [42]:
# =============================================
# PARAMETER EXPERIMENTATION
# =============================================
# 1. First capture the original data distribution
original_X = X_FOR_FOLDS.copy()
original_y = Y_FOR_FOLDS.copy()

# 2. Generate folds as you normally would
kFolds_without_adasyn = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS,
                        normalisation_method=RobustScaler(),
                        n_splits=5,
                        oversampler=None,
                        contamination=0.2,
                        noise=None)

oversampler = ADASYN(sampling_strategy='minority', n_neighbors=10, random_state=42)
kFolds_with_adasyn = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS,
                        normalisation_method=RobustScaler(),
                        n_splits=5,
                        oversampler=None,
                        contamination=0.2,
                        noise=None)

# 3. Analyze the first fold
print("\n" + "="*50)
print("ANALYSIS OF FIRST FOLD (no adasyn)")
print("="*50)
analyze_augmentation_impact(original_X, original_y, kFolds_without_adasyn[0])
# quality_metrics = evaluate_augmentation_quality(original_X, original_y, kFolds_without_adasyn[0])

print("\n" + "="*50)
print("ANALYSIS OF FIRST FOLD (with adasyn)")
print("="*50)
# quality_metrics = evaluate_augmentation_quality(original_X, original_y, kFolds_with_adasyn[0])
analyze_augmentation_impact(original_X, original_y, kFolds_with_adasyn[0])

# print("\nQuality Metrics:")
# # for k, v in quality_metrics.items():
# #     print(f"{k}: {v}")
# quality_metrics = evaluate_augmentation_quality(original_X, original_y, (X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST))

Before oversampling class distribution:
DR 
0.0    3303
1.0     371
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3303
1.0     371
Name: count, dtype: int64
Fold: 1, Train: (3674, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3303
1.0     371
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3303
1.0     371
Name: count, dtype: int64
Fold: 2, Train: (3674, 28), Test: (1149, 28)
Before oversampling class distribution:
DR 
0.0    3304
1.0     371
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3304
1.0     371
Name: count, dtype: int64
Fold: 3, Train: (3675, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3304
1.0     371
Name: count, dtype: int64

After oversampling class distribution:
DR 
0.0    3304
1.0     371
Name: count, dtype: int64
Fold: 4, Train: (3675, 28), Test: (1148, 28)
Before oversampling class distribution:
DR 
0.0    3304
1.0     371


{'class_counts': {'original_0': DR    5162
  dtype: int64,
  'original_1': DR    580
  dtype: int64,
  'processed_0': DR    3303
  dtype: int64,
  'processed_1': DR    371
  dtype: int64},
 'feature_stats': {'Age': {'mean_orig': 63.8007662835249,
   'std_orig': 7.483781181651379,
   'mean_proc': -0.028007621121393583,
   'std_proc': 0.7039014638695024,
   'ks_stat': 1.0,
   'ks_p': 0.0},
  'Gender': {'mean_orig': 0.5499825844653431,
   'std_orig': 0.49753879497959264,
   'mean_proc': 0.5639629831246598,
   'std_proc': 0.4959593603989724,
   'ks_stat': 0.013980398659316686,
   'ks_p': 0.7665395597412806},
  'UAlb': {'mean_orig': 49.82375478927203,
   'std_orig': 124.38322984308897,
   'mean_proc': 0.6805222240344215,
   'std_proc': 1.9620624568463496,
   'ks_stat': 0.7919521458650098,
   'ks_p': 0.0},
  'Ucr': {'mean_orig': 4873.045280390108,
   'std_orig': 6063.166596062857,
   'mean_proc': 0.5145554128711642,
   'std_proc': 0.6600181606642664,
   'ks_stat': 0.9731800766283525,
   'ks_