In [None]:
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
    minmax_scale,
)
from sklearn.metrics import recall_score, accuracy_score,f1_score, precision_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
import warnings
import optuna
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score

In [None]:
# Seed used by the train/test split below.
random_state = 42

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# The processed CSV holds the feature columns together with the "DR" label column.
raw_dataset = pd.read_csv("./data/processed_data.csv")
X = raw_dataset.drop(columns=["DR"])
Y = raw_dataset["DR"].to_frame()  # the label is kept as a single-column DataFrame
# print(X.describe())
# X.drop(columns = ['Age', 'Gender', 'UAlb', 'Ucr', 'UACR', 'LDLC', 'HDLC'], inplace=True)

# [Age,Gender,UAlb,Ucr,UACR,TC,TG,TCTG,LDLC,HDLC,Scr,BUN,FPG,HbA1c,Height,Weight,BMI,Duration,DR,Community_baihe,Community_chonggu,Community_huaxin,Community_jinze,Community_liantang,Community_xianghuaqiao,Community_xujin,Community_yingpu,Community_zhaoxian,Community_zhujiajiao]
#* 90/10 stratified split: 90% is kept for the folds, 10% is held out as the final test set
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=random_state,
    stratify=Y,
)

In [None]:
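#! Reference code (not executed): scale only the numeric columns with RobustScaler and leave the binary columns untouched — still to be implemented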

# from sklearn.preprocessing import RobustScaler
# import pandas as pd

# # Let's assume your data is in a DataFrame called df

# # Step 1: Separate numeric and categorical columns
# numeric_columns = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG', 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c', 'Height', 'Weight', 'BMI', 'Duration']
# binary_columns = ['Gender', 'DR', 'Community_baihe', 'Community_chonggu', 'Community_huaxin', 'Community_jinze', 'Community_liantang', 'Community_xianghuaqiao', 'Community_xujin', 'Community_yingpu', 'Community_zhaoxian', 'Community_zhujiajiao']

# # Separate numeric features and binary/categorical features
# X_numeric = df[numeric_columns]
# X_binary = df[binary_columns]

# # Step 2: Apply RobustScaler to numeric features
# scaler = RobustScaler()
# X_numeric_scaled = scaler.fit_transform(X_numeric)

# # Step 3: Combine scaled numeric data with the original binary/categorical features
# X_scaled_df = pd.DataFrame(X_numeric_scaled, columns=numeric_columns)
# X_final = pd.concat([X_scaled_df, X_binary.reset_index(drop=True)], axis=1)

# # Now X_final has the numeric features scaled and binary/categorical features untouched


In [None]:
def augment_data_in_place(X, X_test, normalisation_method=None, noise_std=0.1, random_state=None):
    """Log-transform, jitter and scale the numeric columns of a train/test pair.

    When at least one of the known numeric columns is present in both frames,
    those columns are modified in place on ``X`` and ``X_test`` (the same
    objects are returned); every other column is left untouched. Callers should
    therefore pass independent frames (e.g. a ``.copy()`` of a slice).

    Parameters
    ----------
    X : pandas.DataFrame
        Training features. Receives log1p, Gaussian noise and the fitted scaler.
    X_test : pandas.DataFrame
        Held-out features. Receives log1p and the scaler fitted on ``X``; no
        noise is added.
    normalisation_method : scaler instance, optional
        Object exposing ``fit_transform`` and ``transform``. It is re-fitted on
        every call. ``None`` (default) uses a fresh ``MinMaxScaler()``.
    noise_std : float, default 0.1
        Standard deviation of the zero-mean Gaussian noise added to the
        log-transformed training columns.
    random_state : int or numpy.random.Generator, optional
        Seed for the noise. ``None`` (default) draws from NumPy's global RNG via
        ``np.random.normal``, so ``np.random.seed`` still controls it.

    Returns
    -------
    tuple
        ``(X, X_test)``. In the normal path these are the input DataFrames.
        If none of the numeric columns is present in both frames, the scaler is
        fitted on all columns instead and its raw outputs are returned; the
        inputs are not modified in that case.
    """
    scaler = MinMaxScaler() if normalisation_method is None else normalisation_method

    # NOTE(review): 'BMI' is not in this list, so it is neither log-transformed
    # nor scaled — confirm that this is intended.
    all_numerical_columns = [
        'Age', 'Height', 'Weight', 'Duration',
        'UAlb', 'Ucr', 'UACR', 'TC', 'TG',
        'TCTG', 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c'
    ]

    # Find which of those columns actually exist in both X and X_test
    existing_columns = [col for col in all_numerical_columns if col in X.columns and col in X_test.columns]

    if not existing_columns:
        print("No matching columns found for augmentation. Normalised data only.")
        X = scaler.fit_transform(X)
        X_test = scaler.transform(X_test)
        return X, X_test

    # 1. Log-transform. Cast to float first and replace the columns wholesale:
    #    writing floats into integer columns through ``.loc[:, cols] = ...`` is a
    #    deprecated silent upcast in recent pandas.
    X[existing_columns] = np.log1p(X[existing_columns].astype(float))
    X_test[existing_columns] = np.log1p(X_test[existing_columns].astype(float))

    # 2. Add Gaussian noise to training data only
    noise_shape = X[existing_columns].shape
    if random_state is None:
        noise = np.random.normal(0, noise_std, noise_shape)
    else:
        noise = np.random.default_rng(random_state).normal(0, noise_std, noise_shape)
    X[existing_columns] = X[existing_columns] + noise

    # 3. Fit scaler on train, transform both. The scaler output is wrapped with
    #    the original index so the column assignment lines up row by row.
    train_scaled = np.asarray(scaler.fit_transform(X[existing_columns]))
    test_scaled = np.asarray(scaler.transform(X_test[existing_columns]))
    X[existing_columns] = pd.DataFrame(train_scaled, index=X.index, columns=existing_columns)
    X_test[existing_columns] = pd.DataFrame(test_scaled, index=X_test.index, columns=existing_columns)

    return X, X_test


In [None]:
def FOLDS_GENERATOR(X, Y, normalisation_method=None, n_splits=5, random_state=None, oversampler=None, contamination=0.05):
    """
    Build stratified K-fold train/test splits with per-fold preprocessing.

    For every fold the training part is (optionally) cleaned of class-0 outliers
    with an Isolation Forest, passed together with the test part through
    ``augment_data_in_place``, optionally oversampled, and finally its
    ``Community*`` and ``Gender`` columns are snapped back to 0/1 values.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    Y : pandas.DataFrame
        Labels; the label is read from the first column (``Y.iloc[:, 0]``).
    normalisation_method : scaler instance, optional
        e.g. ``MinMaxScaler()``, ``MaxAbsScaler()`` or
        ``QuantileTransformer(output_distribution='uniform')``.
        ``None`` (default) uses a fresh ``MinMaxScaler()``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, optional
        Seeds the fold shuffling and the Isolation Forest. It is not forwarded
        to ``augment_data_in_place``.
    oversampler : object with ``fit_resample``, optional
        Applied to the processed training data of each fold when truthy.
    contamination : float or None, default 0.05
        Isolation Forest contamination for class-0 training rows. ``None`` or a
        value ``<= 0`` skips outlier removal.

    Returns
    -------
    list of tuple
        One ``(X_train_scaled, X_test_scaled, Y_train, Y_test)`` tuple per fold.
    """
    if normalisation_method is None:
        normalisation_method = MinMaxScaler()

    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kFolds_list = []

    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y)):
        # Split the data into training and testing sets for this fold.
        X_train = X.iloc[train_idx]
        # augment_data_in_place assigns into the frames it receives, so give it
        # an independent copy rather than a slice of the caller's data.
        X_test = X.iloc[test_idx].copy()
        Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
        X_train_cleaned, Y_train_cleaned = X_train.copy(), Y_train.copy()

        if contamination is not None and contamination > 0:
            labels = Y_train.iloc[:, 0]
            X_train_zeros = X_train[labels == 0]
            X_train_ones = X_train[labels == 1]
            Y_train_zeros = Y_train[labels == 0]
            Y_train_ones = Y_train[labels == 1]

            if X_train_zeros.isna().any().any():
                print("got NaN values in the training set")

            # Apply Isolation Forest to the class-0 rows only.
            iso_forest = IsolationForest(contamination=contamination, random_state=random_state)
            try:
                outliers = iso_forest.fit_predict(X_train_zeros)
            except UserWarning as e:
                # Only reachable when warnings are escalated to exceptions
                # (e.g. warnings.simplefilter("error")); by default a warning is
                # reported and the fit result above is used.
                print("Caught warning during IsolationForest fitting:", e)
                outliers = np.ones(len(X_train_zeros))  # keep all data

            # Keep only the class-0 samples flagged as inliers (+1).
            inlier_mask = outliers == 1
            X_train_zeros = X_train_zeros[inlier_mask]
            Y_train_zeros = Y_train_zeros[inlier_mask]

            # Combine the cleaned class-0 rows with the untouched class-1 rows.
            X_train_cleaned = pd.concat([X_train_zeros, X_train_ones])
            Y_train_cleaned = pd.concat([Y_train_zeros, Y_train_ones])

        # Log/noise/scale the remaining training data; the test fold only gets
        # the log transform and the scaler fitted on the training data.
        X_train_scaled, X_test_scaled = augment_data_in_place(X_train_cleaned, X_test, normalisation_method=normalisation_method)

        # Oversample using X_train_scaled and Y_train_cleaned, because the
        # labels are not changed by the augmentation step.
        if oversampler:
            X_train_scaled, Y_train_cleaned = oversampler.fit_resample(X_train_scaled, Y_train_cleaned)

        # Convert scaled data back to DataFrame with the correct column names
        X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_cleaned.columns)
        X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

        # Re-impose a strict one-hot encoding on the Community* training columns:
        # the largest entry of each row becomes 1.0, every other entry 0.0
        # (presumably needed because oversampled rows can carry fractional values).
        # NOTE(review): argmax picks the first column on ties, so a row whose
        # community columns are all equal is assigned to the first community.
        community_cols = [col for col in X_train_scaled.columns if col.startswith('Community')]
        if community_cols:
            winners = X_train_scaled[community_cols].to_numpy().argmax(axis=1)
            X_train_scaled[community_cols] = pd.DataFrame(
                np.eye(len(community_cols))[winners],
                index=X_train_scaled.index,
                columns=community_cols,
            )

        # Ensure 'Gender' is still binary (0 or 1)
        if 'Gender' in X_train_scaled.columns: #! robust scaler will not work for this
            X_train_scaled['Gender'] = (X_train_scaled['Gender'] > 0.5).astype(int)
            X_test_scaled['Gender'] = (X_test_scaled['Gender'] > 0.5).astype(int)

        # Append the processed fold to the list
        kFolds_list.append((X_train_scaled, X_test_scaled, Y_train_cleaned, Y_test))

        print(f"Fold: {fold+1}, Train: {X_train_scaled.shape}, Test: {X_test_scaled.shape}")

    return kFolds_list

In [None]:
# oversampler = None
# contamination = 0.05
# normalisation_method = QuantileTransformer()
# kFolds = FOLDS_GENERATOR(X_FOR_FOLDS, Y_FOR_FOLDS, 
#                          normalisation_method = normalisation_method, 
#                          n_splits=5, 
#                          oversampler = oversampler, random_state=42, contamination=contamination)