In [None]:
import deep_tabular_augmentation as dta
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold

In [None]:
# Single seed reused by the stochastic steps so the splits are reproducible.
random_state = 42

# The CSV holds the feature columns together with the "DR" target column.
raw_dataset = pd.read_csv("./data/processed_data_OHE.csv")
Y = raw_dataset["DR"].to_frame()
X = raw_dataset.drop(columns=["DR"])

# Hold out a stratified 10% as the final test set; the remaining 90% feeds the folds.
X_FOR_FOLDS, X_FINAL_TEST, Y_FOR_FOLDS, Y_FINAL_TEST = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=random_state,
)

# Re-join features and target of the fold portion into one frame with a fresh 0..n-1 index.
df = pd.concat([X_FOR_FOLDS, Y_FOR_FOLDS], axis=1).reset_index(drop=True)

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTENC
def Preprocessor(df, df_test, OD_majority=None, OD_minority=None, oversampler=None, scaler=None):
    """
    Preprocess one train/test pair: optional per-class outlier removal, optional
    oversampling of the training data, and optional scaling of the continuous columns.

    Parameters:
    df (DataFrame): Training data, including the target column 'DR'.
    df_test (DataFrame): Test data. It is copied, never modified in place.
    OD_majority (model or None): Outlier detector exposing ``fit_predict`` for the
        rows with DR == 0 (None to skip). Rows labelled 1 are kept.
    OD_minority (model or None): Same, for the rows with DR == 1.
    oversampler (sampler or None): Object exposing ``fit_resample(X, y)`` (None to
        skip). It is used exactly as passed in, so any configuration it needs
        (e.g. which features are categorical) must be done by the caller.
    scaler (scaler or None): Object exposing ``fit_transform`` / ``transform``
        (None to skip). It is fitted on the training data only and then applied
        to the test data.

    Returns:
    tuple: Processed (df_train, df_test). df_train has a fresh 0..n-1 index and the
        same column order as ``df``.
    """

    cont_cols = ['Age', 'UAlb', 'Ucr', 'UACR', 'TC', 'TG', 'TCTG',
                 'LDLC', 'HDLC', 'Scr', 'BUN', 'FPG', 'HbA1c',
                 'Height', 'Weight', 'BMI', 'Duration']
    y_col = 'DR'

    # Outlier detection is run separately per class (0 = majority, 1 = minority).
    cleaned_parts = []
    for label, detector in ((0, OD_majority), (1, OD_minority)):
        part = df[df[y_col] == label].copy()
        # Skip when no detector is given; an empty class cannot be fitted either.
        if detector is not None and not part.empty:
            inlier_mask = detector.fit_predict(part[cont_cols]) == 1
            part = part[inlier_mask]
        cleaned_parts.append(part)
    df_after_OD = pd.concat(cleaned_parts, ignore_index=True)

    if oversampler is not None:
        column_order = list(df_after_OD.columns)
        feature_cols = [col for col in column_order if col != y_col]
        # The target must not be handed to the sampler as a feature.
        X_resampled, y_resampled = oversampler.fit_resample(
            df_after_OD[feature_cols], df_after_OD[y_col]
        )
        # Rebuild with ALL feature columns (continuous and categorical); works for
        # both ndarray and DataFrame sampler outputs.
        resampled = pd.DataFrame(X_resampled, columns=feature_cols)
        # Assign positionally so a differing index on y_resampled cannot misalign.
        resampled[y_col] = pd.Series(y_resampled).to_numpy()
        df_after_OD = resampled[column_order].reset_index(drop=True)

    # Copy so the caller's test frame is left untouched.
    df_test = df_test.copy()
    if scaler is not None:
        # Fit on the training data only, then reuse the fitted scaler on the test data.
        df_after_OD[cont_cols] = scaler.fit_transform(df_after_OD[cont_cols])
        df_test[cont_cols] = scaler.transform(df_test[cont_cols])

    return df_after_OD, df_test

In [None]:
def FOLDS_GENERATOR(dataset, n_splits=5, random_state=None, oversampler=None, noise=None,
                     OD_majority=None, OD_minority=None, scaler=None):
    """
    Build stratified K-fold splits of ``dataset`` and preprocess every fold with
    ``Preprocessor``.

    Parameters:
    dataset (DataFrame): Data containing the feature columns and the target column 'DR'.
    n_splits (int): Number of folds.
    random_state (int or None): Seed for the shuffled StratifiedKFold.
    oversampler (sampler or None): Forwarded to ``Preprocessor``.
    noise: Accepted for interface compatibility; not used by this function.
    OD_majority (model or None): Forwarded to ``Preprocessor``.
    OD_minority (model or None): Forwarded to ``Preprocessor``.
    scaler (scaler or None): Forwarded to ``Preprocessor``.

    Returns:
    list: One tuple per fold, ``(X_train, X_test, y_train, y_test)``, where the X
        frames have 'DR' removed and the y frames are single-column 'DR' DataFrames.
    """
    target_col = "DR"
    kF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    kFolds_list = []

    # Work on a copy so the caller's frame is never touched.
    df = dataset.copy()
    X = df.drop(columns=[target_col])
    Y = df[[target_col]]

    for fold, (train_idx, test_idx) in enumerate(kF.split(X, Y[target_col])):
        # Re-join features and target for this fold; Preprocessor needs the target
        # column to separate the classes.
        train = pd.concat([X.iloc[train_idx], Y.iloc[train_idx]], axis=1)
        test = pd.concat([X.iloc[test_idx], Y.iloc[test_idx]], axis=1)

        # Every preprocessing option, including the oversampler, is forwarded.
        train_processed, test_processed = Preprocessor(train, test,
                                                       OD_majority=OD_majority,
                                                       OD_minority=OD_minority,
                                                       oversampler=oversampler,
                                                       scaler=scaler)

        kFolds_list.append((train_processed.drop(columns=[target_col]),
                            test_processed.drop(columns=[target_col]),
                            train_processed[[target_col]],
                            test_processed[[target_col]]))

        print(f"Fold: {fold+1}, Train: {train_processed.shape}, Test: {test_processed.shape}")
    return kFolds_list

In [None]:
# Configuration for this run: every optional preprocessing step is switched off.
# NOTE(review): confirm that Preprocessor accepts scaler=None before running.
oversampler = None
OD_majority = None
OD_minority = None
Scaler = None

# Build the folds from `df` (the portion left after the hold-out split), not from
# `raw_dataset`: the raw table still contains the X_FINAL_TEST / Y_FINAL_TEST rows,
# and cross-validating on them would leak the final test set into the folds.
kFolds_list = FOLDS_GENERATOR(
    df,
    n_splits=10,
    random_state=random_state,
    oversampler=oversampler,
    noise=None,
    OD_majority=OD_majority,
    OD_minority=OD_minority,
    scaler=Scaler,
)