In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import FastICA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline

from skfeature.function.information_theoretical_based import MRMR


In [2]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Read the data
df = pd.read_csv("diabetes_prediction_dataset.csv")

# Handle duplicates
df = df.drop_duplicates()

# Remove unnecessary values [0.00195%]
df = df[df['gender'] != 'Other']

In [3]:
# Recategorize smoking history
def recategorize_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    elif smoking_status == 'current':
        return 'current'
    elif smoking_status in ['ever', 'former', 'not current']:
        return 'past_smoker'

df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)


In [4]:
# One-hot encoding
def perform_one_hot_encoding(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df.drop(column_name, axis=1), dummies], axis=1)
    return df

data = df.copy()
data = perform_one_hot_encoding(data, 'gender')
data = perform_one_hot_encoding(data, 'smoking_history')


In [5]:
# Ensure that 'diabetes' column is present in the data
if 'diabetes' in data.columns:
    # Resampling
    over = SMOTE(sampling_strategy=0.1)
    under = RandomUnderSampler(sampling_strategy=0.5)

    # Perform MRMR feature selection
    X = data.drop('diabetes', axis=1)
    y = data['diabetes']
    feat_idx = MRMR.mrmr(X.values, y.values, n_selected_features=10)
    selected_features = X.columns[feat_idx]

    # Include the target variable 'diabetes' in the selected features
    selected_features = ['diabetes'] + selected_features.tolist()

    # Filter the data to include only selected features
    data = data[selected_features]

    # Data Preprocessing with ICA
    num_components = min(10, data.shape[1])
    ica = FastICA(n_components=num_components, random_state=42)
    X_ica = ica.fit_transform(data.drop('diabetes', axis=1))

    # Model Pipeline with AdaBoost after ICA
    base_classifier = DecisionTreeClassifier(max_depth=1)
    clf = imbPipeline(steps=[
        ('ica', FastICA(n_components=num_components, random_state=42)),
        ('over', over),
        ('under', under),
        ('classifier', AdaBoostClassifier(
            base_estimator=base_classifier,
            n_estimators=150,
            learning_rate=0.1,
            algorithm='SAMME.R'
        ))
    ])

    # Define the hyperparameters and the values we want to test
    param_grid = {
        'classifier__n_estimators': [150],
        'classifier__learning_rate': [0.1]
    }

    # Create Grid Search object
    grid_search = GridSearchCV(clf, param_grid, cv=5)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_ica, y, test_size=0.2, random_state=42)

    try:
        # Train the model with Grid Search
        grid_search.fit(X_train, y_train)
    except Exception as e:
        print("Error:", e)

    # Predict on the test set using the best model
    y_pred = grid_search.predict(X_test)

    # Evaluate the model
    print("Model Accuracy: ", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
else:
    print("'diabetes' column not found in the data.")

Model Accuracy:  0.9461146364298346
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     17525
           1       0.68      0.73      0.71      1701

    accuracy                           0.95     19226
   macro avg       0.83      0.85      0.84     19226
weighted avg       0.95      0.95      0.95     19226

