In [1]:
import warnings
warnings.filterwarnings('ignore')

# Import Necessary libraries
import numpy as np 
import pandas as pd 

# Import Model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Import Sampler libraries
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline

# Import MRMR (Minimum Redundancy Maximum Relevance) feature selection
from skfeature.function.information_theoretical_based import MRMR

In [2]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

In [3]:
# Read the data
df = pd.read_csv("diabetes_prediction_dataset.csv")

In [4]:
# Handle duplicates
df = df.drop_duplicates()

In [5]:
# Remove unnecessary values [0.00195%]
df = df[df['gender'] != 'Other']

In [6]:
# Recategorize smoking history
def recategorize_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    elif smoking_status == 'current':
        return 'current'
    elif smoking_status in ['ever', 'former', 'not current']:
        return 'past_smoker'


In [7]:
df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)

In [8]:
# One-hot encoding
def perform_one_hot_encoding(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df.drop(column_name, axis=1), dummies], axis=1)
    return df


In [9]:
data = df.copy()
data = perform_one_hot_encoding(data, 'gender')
data = perform_one_hot_encoding(data, 'smoking_history')

In [10]:
# Resampling
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

In [11]:
X = data.drop('diabetes', axis=1)
y = data['diabetes']
feat_idx = MRMR.mrmr(X.values, y.values, n_selected_features=10)
selected_features = X.columns[feat_idx]

# Exclude the target variable 'diabetes'
features_to_include = data.columns[data.columns != 'diabetes']
data = data[features_to_include]

In [12]:
# Data Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), data.select_dtypes(include='number').columns),
        ('cat', OneHotEncoder(), data.select_dtypes(include='object').columns)
    ])


In [13]:
# Model Pipeline with AdaBoost
base_classifier = DecisionTreeClassifier(max_depth=1)  # Base classifier for AdaBoost
clf = imbPipeline(steps=[('preprocessor', preprocessor),
                         ('over', over),
                         ('under', under),
                         ('classifier', AdaBoostClassifier(
                             base_estimator=base_classifier,
                             n_estimators=150,
                             learning_rate=0.1,
                             algorithm='SAMME.R'  # AdaBoost.MH equivalent for SAMME.R
                         ))])


In [14]:
# Define the hyperparameters and the values we want to test
param_grid = {
    'classifier__n_estimators': [150],
    'classifier__learning_rate': [0.1]
}


In [15]:
# Create Grid Search object
grid_search = GridSearchCV(clf, param_grid, cv=5)

In [16]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
try:
    # Train the model with Grid Search
    grid_search.fit(X_train, y_train)
except Exception as e:
    print("Error:", e)


In [18]:
# Print the best parameters
print("Best Parameters: ", grid_search.best_estimator_.get_params())


Best Parameters:  {'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(),
                                 Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level'],
      dtype='object')),
                                ('cat', OneHotEncoder(),
                                 Index([], dtype='object'))])), ('over', SMOTE(sampling_strategy=0.1)), ('under', RandomUnderSampler(sampling_strategy=0.5)), ('classifier', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.1, n_estimators=150))], 'verbose': False, 'preprocessor': ColumnTransformer(transformers=[('num', StandardScaler(),
                                 Index(['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level'],
      dtype='object')),
                                ('cat', OneHotEncoder(),
                                 Index([], dtype='object'

In [19]:
# Predict on the test set using the best model
y_pred = grid_search.predict(X_test)

In [20]:
# Evaluate the model
print("Model Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Model Accuracy:  0.9501716425673568
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     17525
           1       0.69      0.80      0.74      1701

    accuracy                           0.95     19226
   macro avg       0.83      0.88      0.86     19226
weighted avg       0.95      0.95      0.95     19226

