In [1]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import fetch_openml

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier,HistGradientBoostingClassifier,BaggingClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV



In [2]:
def load_data():
    diabetes_df = pd.read_csv('dataset/diabetes_prediction_dataset.csv')
    y = diabetes_df['diabetes'].values
    X = diabetes_df.drop('diabetes', axis=1)  
    return X, y
X, y = load_data()
X,y

def one_hot_encode(X, columns):
    for col in columns:
        one_hot = pd.get_dummies(X[col], prefix=col)
        X = pd.concat([X, one_hot], join='inner', axis='columns').drop(col, axis=1)
    return X



def remove_outliers(X):
    outlier_detection = IsolationForest()
    outliers = outlier_detection.fit_predict(X)
   
    X = X[outliers == 1]
    return X

def standardize_data(X, columns):
    scaler = MinMaxScaler()
    X[columns] = scaler.fit_transform(X[columns])
    return X

def prepare_data(X, columns_to_encode, columns_to_scale, columns_no_scale):
    X_encoded = one_hot_encode(X, columns_to_encode)
    
    X_scaled = standardize_data(X_encoded, columns_to_scale)
    
   
    X_final = pd.concat([X_scaled, X_encoded[columns_no_scale]], join='inner', axis='columns')
    return X_final

def split_train_dev_test(X, y):
    total_examples = len(X)

   
    train_size = int(total_examples * 0.8)
    dev_size = int(total_examples * 0.1) 
    Xtrain, Xdev, Xtest = X[:train_size], X[train_size:train_size+dev_size], X[train_size+dev_size:]
    ytrain, ydev, ytest = y[:train_size], y[train_size:train_size+dev_size], y[train_size+dev_size:]

    return Xtrain, Xdev, Xtest, ytrain, ydev, ytest


In [3]:
# Define training functions 
def train_RandomForest(X_train, y_train):
    model = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=12,
                                   min_samples_leaf=8, max_features='sqrt', bootstrap=False, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    return model

def train_GradientBoosting(X_train, y_train):
    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1, max_depth=8, min_samples_split=12,
                                       min_samples_leaf=8, max_features='sqrt', random_state=42)
    model.fit(X_train, y_train)
    return model

def train_HistGradientBoosting(X_train, y_train):
    model = HistGradientBoostingClassifier(max_iter=100, learning_rate=0.4, max_depth=15, random_state=42)
    model.fit(X_train, y_train)
    return model

def train_AdaBoost(X_train, y_train):
    base_estimator = DecisionTreeClassifier(max_depth=8)
    model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=500, learning_rate=0.4, random_state=42)
    model.fit(X_train, y_train)
    return model

def train_Bagging(X_train, y_train):
    base_estimator = SVC(degree=2, C=1.0, random_state=42)
    model = BaggingClassifier(base_estimator=base_estimator, n_estimators=500, max_samples=0.8, max_features=1.0,
                              random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    return model

def train_Stacking(X_train, y_train):
   
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_leaf=3, random_state=42, n_jobs=-1)),
        ('gb', GradientBoostingClassifier(n_estimators=500, learning_rate=0.4, max_depth=5, random_state=42)),
        ('svc', SVC(kernel='rbf', C=2, random_state=42))
    ]
   
    final_estimator = LogisticRegression(C=1.0, random_state=42, n_jobs=-1)
    
    model = StackingClassifier(estimators=estimators, final_estimator=final_estimator,  n_jobs=-1)
    model.fit(X_train, y_train)
    return model


def train_Voting(X_train, y_train):
    
    model1 = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_leaf=3, random_state=42, n_jobs=-1)
    model2 = GradientBoostingClassifier(n_estimators=500, learning_rate=0.4, max_depth=5, random_state=42)
    model3 = SVC(kernel='rbf', C=2,probability=True,random_state=42)

   
    voting_model = VotingClassifier(estimators=[
        ('rf', model1),
        ('gb', model2),
        ('svc', model3)],
        voting='soft') 

    
    voting_model.fit(X_train, y_train)
    return voting_model


In [4]:
def evaluate(model, Xdev, ydev):
    ydev_pred = model.predict(Xdev)

    accuracy = accuracy_score(ydev, ydev_pred)
    labels = np.unique(ydev_pred)
    precision = precision_score(ydev, ydev_pred, labels=labels, average=None)
    recall = recall_score(ydev, ydev_pred, labels=labels, average=None)
    f1 = f1_score(ydev, ydev_pred, labels=labels, average=None)
    
    dfscore = pd.DataFrame(data=[[accuracy]*len(labels), precision, recall, f1], columns=["Class" + str(l) for l in labels], index=['Accuracy', 'Precision', 'Recall', 'F1_score'])
    dfscore['mean'] = [accuracy, precision.mean(), recall.mean(), f1.mean()]
    
    return dfscore

In [5]:
def main():
    X, y = load_data()
    
   
    columns_to_encode = ['gender', 'smoking_history']
    columns_to_scale = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
    columns_no_scale = ['hypertension', 'heart_disease']

    X_prep = prepare_data(X, columns_to_encode, columns_to_scale, columns_no_scale)
    
    Xtrain, Xdev, Xtest, ytrain, ydev, ytest = split_train_dev_test(X_prep, y)
    
    # Train ensemble models
    model_RandomForest = train_RandomForest(Xtrain, ytrain)
    model_GradientBoosting = train_GradientBoosting(Xtrain, ytrain)
    model_HistGradientBoosting = train_HistGradientBoosting(Xtrain, ytrain)
    model_AdaBoost = train_AdaBoost(Xtrain, ytrain)
    model_Bagging = train_Bagging(Xtrain, ytrain)
    model_Stacking = train_Stacking(Xtrain, ytrain)
    model_Voting = train_Voting(Xtrain, ytrain)
    
    
    dfscore_RF = evaluate(model_RandomForest, Xdev, ydev)
    dfscore_GB = evaluate(model_GradientBoosting, Xdev, ydev)
    dfscore_HGB = evaluate(model_HistGradientBoosting, Xdev, ydev)
    dfscore_AB = evaluate(model_AdaBoost, Xdev, ydev)
    dfscore_BG = evaluate(model_Bagging, Xdev, ydev)
    dfscore_ST = evaluate(model_Stacking, Xdev, ydev)
    dfscore_VT = evaluate(model_Voting, Xdev , ydev)

   
    evaluation_results = pd.concat([dfscore_RF.add_suffix('_RandomForest'), 
                                    dfscore_GB.add_suffix('_GradientBoosting'), 
                                    dfscore_HGB.add_suffix('_HistGradientBoosting'), 
                                    dfscore_AB.add_suffix('_AdaBoost'), 
                                    dfscore_BG.add_suffix('_Bagging'), 
                                    dfscore_VT.add_suffix('_Voting'),
                                    dfscore_ST.add_suffix('_Stacking')], axis=1)

    return evaluation_results


df_scores = main()
pd.set_option('display.max_columns', None)
df_scores





Unnamed: 0,Class0_RandomForest,Class1_RandomForest,mean_RandomForest,Class0_GradientBoosting,Class1_GradientBoosting,mean_GradientBoosting,Class0_HistGradientBoosting,Class1_HistGradientBoosting,mean_HistGradientBoosting,Class0_AdaBoost,Class1_AdaBoost,mean_AdaBoost,Class0_Bagging,Class1_Bagging,mean_Bagging,Class0_Voting,Class1_Voting,mean_Voting,Class0_Stacking,Class1_Stacking,mean_Stacking
Accuracy,0.9724,0.9724,0.9724,0.9687,0.9687,0.9687,0.9719,0.9719,0.9719,0.9675,0.9675,0.9675,0.9617,0.9617,0.9617,0.9722,0.9722,0.9722,0.972,0.972,0.972
Precision,0.970861,1.0,0.985431,0.973356,0.897893,0.935625,0.972143,0.9678,0.969971,0.973322,0.880763,0.927043,0.961076,0.975169,0.968122,0.971952,0.976449,0.974201,0.973248,0.951724,0.962486
Recall,1.0,0.656716,0.828358,0.993149,0.689055,0.841102,0.998043,0.672886,0.835464,0.991844,0.689055,0.84045,0.998804,0.537313,0.768059,0.998586,0.670398,0.834492,0.996955,0.686567,0.841761
F1_score,0.985215,0.792793,0.889004,0.983153,0.779733,0.881443,0.984922,0.793837,0.88938,0.982496,0.773203,0.877849,0.979577,0.692863,0.83622,0.985089,0.794985,0.890037,0.984959,0.797688,0.891324
