In [9]:
import pandas as pd

df = pd.read_csv('diabetes_prediction_dataset.csv')

In [10]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [11]:
df['smoking_history'].value_counts()

No Info        35816
never          35095
former          9352
current         9286
not current     6447
ever            4004
Name: smoking_history, dtype: int64

In [12]:
df.drop('smoking_history', axis=1, inplace=True)

In [13]:
df['diabetes'].value_counts()

0    91500
1     8500
Name: diabetes, dtype: int64

In [15]:
df_filtered = df[df['gender'] != 'Other']


In [16]:
df_encoded = pd.get_dummies(df_filtered, columns=['gender'], prefix=['gender'])

In [17]:
df_encoded

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes,gender_Female,gender_Male
0,80.0,0,1,25.19,6.6,140,0,1,0
1,54.0,0,0,27.32,6.6,80,0,1,0
2,28.0,0,0,27.32,5.7,158,0,0,1
3,36.0,0,0,23.45,5.0,155,0,1,0
4,76.0,1,1,20.14,4.8,155,0,0,1
...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,0,1,0
99996,2.0,0,0,17.37,6.5,100,0,1,0
99997,66.0,0,0,27.83,5.7,155,0,0,1
99998,24.0,0,0,35.42,4.0,100,0,1,0


In [19]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X = df_encoded.drop('diabetes', axis=1)
y = df_encoded['diabetes']

# Instantiate the SMOTE object
smote = SMOTE(sampling_strategy='auto', random_state=42)

# Fit and apply SMOTE to the dataset
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after SMOTE:")
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))


Class distribution after SMOTE:
{0: 91482, 1: 91482}


In [20]:
oversampled_df = pd.DataFrame(data=X_resampled, columns=X.columns)
oversampled_df['diabetes'] = y_resampled

In [22]:
from pycaret.classification import *

# Load a sample classification dataset (replace with your own dataset)
from pycaret.datasets import get_data

# Initialize the PyCaret setup
clf_setup = setup(data=oversampled_df, target='diabetes')

# Compare different models
compare_models()

Unnamed: 0,Description,Value
0,Session id,6155
1,Target,diabetes
2,Target type,Binary
3,Original data shape,"(182964, 9)"
4,Transformed data shape,"(182964, 9)"
5,Transformed train set shape,"(128074, 9)"
6,Transformed test set shape,"(54890, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9787,0.9975,0.963,0.9942,0.9784,0.9574,0.9579,6.102
lightgbm,Light Gradient Boosting Machine,0.9756,0.9973,0.9556,0.9953,0.9751,0.9511,0.9519,3.684
et,Extra Trees Classifier,0.9741,0.9958,0.9729,0.9752,0.9741,0.9482,0.9482,4.089
rf,Random Forest Classifier,0.9737,0.997,0.9701,0.9772,0.9736,0.9474,0.9474,6.078
gbc,Gradient Boosting Classifier,0.9702,0.996,0.9508,0.9892,0.9696,0.9404,0.9411,8.843
dt,Decision Tree Classifier,0.9689,0.9693,0.9703,0.9675,0.9689,0.9378,0.9378,0.204
ada,Ada Boost Classifier,0.9586,0.994,0.9529,0.964,0.9584,0.9173,0.9173,2.551
knn,K Neighbors Classifier,0.9377,0.979,0.9705,0.9108,0.9397,0.8755,0.8774,0.856
lr,Logistic Regression,0.9035,0.974,0.8972,0.9086,0.9029,0.807,0.8071,0.926
ridge,Ridge Classifier,0.9022,0.0,0.8876,0.9143,0.9007,0.8044,0.8047,0.074


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [25]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Create an ExtraTreesClassifier
extra_trees = ExtraTreesClassifier(random_state=42)

# Train the classifier on the training data
extra_trees.fit(X_train, y_train)

# Make predictions on the training and test data
y_train_pred = extra_trees.predict(X_train)
y_test_pred = extra_trees.predict(X_test)

# Calculate and print metrics for the training set
train_accuracy = accuracy_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred, average='weighted')
train_f1 = f1_score(y_train, y_train_pred, average='weighted')

print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.2f}")
print(f"Recall: {train_recall:.2f}")
print(f"F1-score: {train_f1:.2f}")

# Calculate and print metrics for the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred, average='weighted')
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print("\nTesting Metrics:")
print(f"Accuracy: {test_accuracy:.2f}")
print(f"Recall: {test_recall:.2f}")
print(f"F1-score: {test_f1:.2f}")

Training Metrics:
Accuracy: 1.00
Recall: 1.00
F1-score: 1.00

Testing Metrics:
Accuracy: 0.97
Recall: 0.97
F1-score: 0.96


In [26]:
from imblearn.under_sampling import RandomUnderSampler

# Instantiate the RandomUnderSampler object
under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)

# Fit and apply undersampling to the dataset
X_resampled, y_resampled = under_sampler.fit_resample(X, y)

# Check the class distribution after undersampling
print("Class distribution after undersampling:")
unique, counts = np.unique(y_resampled, return_counts=True)
print(dict(zip(unique, counts)))

Class distribution after undersampling:
{0: 8500, 1: 8500}


In [27]:
undersampled = pd.DataFrame(data=X_resampled, columns=X.columns)
undersampled['diabetes'] = y_resampled

In [28]:
from pycaret.classification import *

# Load a sample classification dataset (replace with your own dataset)
from pycaret.datasets import get_data

# Initialize the PyCaret setup
clf_setup = setup(data=undersampled, target='diabetes')

# Compare different models
compare_models()

Unnamed: 0,Description,Value
0,Session id,4254
1,Target,diabetes
2,Target type,Binary
3,Original data shape,"(17000, 9)"
4,Transformed data shape,"(17000, 9)"
5,Transformed train set shape,"(11900, 9)"
6,Transformed test set shape,"(5100, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.913,0.9795,0.9346,0.8962,0.9149,0.8261,0.827,0.683
ada,Ada Boost Classifier,0.9085,0.979,0.919,0.9004,0.9095,0.817,0.8174,0.239
lightgbm,Light Gradient Boosting Machine,0.9074,0.9783,0.9192,0.8983,0.9085,0.8148,0.8153,0.724
xgboost,Extreme Gradient Boosting,0.9045,0.9771,0.9131,0.8978,0.9053,0.8089,0.8092,0.618
rf,Random Forest Classifier,0.9012,0.9728,0.9074,0.8964,0.9018,0.8024,0.8025,0.479
et,Extra Trees Classifier,0.8987,0.9639,0.9066,0.8926,0.8995,0.7973,0.7975,0.417
ridge,Ridge Classifier,0.8829,0.0,0.8726,0.8912,0.8817,0.7659,0.7662,0.023
lda,Linear Discriminant Analysis,0.8829,0.9605,0.8726,0.8912,0.8817,0.7659,0.7662,0.03
lr,Logistic Regression,0.8819,0.9618,0.8805,0.8833,0.8818,0.7639,0.7641,0.629
dt,Decision Tree Classifier,0.875,0.8757,0.8724,0.8771,0.8746,0.7499,0.7502,0.032


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [33]:
rf_model = create_model('rf')

# Tune the model
tuned_rf_model = tune_model(rf_model)

# Evaluate the model on the test set
evaluate_model(tuned_rf_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8958,0.9701,0.8874,0.9026,0.8949,0.7916,0.7917
1,0.8924,0.9716,0.9092,0.8797,0.8942,0.7849,0.7853
2,0.8866,0.9645,0.9008,0.8758,0.8882,0.7731,0.7734
3,0.9244,0.9807,0.9311,0.9187,0.9249,0.8487,0.8488
4,0.9092,0.9733,0.9227,0.8985,0.9104,0.8185,0.8188
5,0.8992,0.9719,0.8908,0.906,0.8983,0.7983,0.7984
6,0.9025,0.9744,0.9008,0.9039,0.9024,0.805,0.805
7,0.9034,0.9757,0.9126,0.896,0.9042,0.8067,0.8069
8,0.8992,0.9711,0.9025,0.8965,0.8995,0.7983,0.7983
9,0.8992,0.9743,0.916,0.8862,0.9008,0.7983,0.7988


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8983,0.9709,0.9176,0.8835,0.9002,0.7966,0.7972
1,0.8975,0.9735,0.9462,0.8622,0.9022,0.795,0.7988
2,0.8882,0.9674,0.9462,0.8479,0.8944,0.7765,0.7817
3,0.9118,0.979,0.9412,0.8889,0.9143,0.8235,0.825
4,0.9034,0.9748,0.9546,0.8659,0.9081,0.8067,0.811
5,0.9076,0.9744,0.9412,0.8819,0.9106,0.8151,0.817
6,0.9025,0.9732,0.9479,0.869,0.9068,0.805,0.8084
7,0.9101,0.9767,0.9513,0.8789,0.9136,0.8202,0.823
8,0.8966,0.9727,0.9294,0.8722,0.8999,0.7933,0.795
9,0.9067,0.9757,0.9513,0.8735,0.9107,0.8134,0.8167


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [34]:
save_model(tuned_rf_model, 'model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['age', 'hypertension',
                                              'heart_disease', 'bmi',
                                              'HbA1c_level',
                                              'blood_glucose_level',
                                              'gender_Female', 'gender_Male'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',...
                  RandomFore