In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline


In [2]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Read the data
df = pd.read_csv("diabetes_prediction_dataset.csv")

# Handle duplicates
df = df.drop_duplicates()

# Remove unnecessary values [0.00195%]
df = df[df['gender'] != 'Other']

In [3]:
# Recategorize smoking history
def recategorize_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    elif smoking_status == 'current':
        return 'current'
    elif smoking_status in ['ever', 'former', 'not current']:
        return 'past_smoker'

In [4]:
df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)

In [5]:
# One-hot encoding
def perform_one_hot_encoding(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df.drop(column_name, axis=1), dummies], axis=1)
    return df

In [6]:
data = df.copy()
data = perform_one_hot_encoding(data, 'gender')
data = perform_one_hot_encoding(data, 'smoking_history')

In [7]:
# Resampling
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

In [8]:
# Split data into features and target variable
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define numerical and categorical features
numerical_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(include='object').columns

# PCA for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=min(X_train[numerical_features].shape[0], X_train[numerical_features].shape[1])))
])

# One-hot encoding for categorical features
categorical_transformer = OneHotEncoder()

In [10]:
# Combine transformers using ColumnTransformer
preprocessor_pca = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipeline with PCA
clf_pca = imbPipeline(steps=[
    ('preprocessor_pca', preprocessor_pca),
    ('over', over),
    ('under', under),
    ('classifier', DecisionTreeClassifier(
        max_depth=8,
        min_samples_split=0.2,
        min_samples_leaf=2,
        criterion='gini'
    ))
])

In [11]:
# Train the model with PCA
clf_pca.fit(X_train, y_train)

# Predict on the test set
y_pred_pca = clf_pca.predict(X_test)

In [12]:
# Evaluate the model with PCA
print("Model Accuracy with PCA: ", accuracy_score(y_test, y_pred_pca))
print(classification_report(y_test, y_pred_pca))

Model Accuracy with PCA:  0.885623634661396
              precision    recall  f1-score   support

           0       0.98      0.89      0.93     17525
           1       0.42      0.82      0.56      1701

    accuracy                           0.89     19226
   macro avg       0.70      0.86      0.75     19226
weighted avg       0.93      0.89      0.90     19226

