In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.compose import ColumnTransformer

# Read the raw dataset from a CSV file in the working directory
dataset_path = "diabetes_prediction_dataset.csv"
df = pd.read_csv(dataset_path)


In [2]:
# Flag rows that repeat an earlier row exactly, report how many there are,
# then keep only the first occurrence of each row.
duplicate_mask = df.duplicated()
duplicate_rows_data = df[duplicate_mask]
print("number of duplicate rows: ", duplicate_rows_data.shape)
df = df[~duplicate_mask]

number of duplicate rows:  (3854, 9)


In [3]:
# Drop the rows whose gender is 'Other' (original note: unnecessary value, 0.00195%)
keep_rows = df['gender'].ne('Other')
df = df[keep_rows]

In [4]:
# Collapse the raw smoking-history labels into three coarser groups
def recategorize_smoking(smoking_status):
    """Map a raw smoking-history label to its coarse category.

    'never' and 'No Info' become 'non-smoker', 'current' stays 'current',
    and 'ever', 'former' and 'not current' become 'past_smoker'.
    Any other value yields None.
    """
    category_members = (
        ('non-smoker', ('never', 'No Info')),
        ('current', ('current',)),
        ('past_smoker', ('ever', 'former', 'not current')),
    )
    for category, raw_labels in category_members:
        if smoking_status in raw_labels:
            return category
    # Unrecognised label: no category applies
    return None

# Replace each raw smoking-history label with its coarse category
df['smoking_history'] = df['smoking_history'].map(recategorize_smoking)


In [5]:
# One-hot encoding helper
def perform_one_hot_encoding(df, column_name):
    """Return a new frame where `column_name` is replaced by indicator columns.

    The indicator columns are named '<column_name>_<value>' and are appended
    after the remaining original columns. The input frame is not modified.
    """
    indicator_columns = pd.get_dummies(df[column_name], prefix=column_name)
    remaining_columns = df.drop(columns=column_name)
    return pd.concat([remaining_columns, indicator_columns], axis=1)


In [6]:
# Build a one-hot encoded copy of the cleaned frame; `df` itself is left untouched.
# NOTE(review): the later cells visible here take X and y from `df`, not `data` --
# confirm whether `data` is still needed.
data = perform_one_hot_encoding(
    perform_one_hot_encoding(df.copy(), 'gender'),
    'smoking_history',
)

In [7]:
# Resampling: oversample the minority class with SMOTE, then undersample the
# majority class. A float sampling_strategy is the target minority/majority
# ratio after that step.
# Both samplers draw at random, so they are seeded to make the fitted model and
# the reported metrics repeatable (same seed value as the train/test split).
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)


In [8]:
# Preprocessing with PCA
# Standardise the numeric columns and one-hot encode the two categorical
# columns; the combined output feeds the PCA step of the pipeline.
preprocessor_pca = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'hypertension', 'heart_disease']),
        ('cat', OneHotEncoder(), ['gender', 'smoking_history'])
    ])

# svd_solver is left at 'auto', which may choose a randomized solver depending
# on the data shape and library version; random_state keeps the projection
# repeatable in that case (it has no effect with the exact solvers).
pca = PCA(n_components=5, random_state=42)  # Adjust the number of components based on your requirements


In [9]:
# Feature selection: score features by mutual information and keep the top k.
# NOTE(review): the pipeline places this step after PCA(n_components=5), so
# k=5 keeps every component and nothing is dropped -- confirm intent.
feature_selection = SelectKBest(score_func=mutual_info_classif, k=5)  # Adjust k based on your requirements

# k-NN classifier created with library defaults; its hyperparameters are
# supplied through the grid search parameter grid.
knn_classifier = KNeighborsClassifier()


In [10]:
# Assemble the imbalanced-learn pipeline. Steps run in this order:
# preprocessing -> PCA -> feature selection -> SMOTE oversampling ->
# random undersampling -> k-NN classifier.
pipeline_steps = [
    ('preprocessor_pca', preprocessor_pca),
    ('pca', pca),
    ('feature_selection', feature_selection),
    ('over', over),
    ('under', under),
    ('classifier', knn_classifier),
]
clf_pca = imbPipeline(steps=pipeline_steps)

In [11]:
# Hyperparameter grid for the 'classifier' pipeline step ("<step>__<param>" keys).
# Every list holds a single value, so the search evaluates one configuration.
param_grid_pca = dict(
    classifier__n_neighbors=[25],
    classifier__algorithm=['auto'],
    classifier__leaf_size=[5],
    classifier__weights=['uniform'],
    classifier__p=[2],  # L2-norm (Euclidean)
)

In [12]:
# Grid search over the PCA pipeline with 5-fold cross-validation
grid_search_pca = GridSearchCV(
    estimator=clf_pca,
    param_grid=param_grid_pca,
    cv=5,
)

In [13]:
# Separate the target column ('diabetes') from the feature columns
y = df['diabetes']
X = df.drop(columns='diabetes')

In [14]:
# Hold out 20% of the rows as a test set; the seed fixes which rows are chosen
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Run the cross-validated grid search on the training split
grid_search_pca.fit(X=X_train, y=y_train)

In [16]:
# Report the hyperparameters chosen by the grid search
best_params_pca = grid_search_pca.best_params_
print("Best Parameters with PCA: ", best_params_pca)

# Predict the held-out test set with the fitted grid search
y_pred_pca = grid_search_pca.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy_pca = accuracy_score(y_test, y_pred_pca)
report_pca = classification_report(y_test, y_pred_pca)
print("Model Accuracy with PCA: ", accuracy_pca)
print(report_pca)

Best Parameters with PCA:  {'classifier__algorithm': 'auto', 'classifier__leaf_size': 5, 'classifier__n_neighbors': 25, 'classifier__p': 2, 'classifier__weights': 'uniform'}
Model Accuracy with PCA:  0.9306668053677312
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     17525
           1       0.58      0.81      0.67      1701

    accuracy                           0.93     19226
   macro avg       0.78      0.88      0.82     19226
weighted avg       0.95      0.93      0.94     19226

