In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import FastICA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline


In [2]:
# Set the decimal format
pd.options.display.float_format = "{:.2f}".format

# Read the data
df = pd.read_csv("diabetes_prediction_dataset.csv")

# Handle duplicates
df = df.drop_duplicates()

# Remove unnecessary values [0.00195%]
df = df[df['gender'] != 'Other']

In [3]:
# Recategorize smoking history
def recategorize_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    elif smoking_status == 'current':
        return 'current'
    elif smoking_status in ['ever', 'former', 'not current']:
        return 'past_smoker'

df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)


In [4]:
# One-hot encoding
def perform_one_hot_encoding(df, column_name):
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df.drop(column_name, axis=1), dummies], axis=1)
    return df

data = df.copy()
data = perform_one_hot_encoding(data, 'gender')
data = perform_one_hot_encoding(data, 'smoking_history')


In [5]:
# Resampling
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)

# Dimensionality Reduction with ICA
X = data.drop('diabetes', axis=1)
y = data['diabetes']


In [6]:
# Apply ICA to reduce the dimensionality
ica = FastICA(n_components=10)
X_ica = ica.fit_transform(X)

# Include the 'diabetes' column back to the dataframe
data_ica = pd.DataFrame(data=np.column_stack((X_ica, y)), columns=[f"ICA{i}" for i in range(1, 11)] + ['diabetes'])


In [7]:
# Split data into features and target variable
X_ica = data_ica.drop('diabetes', axis=1)
y_ica = data_ica['diabetes']

# Create a pipeline that preprocesses the data, resamples data, and then trains a classifier
clf = imbPipeline(steps=[
    ('over', over),
    ('under', under),
    ('ica', FastICA(n_components=10)),  # Adjust the number of components as needed
    ('classifier', RandomForestClassifier(
        criterion='gini',
        n_estimators=150,
        bootstrap=True,
        min_samples_leaf=1,
        max_depth=8,
        max_features='sqrt'
    ))
])

# Define the hyperparameters and the values we want to test
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

In [8]:
# Create Grid Search object
grid_search = GridSearchCV(clf, param_grid, cv=5)

# Split data into train and test sets
X_train_ica, X_test_ica, y_train_ica, y_test_ica = train_test_split(X_ica, y_ica, test_size=0.2, random_state=42)


In [9]:
# Train the model with Grid Search
grid_search.fit(X_train_ica, y_train_ica)

In [10]:
# Predict on the test set using the best model
y_pred_ica = grid_search.predict(X_test_ica)

# Evaluate the model
print("Model Accuracy: ", accuracy_score(y_test_ica, y_pred_ica))
print(classification_report(y_test_ica, y_pred_ica))

Model Accuracy:  0.9436700301674815
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97     17525
         1.0       0.65      0.79      0.71      1701

    accuracy                           0.94     19226
   macro avg       0.81      0.88      0.84     19226
weighted avg       0.95      0.94      0.95     19226

