In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Read the cleaned diabetes table from disk
diabetes = pd.read_csv('diabetes_cleaned.csv')

# The 'diabetes' column is the label; every remaining column is a feature
y = diabetes['diabetes']
X = diabetes.drop(columns='diabetes')

# Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter search spaces, one per model.
# Every key has the form '<pipeline step name>__<parameter>'; the step prefix
# is attached once per grid here instead of being repeated on each key.
logistic_param_grid = {
    f'logistic__{param}': values
    for param, values in {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'lbfgs'],
    }.items()
}

random_forest_param_grid = {
    f'random_forest__{param}': values
    for param, values in {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }.items()
}

decision_tree_param_grid = {
    f'decision_tree__{param}': values
    for param, values in {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy'],
    }.items()
}

xgb_param_grid = {
    f'xgb__{param}': values
    for param, values in {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.001, 0.01, 0.1, 0.2],
        'max_depth': [3, 6, 10],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }.items()
}

# Define models (the tree-based ones share seed 42 so repeated runs are comparable)
logistic = LogisticRegression()
random_forest = RandomForestClassifier(random_state=42)
decision_tree = DecisionTreeClassifier(random_state=42)
# The target has two classes (0 and 1), so the evaluation metric is the binary
# 'logloss'; 'mlogloss' is XGBoost's multi-class log loss.
# NOTE(review): `use_label_encoder` is kept for compatibility with the XGBoost
# version this notebook was written against; newer releases may ignore it - confirm.
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)

# Wrap every estimator in its own two-step pipeline: standardise the features,
# then fit the model. The model step carries the same name as its dictionary
# key, which is also the prefix used by the parameter-grid keys.
pipelines = {
    step_name: Pipeline([('scaler', StandardScaler()), (step_name, estimator)])
    for step_name, estimator in (
        ('logistic', logistic),
        ('random_forest', random_forest),
        ('decision_tree', decision_tree),
        ('xgb', xgb_model),
    )
}

# Pair each pipeline with its parameter grid in a GridSearchCV.
# All searches share the same settings: cv=5, n_jobs=-1, verbose=1.
grid_searches = {
    name: GridSearchCV(pipelines[name], param_grid, cv=5, n_jobs=-1, verbose=1)
    for name, param_grid in (
        ('logistic', logistic_param_grid),
        ('random_forest', random_forest_param_grid),
        ('decision_tree', decision_tree_param_grid),
        ('xgb', xgb_param_grid),
    )
}

# Tune each model in turn on the training split, then score the winning
# pipeline on the held-out test split
for model_name in grid_searches:
    grid_search = grid_searches[model_name]

    print(f"Training {model_name}...")
    grid_search.fit(X_train, y_train)

    # Cross-validated winner for this model
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    # Test-set performance of the refitted best pipeline
    y_pred = grid_search.best_estimator_.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {model_name}:\n{report}")


Training logistic...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters for logistic: {'logistic__C': 1, 'logistic__solver': 'lbfgs'}
Best score for logistic: 0.9593452408056666
Classification Report for logistic:
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     17509
           1       0.85      0.63      0.72      1721

    accuracy                           0.96     19230
   macro avg       0.91      0.81      0.85     19230
weighted avg       0.95      0.96      0.95     19230

Training random_forest...
Fitting 5 folds for each of 108 candidates, totalling 540 fits


KeyboardInterrupt: 

In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Load dataset
data = pd.read_csv('diabetes_cleaned.csv')

# Features and target separation.
# Built from the frame loaded just above: the name `diabetes` is never defined
# in this cell, so using it only worked when an earlier cell had left it in the
# kernel (and fails with NameError on a fresh Restart & Run All).
X = data.drop('diabetes', axis=1)
y = data['diabetes']

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handling class imbalance with SMOTE.
# Only the training split is resampled; the test split is left as loaded.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

# Define ANN Model Function
def create_ann_model(learning_rate=0.001, input_dim=None):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the Adam optimizer and accuracy
    as the reported metric.

    Args:
        learning_rate: Learning rate passed to the Adam optimizer.
        input_dim: Number of input features. When None (the default), the
            column count of the notebook-level `x_train_resampled` is used,
            which is what this function always did before the parameter
            existed.

    Returns:
        The compiled `Sequential` model.
    """
    if input_dim is None:
        # Backward-compatible fallback to the global training matrix
        input_dim = x_train_resampled.shape[1]

    model = Sequential()
    # Declare the input shape with an explicit Input object instead of the
    # `input_dim=` keyword on the first Dense layer, which recent Keras
    # versions warn about (the warning shows up in this cell's recorded output).
    model.add(tf.keras.Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Set hyperparameters
batch_size = 64
epochs = 50
learning_rate = 0.001

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so training does not start from unseeded state. The split and SMOTE
# above are already seeded with 42. Note this sets global seeds for the
# rest of the kernel session.
tf.keras.utils.set_random_seed(42)

# Define and compile the model
ann_model = create_ann_model(learning_rate=learning_rate)

# StandardScaler for feature scaling: statistics are learned from the
# (resampled) training data only and then applied unchanged to the test data.
# NOTE(review): SMOTE ran on the unscaled features before this step; consider
# scaling before resampling - confirm intent.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_resampled)
x_test_scaled = scaler.transform(x_test)

# Train the ANN model
print("Training ANN...")
ann_model.fit(x_train_scaled, y_train_resampled, batch_size=batch_size, epochs=epochs, verbose=1)

# Evaluate the model
y_pred = ann_model.predict(x_test_scaled)
# Threshold the sigmoid outputs at 0.5 and flatten the resulting array to a
# 1-D label vector for classification_report
y_pred = (y_pred > 0.5).astype(int).ravel()
print(f"Classification Report for ANN:\n{classification_report(y_test, y_pred)}")


Training ANN...
Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8853 - loss: 0.2614
Epoch 2/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9114 - loss: 0.1903
Epoch 3/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9170 - loss: 0.1743
Epoch 4/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9187 - loss: 0.1706
Epoch 5/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9210 - loss: 0.1669
Epoch 6/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9214 - loss: 0.1656
Epoch 7/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9223 - loss: 0.1626
Epoch 8/50
[1m2193/2193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9241 - loss: 0.1614
Epoch 9/50
[1m2193/2193[0m [32m━