In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [4]:
# Load the dataset
data = pd.read_csv('diabetes.csv')

In [5]:
# Step 1: Data Preprocessing

# Replace zero values in specific columns with the median of each column
zero_value_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in zero_value_columns:
    median_value = data[column].median()
    data[column] = data[column].replace(0, median_value)


In [6]:
# Step 2: Check for class imbalance
class_counts = data['Outcome'].value_counts(normalize=True)
print("Class distribution in 'Outcome':\n", class_counts)

# If the classes are imbalanced, apply SMOTE
X = data.drop('Outcome', axis=1)
y = data['Outcome']

if class_counts.min() < 0.4:  # Check if any class is less than 40% of the data
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)
    print("Applied SMOTE to balance the classes.")

Class distribution in 'Outcome':
 Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64
Applied SMOTE to balance the classes.


In [7]:
# Step 3: Model Selection and Cross-Validation

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Evaluate each model using cross-validation
results = {}
for model_name, model in models.items():
    accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
    precision = cross_val_score(model, X, y, cv=5, scoring='precision').mean()
    recall = cross_val_score(model, X, y, cv=5, scoring='recall').mean()
    f1 = cross_val_score(model, X, y, cv=5, scoring='f1').mean()
    auc = cross_val_score(model, X, y, cv=5, scoring='roc_auc').mean()

    # Store the results for comparison
    results[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc
    }

In [8]:
# Step 4: Display Results
print("\nModel Comparison Results:")
results_df = pd.DataFrame(results).T
print(results_df)


Model Comparison Results:
                     Accuracy  Precision  Recall  F1 Score      AUC
Logistic Regression     0.746   0.761614   0.718  0.738565  0.83990
Decision Tree           0.766   0.755815   0.782  0.761038  0.77100
Random Forest           0.827   0.803022   0.858  0.833566  0.90153
