# Logistic Regression Model for Diabetes Prediction

In [3]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

## 1. Data Acquisition

In [5]:
# Load the dataset
diabetes_csv = pd.read_csv("../datasets/Processed_DF.csv",sep=",",index_col="PatientID")
diabetes_csv.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


## 2. Data Preparation

In [6]:
# Create age groups
diabetes_csv['AgeGroup'] = pd.cut(diabetes_csv['Age'], bins=[20, 30, 40, 50, 60, 70, 80], labels=['20-30', '31-40', '41-50', '51-60', '61-70', '71-80'])

# Create BMI categories
diabetes_csv['BMICategory'] = pd.cut(diabetes_csv['BMI'], bins=[0, 18.5, 24.9, 29.9, 100], labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

In [7]:
# One-hot encode the categorical variables
diabetes_csv = pd.get_dummies(diabetes_csv, columns=['AgeGroup', 'BMICategory'], drop_first=True)

In [8]:
# Define feature columns for clarity
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                   'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']

# Drop unnecessary columns
diabetes_csv = diabetes_csv.drop(columns=['PatientID','AgeGroup_31-40', 'AgeGroup_41-50', 'AgeGroup_51-60',
       'AgeGroup_61-70', 'AgeGroup_71-80', 'BMICategory_Normal',
       'BMICategory_Overweight', 'BMICategory_Obese'], errors='ignore')

# Separate features and target variable
X = diabetes_csv.drop(columns=['Diabetic'])
y = diabetes_csv['Diabetic']

In [9]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## 3. Handling Imbalanced Data

In [10]:
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

## 4. Splitting Data

In [11]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

## 5. Training Model

In [12]:
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

## 6. Model Evaluation

In [13]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.778
Confusion Matrix:
[[1572  429]
 [ 459 1540]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      2001
           1       0.78      0.77      0.78      1999

    accuracy                           0.78      4000
   macro avg       0.78      0.78      0.78      4000
weighted avg       0.78      0.78      0.78      4000



## 7. Hyperparameter Tuning

In [14]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 5.0],
    'solver': ['liblinear', 'saga']
}

# Setup grid search
grid_search = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Show best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

Best Parameters: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.782


## 8. Final Model and Evaluation

In [15]:
# Create final model with best parameters
final_model = LogisticRegression(
    penalty=grid_search.best_params_['penalty'],
    C=grid_search.best_params_['C'],
    solver=grid_search.best_params_['solver'],
    max_iter=1000
)

# Train final model
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred_final = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)
final_report = classification_report(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy}")
print(f"\nFinal Classification Report:\n{final_report}")

Final Model Accuracy: 0.785

Final Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      2001
           1       0.79      0.77      0.78      1999

    accuracy                           0.79      4000
   macro avg       0.79      0.79      0.79      4000
weighted avg       0.79      0.79      0.79      4000



In [16]:
# Saving the model
with open("log_model.pkl", "wb") as file:
    pickle.dump(final_model, file)
    
# Saving the scaler
with open("log_scaler.pkl", "wb") as file:
    pickle.dump(scaler, file)

## Summary

In this notebook, we developed a logistic regression model for diabetes prediction. We performed data preparation, handled imbalanced data using SMOTE, and tuned the model's hyperparameters using GridSearchCV. The final model achieved an accuracy of 78.5% on the test set with balanced precision and recall metrics for both classes. The trained model and the scaler have been saved for future use in the web application.