# Logistic Regression Model for Diabetes Prediction

In [1]:
# Import required packages
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 1. Data Acquisition

In [2]:
# Read the pre-processed diabetes table; PatientID becomes the row index
diabetes_csv = pd.read_csv(
    "../datasets/Processed_data.csv",
    index_col="PatientID",
    sep=",",
)
# Preview the first rows as the cell output
diabetes_csv.head()

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,BMI_Age_Ratio,ISI,BP_Age_Ratio,HighRiskPregnancy,Glucose_Insulin,Pregnancies_Age_Ratio,Metabolic_Risk,Pedigree_Glucose,Fat_Index
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,158,80,34,23,43.509726,0.981927,21,0,2.071892,2.751789,3.809524,0.0,3634,0.0,93.836575,155.144416,38.754863
2,7,103,78,25,269,29.582192,0.981927,43,1,0.687958,0.36092,1.813953,4.815706,27707,0.162791,70.194064,101.138448,27.291096
3,1,85,59,27,35,42.604536,0.549542,22,0,1.93657,3.361345,2.681818,1.93657,2975,0.045455,62.201512,46.711059,34.802268
4,0,82,92,9,253,19.72416,0.103424,26,0,0.758622,0.482021,3.538462,0.0,20746,0.0,64.57472,8.480809,14.36208
5,0,133,49,19,227,21.941357,0.17416,21,0,1.044827,0.331225,2.333333,0.0,30191,0.0,67.980452,23.163251,20.470678


## 2. Data Preparation

In [3]:
# Create age groups.
# pd.cut builds right-closed, left-open intervals by default, so without
# include_lowest=True a patient aged exactly 20 would fall outside (20, 30]
# and get NaN even though the label reads '20-30'.
diabetes_csv['AgeGroup'] = pd.cut(
    diabetes_csv['Age'],
    bins=[20, 30, 40, 50, 60, 70, 80],
    labels=['20-30', '31-40', '41-50', '51-60', '61-70', '71-80'],
    include_lowest=True,
)

# Create BMI categories (intervals are right-closed: (0, 18.5], (18.5, 24.9], ...).
# Values outside (0, 100] become NaN.
diabetes_csv['BMICategory'] = pd.cut(
    diabetes_csv['BMI'],
    bins=[0, 18.5, 24.9, 29.9, 100],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

In [4]:
# Expand the two binned columns into indicator columns.
# drop_first=True leaves out the indicator of each variable's first category.
binned_columns = ['AgeGroup', 'BMICategory']
diabetes_csv = pd.get_dummies(diabetes_csv, columns=binned_columns, drop_first=True)

In [5]:
# The eight raw clinical measurements.
# NOTE(review): this list is not referenced when X is built below -- X keeps
# every remaining column, including the engineered ones. Confirm that is intended.
feature_columns = [
    'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
    'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
]

# Remove the indicator columns produced by the one-hot encoding step.
# 'PatientID' is the index (not a column) after loading, so errors='ignore'
# makes that entry a silent no-op instead of a KeyError.
unwanted_columns = [
    'PatientID',
    'AgeGroup_31-40', 'AgeGroup_41-50', 'AgeGroup_51-60', 'AgeGroup_61-70', 'AgeGroup_71-80',
    'BMICategory_Normal', 'BMICategory_Overweight', 'BMICategory_Obese',
]
diabetes_csv = diabetes_csv.drop(columns=unwanted_columns, errors='ignore')

# Target is the 'Diabetic' column; everything else is a feature
y = diabetes_csv['Diabetic']
X = diabetes_csv.drop(columns=['Diabetic'])

In [6]:
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit and transform with the actual data

## 3. Handling Imbalanced Data

In [7]:
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

## 4. Splitting Data

In [8]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

## 5. Training Model

In [9]:
# Baseline classifier: logistic regression with scikit-learn's default settings.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)
model

## 6. Model Evaluation

In [10]:
# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Emit every metric in one call, one item per line
print(
    f"Accuracy: {accuracy}",
    "Confusion Matrix:",
    conf_matrix,
    "Classification Report:",
    class_report,
    sep="\n",
)

Accuracy: 0.8555555555555555
Confusion Matrix:
[[1311  246]
 [ 196 1307]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      1557
           1       0.84      0.87      0.86      1503

    accuracy                           0.86      3060
   macro avg       0.86      0.86      0.86      3060
weighted avg       0.86      0.86      0.86      3060



## 7. Hyperparameter Tuning

In [11]:
# Define parameter grid: penalty type, inverse regularisation strength C, solver.
# Both 'liblinear' and 'saga' accept the 'l1' and 'l2' penalties, so every
# combination in the grid is valid.
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 5.0],
    'solver': ['liblinear', 'saga']
}

# Setup grid search (5-fold cross-validation on the training data).
# random_state pins the data shuffling performed by the 'liblinear' and 'saga'
# solvers, so the selected parameters are reproducible across runs.
# NOTE(review): X_train already contains SMOTE-synthesised rows, so the CV folds
# share synthetic neighbours and best_score_ is likely optimistic.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
)
grid_search.fit(X_train, y_train)

# Show best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best Cross-Validation Score: 0.8546337585572303


## 8. Final Model and Evaluation

In [12]:
# Create final model with best parameters.
# best_params_ holds exactly the grid keys ('penalty', 'C', 'solver'), so it is
# unpacked directly instead of being copied key by key.
# random_state makes the fit reproducible: the 'liblinear' and 'saga' solvers
# shuffle the data internally.
final_model = LogisticRegression(
    **grid_search.best_params_,
    max_iter=1000,
    random_state=42,
)

# Train final model
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred_final = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)
final_report = classification_report(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy}")
print(f"\nFinal Classification Report:\n{final_report}")

Final Model Accuracy: 0.8558823529411764

Final Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.86      1557
           1       0.84      0.87      0.86      1503

    accuracy                           0.86      3060
   macro avg       0.86      0.86      0.86      3060
weighted avg       0.86      0.86      0.86      3060



In [13]:
# Persist the tuned classifier and the fitted scaler side by side, so that
# inference code can apply the same scaling before predicting.
artifacts = (
    ("../models/log_model.pkl", final_model),   # the trained model
    ("../models/log_scaler.pkl", scaler),       # the feature scaler
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

## Summary

In this notebook, we developed a logistic regression model for diabetes prediction. We performed data preparation, handled imbalanced data using SMOTE, and tuned the model's hyperparameters using GridSearchCV. The final model achieved an accuracy of about 85.6% on the test set (0.856 in the evaluation output above), with balanced precision and recall for both classes. The trained model and the scaler have been saved for future use in the web application.