In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('diabetes.csv')

# Outlier removal using IQR
for col in df.columns[:-1]:  # exclude target column
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

X = df.drop(['Outcome','SkinThickness','DiabetesPedigreeFunction'], axis=1)
y = df['Outcome']

# Scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Handling imbalance
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_scaled, y)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Models dictionary (with random_state where applicable)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(random_state=42)
}

# Cross-validation on training data
from sklearn.model_selection import cross_val_score

result = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    result[name] = scores.mean()
    print(f"{name} CV Accuracy: {scores.mean():.4f}")

# Select best model
best_model_name = max(result, key=result.get)
print(f"\nBest Model: {best_model_name}")

# Train best model on training data
best_model = models[best_model_name]
best_model.fit(X_train, y_train)

# Test best model
from sklearn.metrics import accuracy_score, classification_report

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy of best model ({best_model_name}): {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

###### very very importment send the  data into to backend to here


Logistic Regression CV Accuracy: 0.7480
Random Forest CV Accuracy: 0.8049
KNN CV Accuracy: 0.7835
SVM CV Accuracy: 0.8107

Best Model: SVM
Test Accuracy of best model (SVM): 0.7955

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.76      0.79        89
           1       0.77      0.83      0.80        87

    accuracy                           0.80       176
   macro avg       0.80      0.80      0.80       176
weighted avg       0.80      0.80      0.80       176



In [3]:
import pickle

# Assume these variables exist
# best_model = your trained ML model
# best_model_name = name of the best model (optional)
# scaler = your fitted scaler (e.g., StandardScaler or MinMaxScaler)

# Create a dictionary to store both
model_package = {
    "model": best_model,
    "scaler": scaler
}

with open('diabetes_model.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print(f"Best model and scaler saved to 'Diabetes_model.pkl'")


Best model and scaler saved to 'Diabetes_model.pkl'


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               636 non-null    int64  
 1   Glucose                   636 non-null    int64  
 2   BloodPressure             636 non-null    int64  
 3   SkinThickness             636 non-null    int64  
 4   Insulin                   636 non-null    int64  
 5   BMI                       636 non-null    float64
 6   DiabetesPedigreeFunction  636 non-null    float64
 7   Age                       636 non-null    int64  
 8   Outcome                   636 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 49.7 KB


In [5]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
5,5,116,74,0,0,25.6,0.201,30,0
