In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [2]:
# Read diabetes.csv (path is relative to the notebook's working directory) into a DataFrame.
dataset = pd.read_csv("diabetes.csv")

In [3]:
# Preview the first four rows to confirm the columns were parsed as expected.
dataset.head(4)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0


In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(768, 9)

In [5]:
# Count NaN entries per column.
# NOTE(review): describe() reports a minimum of 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI; those zeros may be placeholders for missing
# measurements, which this NaN check cannot detect — confirm against the data source.
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Class balance of the target column (0 / 1).
dataset["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [8]:
# Mean of every feature within each Outcome class, for a quick comparison of the two groups.
dataset.groupby("Outcome").mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [9]:
# Features: every column except the target. Dropping the target by name (rather than
# slicing off "the last column") keeps this correct even if the column order changes.
x = dataset.drop(columns="Outcome")
# Target: the 0/1 Outcome column.
y = dataset["Outcome"]

In [10]:
# Now standardize the features so that they share a common scale.

In [11]:
scaler = StandardScaler()

In [12]:
scaler.fit(x)

In [13]:
standardized_data =scaler.transform(x)

In [14]:
x = standardized_data
y = dataset["Outcome"]

In [15]:
# Train Test Split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2)

In [17]:
# Sanity check: shapes of the full feature matrix and of the train / test partitions.
full_shape, train_shape, test_shape = (part.shape for part in (x, x_train, x_test))
print(full_shape, train_shape, test_shape)

(768, 8) (614, 8) (154, 8)


In [18]:
# Training the model

In [19]:
from sklearn.svm import SVC

In [20]:
# SVC hyper-parameters gathered in one place so they are easy to read and tune.
# class_weight="balanced" is presumably there to offset the 500 / 268 class
# imbalance shown by value_counts() — confirm.
svc_params = dict(
    kernel="rbf",
    C=10,
    gamma=0.01,
    class_weight="balanced",
    random_state=42,
)
model = SVC(**svc_params)

In [21]:
# Fit the SVC on the training partition only.
model.fit(x_train, y_train)

In [22]:
# Class labels predicted by the SVC for the test and training partitions.
# Despite the x_ prefix, these variables hold predicted labels, not feature matrices.
x_test_pred = model.predict(x_test)
x_train_pred = model.predict(x_train)

In [23]:
# Accuracy of the SVC on the held-out and training partitions, printed as percentages.
svc_test_accuracy = accuracy_score(y_test, x_test_pred)
svc_train_accuracy = accuracy_score(y_train, x_train_pred)
print("Accuracy of Testing:", svc_test_accuracy*100)
print("Accuracy of Training:", svc_train_accuracy*100)

Accuracy of Testing: 75.32467532467533
Accuracy of Training: 77.85016286644951


In [24]:
# Classification report of SVC model for Medical Support.

In [25]:
# Per-class precision / recall / F1 of the SVC on the test partition.
svc_report = classification_report(y_test, model.predict(x_test))
print(svc_report)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       100
           1       0.65      0.65      0.65        54

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154



In [26]:
# Try Different model

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
# LogisticRegression settings gathered in one place so they are easy to read and tune.
lr_params = dict(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
lr = LogisticRegression(**lr_params)

In [29]:
# Fit the logistic-regression model on the same training partition used for the SVC.
lr.fit(x_train, y_train)

In [30]:
# Class labels predicted by the logistic regression for the test and training partitions.
# Despite the x_ prefix, these variables hold predicted labels, not feature matrices.
x_test_predict = lr.predict(x_test)
x_train_predict = lr.predict(x_train)

In [31]:
# Accuracy of the logistic regression on the held-out and training partitions, as percentages.
lr_test_accuracy = accuracy_score(y_test, x_test_predict)
lr_train_accuracy = accuracy_score(y_train, x_train_predict)
print("Accuracy of Testing:", lr_test_accuracy*100)
print("Accuracy of Training:", lr_train_accuracy*100)

Accuracy of Testing: 75.32467532467533
Accuracy of Training: 77.19869706840392


In [32]:
# Classification report of LogisticRegression model for Medical Support.

In [33]:
# Per-class precision / recall / F1 of the logistic regression on the test partition.
lr_report = classification_report(y_test, lr.predict(x_test))
print(lr_report)

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       100
           1       0.64      0.67      0.65        54

    accuracy                           0.75       154
   macro avg       0.73      0.73      0.73       154
weighted avg       0.76      0.75      0.75       154



***Making a Predictive System***

In [34]:
import warnings
# Silence only the feature-name warning raised when a bare NumPy array is passed to an
# estimator that was fitted on a DataFrame. Other UserWarnings stay visible so that
# unrelated problems are not hidden.
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names",
    category=UserWarning,
)

In [35]:
# One sample to classify, in the same column order as the training features.
input_data = (
    13,     # Pregnancies
    145,    # Glucose
    82,     # BloodPressure
    19,     # SkinThickness
    110,    # Insulin
    22.2,   # BMI
    0.245,  # DiabetesPedigreeFunction
    57,     # Age
)

In [36]:
# Changing the input_data to a numpy array

In [37]:
# Turn the tuple of feature values into a 1-D NumPy array.
input_data_as_numpy = np.array(input_data)

In [38]:
# reshape the array as we are predicting for one instance

In [39]:
# Add a leading axis so the single sample becomes a (1, n_features) matrix.
input_data_reshaped = input_data_as_numpy[np.newaxis, :]

In [40]:
# standardize the input data with the already-fitted scaler

In [41]:
# The scaler was fitted on a DataFrame, so give the new sample the same column names
# (every dataset column except the trailing Outcome). Passing a bare array makes
# scikit-learn warn that the input has no valid feature names.
input_frame = pd.DataFrame(input_data_reshaped, columns=dataset.columns[:-1])
std_data = scaler.transform(input_frame)

In [42]:
# Now predict
prediction = model.predict(std_data)  # prediction by using SVC model.
print(prediction)

# predict() returns an array with one label per input row. Compare the single label
# itself instead of relying on the truth value of an array comparison, which raises
# an error as soon as more than one row is predicted.
if prediction[0] == 1:
    print("The person is diabetic")
else:
    print("The Person is non-diabetic")

[0]
The Person is non-diabetic


In [43]:
# Same sample, classified with the logistic-regression model.
prediction1 = lr.predict(std_data)
print(prediction1)

# predict() returns an array with one label per input row. Compare the single label
# itself instead of relying on the truth value of an array comparison, which raises
# an error as soon as more than one row is predicted.
if prediction1[0] == 1:
    print("The person is diabetic")
else:
    print("The Person is non-diabetic")

[1]
The person is diabetic


**Now save the model.**

In [44]:
import joblib

In [45]:
# Save trained model
# Only the SVC (`model`) is persisted here; the LogisticRegression (`lr`) is not saved.
joblib.dump(model, "diabetes_model.pkl")

# Save the scaler too (important for preprocessing new data)
# New inputs must be transformed with this same fitted scaler before being passed to the model.
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [46]:
# Load model
loaded_model = joblib.load("diabetes_model.pkl")

# Load scaler
loaded_scaler = joblib.load("scaler.pkl")

# ⚠️ Disclaimer:
This Diabetes Prediction model is developed purely for learning and practicing 
Machine Learning techniques in the healthcare domain. 

It is trained on a small, limited dataset and is NOT a medically validated tool. 
Predictions from this model should NOT be used for medical diagnosis, treatment, 
or decision-making. 

Always consult a qualified healthcare professional for medical advice.

# Conclusion
**You can say — “We compared two models (SVM and Logistic Regression). Both gave similar accuracy (~75%), but Logistic Regression focused more on recall while SVM improved precision. This shows the trade-off between models in medical prediction.” 🔥**