In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigree",
    "Age",
    "Outcome"
]

data = pd.read_csv(url, names=columns)

data.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
(data == 0).sum()


Unnamed: 0,0
Pregnancies,111
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigree,0
Age,0
Outcome,500


In [3]:
import numpy as np

cols_to_fix = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

for col in cols_to_fix:
    data[col] = data[col].replace(0, np.nan)
    data[col].fillna(data[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


In [4]:
(data == 0).sum()


Unnamed: 0,0
Pregnancies,111
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigree,0
Age,0
Outcome,500


In [5]:
from sklearn.model_selection import train_test_split

X = data.drop("Outcome", axis=1)
y = data["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [7]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

accuracy


0.7532467532467533

In [8]:
sample_patient = [[
    2,      # Pregnancies
    130,    # Glucose
    70,     # BloodPressure
    25,     # SkinThickness
    100,    # Insulin
    28.0,   # BMI
    0.5,    # DiabetesPedigree
    35      # Age
]]


In [12]:
import pandas as pd

sample_patient_df = pd.DataFrame(
    [[2, 130, 70, 25, 100, 28.0, 0.5, 35]],
    columns=X.columns
)

prediction = model.predict(sample_patient_df)
prediction


array([0])

In [13]:
if prediction[0] == 1:
    print("⚠️ High risk of diabetes")
else:
    print("✅ Low risk of diabetes")


✅ Low risk of diabetes


In [14]:
import pickle

with open("diabetes_model.pkl", "wb") as file:
    pickle.dump(model, file)
