<a href="https://colab.research.google.com/github/gk0908/2303031247002/blob/main/Diabetes_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install packages
!pip install pandas scikit-learn numpy joblib

# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

print("✅ Packages installed & imported!")

✅ Packages installed & imported!


In [7]:
import pandas as pd

df = pd.read_csv('diabetes.csv')

print("📊 Dataset loaded!")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")


📊 Dataset loaded!
Shape: (768, 9)
Columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [8]:
# Basic info
print(f"Total patients: {len(df)}")
print(f"Diabetic patients: {df['Outcome'].sum()} ({(df['Outcome'].sum()/len(df))*100:.1f}%)")
print("\nFirst 3 rows:")
print(df.head(3))

Total patients: 768
Diabetic patients: 268 (34.9%)

First 3 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  


In [9]:
# Fix zero values
df_clean = df.copy()

features_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

for feature in features_to_fix:
    # Replace zeros with median
    df_clean[feature] = df_clean[feature].replace(0, np.nan)
    df_clean[feature].fillna(df_clean[feature].median(), inplace=True)

print("✅ Data cleaned!")

✅ Data cleaned!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean[feature].fillna(df_clean[feature].median(), inplace=True)


In [10]:
# Split data
X = df_clean.drop('Outcome', axis=1)
y = df_clean['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 614
Testing samples: 154


In [11]:
# Train Random Forest
print("🌲 Training Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test))

# Train Logistic Regression
print("📈 Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)
lr_accuracy = accuracy_score(y_test, lr_model.predict(X_test))

print(f"\n📊 Results:")
print(f"Random Forest Accuracy: {rf_accuracy:.4f} ({rf_accuracy*100:.2f}%)")
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")

🌲 Training Random Forest...
📈 Training Logistic Regression...

📊 Results:
Random Forest Accuracy: 0.7468 (74.68%)
Logistic Regression Accuracy: 0.7532 (75.32%)


In [12]:
# Choose best model
if rf_accuracy >= lr_accuracy:
    joblib.dump(rf_model, 'diabetes_model.pkl')
    best_model_name = "Random Forest"
    best_accuracy = rf_accuracy
else:
    joblib.dump(lr_model, 'diabetes_model.pkl')
    best_model_name = "Logistic Regression"
    best_accuracy = lr_accuracy

# Save feature names
feature_names = list(X.columns)
joblib.dump(feature_names, 'feature_names.pkl')

print(f"\n🏆 Best Model: {best_model_name}")
print(f"💾 Saved: diabetes_model.pkl & feature_names.pkl")


🏆 Best Model: Logistic Regression
💾 Saved: diabetes_model.pkl & feature_names.pkl


In [13]:
# Load and test
model = joblib.load('diabetes_model.pkl')
features = joblib.load('feature_names.pkl')

# Test sample
test_patient = {
    'Pregnancies': 2,
    'Glucose': 150,
    'BloodPressure': 70,
    'SkinThickness': 25,
    'Insulin': 80,
    'BMI': 28.5,
    'DiabetesPedigreeFunction': 0.5,
    'Age': 35
}

# Prepare input
input_data = [test_patient[f] for f in features]
input_array = np.array(input_data).reshape(1, -1)

# Predict
prediction = model.predict(input_array)[0]
probability = model.predict_proba(input_array)[0]

print(f"\n🎯 Test Prediction:")
print(f"Patient: Glucose={test_patient['Glucose']}, BMI={test_patient['BMI']}, Age={test_patient['Age']}")
print(f"Result: {'🟥 DIABETIC' if prediction == 1 else '🟩 NOT DIABETIC'}")
print(f"Confidence: {probability[prediction]:.2%}")


🎯 Test Prediction:
Patient: Glucose=150, BMI=28.5, Age=35
Result: 🟩 NOT DIABETIC
Confidence: 53.68%




In [14]:
from google.colab import files
files.download('diabetes_model.pkl')
files.download('feature_names.pkl')
print("📥 Files downloaded!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Files downloaded!


In [15]:


# Step 9: Test with user input
print("\n🧪 Now testing with user input...")
predict_diabetes()  # Run the interactive function


🧪 Now testing with user input...


NameError: name 'predict_diabetes' is not defined

In [16]:
# Step 7: Define the prediction function
def predict_diabetes():
    print("🎯 Enter Patient Details:")

    # Get user input
    pregnancies = float(input("Pregnancies: "))
    glucose = float(input("Glucose: "))
    blood_pressure = float(input("BloodPressure: "))
    skin_thickness = float(input("SkinThickness: "))
    insulin = float(input("Insulin: "))
    bmi = float(input("BMI: "))
    dpf = float(input("DiabetesPedigreeFunction: "))
    age = float(input("Age: "))

    # Create input array
    patient_data = [pregnancies, glucose, blood_pressure, skin_thickness,
                   insulin, bmi, dpf, age]
    input_array = np.array(patient_data).reshape(1, -1)

    # Predict
    prediction = model.predict(input_array)[0]
    probability = model.predict_proba(input_array)[0]

    # Show results
    print(f"\n🔍 Prediction Results:")
    print(f"Risk: {'🟥 HIGH (Diabetic)' if prediction == 1 else '🟩 LOW (Not Diabetic)'}")
    print(f"Confidence: {probability[prediction]:.2%}")
    print(f"Probability - No Diabetes: {probability[0]:.2%}")
    print(f"Probability - Diabetes: {probability[1]:.2%}")

print("✅ Function defined! Now you can use it.")

✅ Function defined! Now you can use it.


In [17]:
# Save the trained model and features
joblib.dump(model, 'diabetes_model.pkl')
joblib.dump(features, 'feature_names.pkl')

print("💾 Model saved: diabetes_model.pkl")
print("💾 Features saved: feature_names.pkl")

💾 Model saved: diabetes_model.pkl
💾 Features saved: feature_names.pkl


In [18]:
from google.colab import files
files.download('diabetes_model.pkl')
files.download('feature_names.pkl')

print("📥 Files downloaded! Ready for your Flask API.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📥 Files downloaded! Ready for your Flask API.


In [19]:
# Now run the function
print("\n🧪 Testing with user input...")
predict_diabetes()


🧪 Testing with user input...
🎯 Enter Patient Details:
Pregnancies: 0
Glucose: 21.9
BloodPressure: 67
SkinThickness: 65
Insulin: 32
BMI: 78
DiabetesPedigreeFunction: 0.5
Age: 20

🔍 Prediction Results:
Risk: 🟩 LOW (Not Diabetic)
Confidence: 52.93%
Probability - No Diabetes: 52.93%
Probability - Diabetes: 47.07%


