<a href="https://colab.research.google.com/github/gayatrinaram/Diabetes-Prediction-using-Python-and-Logistic-Regression/blob/main/Diabetes_Prediction_using_Python_and_Logistic_Regression_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Diabetes Prediction using Python and Logistic Regression**

In [13]:
# Step 1: Import libraries
import pandas as pd
import numpy as np  # Importing numpy to handle numeric types
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # Import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder  # Import label encoder

# Step 2: Load the dataset from CSV
csv_file = "/content/diabetes_prediction_dataset.csv.zip"  # Replace with the actual path to your CSV file
df = pd.read_csv(csv_file)

# Step 3: Inspect the dataset
print("Dataset Shape:", df.shape)
print("First few rows of the dataset:\n", df.head())
print("Columns in the dataset:\n", df.columns)

# Step 4: Check for missing values
print("\nMissing values:\n", df.isnull().sum())
df.dropna(inplace=True)  # Drop rows with missing values

# Step 5: Identify and handle the target column dynamically
target_column = 'BmiClass'  # Default to 'BmiClass', change if your target column is different

# Check if 'BmiClass' exists in the dataset, otherwise use the first numeric column as the target
if target_column in df.columns:
    print(f"Using '{target_column}' as the target column.")
else:
    print(f"Warning: Target column '{target_column}' not found. Using the first numeric column as the target.")
    # Find the first numeric column to be used as the target
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    target_column = numeric_columns[0]  # Use the first numeric column as the target

# Step 6: Encode categorical columns (handle all categorical columns)
categorical_columns = df.select_dtypes(include=[object]).columns
for col in categorical_columns:
    if df[col].dtype == 'object':  # Check for categorical columns
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])  # Convert categorical to numerical values

# Step 7: Ensure the target column is categorical

if df[target_column].dtype != 'object':

    bins = [0, 18.5, 24.9, 29.9, np.inf]
    labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese']
    df[target_column] = pd.cut(df[target_column], bins=bins, labels=labels)

# Step 8: Separate features and target variable
X = df.drop(columns=[target_column])  # Features
y = df[target_column]  # Target variable

# Step 9: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 10: Train Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increased iterations to ensure convergence
model.fit(X_train, y_train)

# Step 11: Make predictions
y_pred = model.predict(X_test)

# Step 12: Evaluate the model
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Dataset Shape: (100000, 9)
First few rows of the dataset:
    gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Columns in the dataset:
 Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

Missing values:
 gender              

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Accuracy: 0.75185

Confusion Matrix:
 [[    0  1384     0   140]
 [    0 12920     0   461]
 [    0  1245     0    77]
 [    0  1656     0  2117]]

Classification Report:
                precision    recall  f1-score   support

Normal weight       0.00      0.00      0.00      1524
        Obese       0.75      0.97      0.84     13381
   Overweight       0.00      0.00      0.00      1322
  Underweight       0.76      0.56      0.64      3773

     accuracy                           0.75     20000
    macro avg       0.38      0.38      0.37     20000
 weighted avg       0.65      0.75      0.69     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Step 12: Evaluate the model and map the target to 'Diabetic' or 'Not Diabetic'

# Convert the predicted categories to 'Diabetic' or 'Not Diabetic'
diabetes_labels = {'Obese': 'Diabetic', 'Overweight': 'Diabetic', 'Normal weight': 'Not Diabetic', 'Underweight': 'Not Diabetic'}

# Apply the label mapping to predictions
y_pred_labels = [diabetes_labels[label] for label in y_pred]

# Display predictions for each test sample
for i in range(len(y_pred_labels)):
    print(f"Patient {i+1}: Predicted - {y_pred_labels[i]}")

# Step 13: Evaluate overall accuracy and other metrics
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Patient 15009: Predicted - Diabetic
Patient 15010: Predicted - Diabetic
Patient 15011: Predicted - Diabetic
Patient 15012: Predicted - Diabetic
Patient 15013: Predicted - Diabetic
Patient 15014: Predicted - Diabetic
Patient 15015: Predicted - Diabetic
Patient 15016: Predicted - Diabetic
Patient 15017: Predicted - Not Diabetic
Patient 15018: Predicted - Diabetic
Patient 15019: Predicted - Not Diabetic
Patient 15020: Predicted - Diabetic
Patient 15021: Predicted - Diabetic
Patient 15022: Predicted - Diabetic
Patient 15023: Predicted - Diabetic
Patient 15024: Predicted - Diabetic
Patient 15025: Predicted - Diabetic
Patient 15026: Predicted - Diabetic
Patient 15027: Predicted - Diabetic
Patient 15028: Predicted - Diabetic
Patient 15029: Predicted - Diabetic
Patient 15030: Predicted - Diabetic
Patient 15031: Predicted - Diabetic
Patient 15032: Predicted - Not Diabetic
Patient 15033: Predicted - Diabetic
Patient 15034: Predicte

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
                precision    recall  f1-score   support

Normal weight       0.00      0.00      0.00      1524
        Obese       0.75      0.97      0.84     13381
   Overweight       0.00      0.00      0.00      1322
  Underweight       0.76      0.56      0.64      3773

     accuracy                           0.75     20000
    macro avg       0.38      0.38      0.37     20000
 weighted avg       0.65      0.75      0.69     20000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
