# Decision Tree Model for Diabetes Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## 1. Data Acquisition

In [2]:
df = pd.read_csv("../datasets/Processed_data.csv",sep=",",index_col="PatientID")
df.shape

(15298, 18)

## 2. Data Preparation

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Split data into training and testing sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Display training data shape
df_train.shape

(12238, 18)

In [6]:
# Define feature columns for clarity
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                   'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']

In [7]:
# Extract features and target for training data
X_train = df_train.loc[:, feature_columns].values
y_train = df_train.Diabetic.values

In [8]:
# Extract features and target for test data
X_test = df_test.loc[:, feature_columns].values
y_test = df_test.Diabetic.values

## 3. Decision Tree Model Implementation

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Initialize decision tree classifier
dt_classifier = DecisionTreeClassifier(max_depth=5, random_state=42)

# Train the model
dt_classifier.fit(X=X_train, y=y_train)

# Predict on test data
y_test_predicted = dt_classifier.predict(X_test)

In [10]:
# Calculate basic accuracy
accuracy = (y_test_predicted == y_test).sum() / len(y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9284313725490196


## 4. Model Evaluation

### 4.1 Confusion Matrix

In [11]:
from sklearn.metrics import confusion_matrix

In [12]:
# Create confusion matrix
cf = pd.DataFrame(
    columns=["y_test_0", "y_test_1"], index=["y_pred_0", "y_pred_1"]
)

cf.loc[:, :] = confusion_matrix(y_true=y_test, y_pred=y_test_predicted)
cf

Unnamed: 0,y_test_0,y_test_1
y_pred_0,1441,116
y_pred_1,103,1400


### 4.2 Classification Metrics

In [13]:
from sklearn.metrics import recall_score, precision_score, classification_report

In [14]:
# Calculate recall and precision
recall = recall_score(y_true=y_test, y_pred=y_test_predicted)
precision = precision_score(y_true=y_test, y_pred=y_test_predicted)

print(f"Recall: {recall},\nPrecision: {precision}")

Recall: 0.9314703925482368,
Precision: 0.9234828496042217


In [15]:
# Generate classification report
report = classification_report(y_true=y_test, y_pred=y_test_predicted)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1557
           1       0.92      0.93      0.93      1503

    accuracy                           0.93      3060
   macro avg       0.93      0.93      0.93      3060
weighted avg       0.93      0.93      0.93      3060



## 7. Hyperparameter Tuning

In [16]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 8, 10],
    'min_samples_leaf': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}

# Setup grid search
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Show best parameters
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Score: {grid_search.best_score_}")

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 10}
Best Cross-Validation Score: 0.9327502196901101


## 8. Final Model and Evaluation

In [17]:
# Create final model with best parameters
final_model = DecisionTreeClassifier(
    criterion=grid_search.best_params_['criterion'],
    max_depth=grid_search.best_params_['max_depth'],
    min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
    random_state=42
)

# Train final model
final_model.fit(X_train, y_train)

# Evaluate on test set
y_pred_final = final_model.predict(X_test)
final_accuracy = (y_pred_final == y_test).sum() / len(y_test)
final_report = classification_report(y_test, y_pred_final)

print(f"Final Model Accuracy: {final_accuracy}")
print(f"\nFinal Classification Report:\n{final_report}")

Final Model Accuracy: 0.9359477124183007

Final Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1557
           1       0.93      0.94      0.94      1503

    accuracy                           0.94      3060
   macro avg       0.94      0.94      0.94      3060
weighted avg       0.94      0.94      0.94      3060



In [18]:
# Saving the model
with open('../models/decision_tree_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)