In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
import pydotplus
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier


In [2]:
# Load the dataset.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists; consider a path relative to a configurable data dir.
data = pd.read_csv("C:/Users/12408/Desktop/INST737/balanced_data.csv")
# Shuffle the row positions with a seeded generator so the manual split built
# from `indices` is identical after every Restart Kernel -> Run All.
indices = np.random.default_rng(42).permutation(len(data))
# Fraction of rows assigned to training; the remainder is held out for testing
train_ratio = 0.8
test_ratio = 1 - train_ratio


In [3]:
# Size of each partition. int() truncates, so any row left over from rounding
# ends up in the test partition.
num_train_samples = int(train_ratio * len(data))
num_test_samples = len(data) - num_train_samples

# Cut the shuffled positions once: everything before the cut trains,
# everything from the cut onwards tests.
train_indices, test_indices = np.split(indices, [num_train_samples])

# Materialise both partitions by positional lookup into the full frame
train_data, test_data = (
    data.iloc[positions] for positions in (train_indices, test_indices)
)




In [4]:
# Share of each 'Diabetes' class (normalize=True gives fractions, not counts)
# in the full dataset and in the manually built training / testing partitions.
original_distribution, train_distribution, test_distribution = (
    frame['Diabetes'].value_counts(normalize=True)
    for frame in (data, train_data, test_data)
)

# Print the three distributions one after another for a visual balance check
for heading, distribution in (
    ("Original Distribution:", original_distribution),
    ("Training Set Distribution:", train_distribution),
    ("Testing Set Distribution:", test_distribution),
):
    print(heading)
    print(distribution)


Original Distribution:
0    0.333333
1    0.333333
2    0.333333
Name: Diabetes, dtype: float64
Training Set Distribution:
1    0.340267
0    0.333687
2    0.326045
Name: Diabetes, dtype: float64
Testing Set Distribution:
2    0.362479
0    0.331919
1    0.305603
Name: Diabetes, dtype: float64


In [5]:
# Target is the 'Diabetes' column; every other column is a predictor
y = data['Diabetes']
X = data.drop(columns='Diabetes')

# Hold out 20% of the rows for testing with a fixed shuffle seed.
# Note: this split is drawn independently of train_data / test_data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [25]:
# Depth-limited decision tree. random_state pins the estimator's internal
# randomness so the fitted tree -- and every metric and feature importance
# derived from it -- is the same on each run.
clf = DecisionTreeClassifier(max_depth=7, random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)


In [26]:
# Score both splits with the fitted tree
y_train_pred, y_test_pred = clf.predict(X_train), clf.predict(X_test)

# Tabulate true vs. predicted labels for each split
confusion_matrix_train = confusion_matrix(y_train, y_train_pred)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred)

# Heading on one line, matrix on the following lines
print("Confusion Matrix (Training):", confusion_matrix_train, sep="\n")
print("Confusion Matrix (Testing):", confusion_matrix_test, sep="\n")


Confusion Matrix (Training):
[[932 359 255]
 [254 733 595]
 [173 418 992]]
Confusion Matrix (Testing):
[[205 126  86]
 [ 69 150 162]
 [ 45 132 203]]


In [27]:
# Fraction of correctly classified rows on each split, shown as a percentage
accuracy_train, accuracy_test = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)
print("Accuracy (Training): {:.2f}%".format(100 * accuracy_train))
print("Accuracy (Testing): {:.2f}%".format(100 * accuracy_test))

# Per-class precision / recall / F1; display names are given in the order
# they are passed as target_names
classification_report_train = classification_report(
    y_train,
    y_train_pred,
    target_names=["No Diabetes", "Pre-Diabetes", "Diabetes"],
)
classification_report_test = classification_report(
    y_test,
    y_test_pred,
    target_names=["No Diabetes", "Pre-Diabetes", "Diabetes"],
)

print("Classification Report (Training):\n", classification_report_train)
print("Classification Report (Testing):\n", classification_report_test)


Accuracy (Training): 56.40%
Accuracy (Testing): 47.37%
Classification Report (Training):
               precision    recall  f1-score   support

 No Diabetes       0.69      0.60      0.64      1546
Pre-Diabetes       0.49      0.46      0.47      1582
    Diabetes       0.54      0.63      0.58      1583

    accuracy                           0.56      4711
   macro avg       0.57      0.56      0.57      4711
weighted avg       0.57      0.56      0.56      4711

Classification Report (Testing):
               precision    recall  f1-score   support

 No Diabetes       0.64      0.49      0.56       417
Pre-Diabetes       0.37      0.39      0.38       381
    Diabetes       0.45      0.53      0.49       380

    accuracy                           0.47      1178
   macro avg       0.49      0.47      0.48      1178
weighted avg       0.49      0.47      0.48      1178



In [28]:
# Importance scores of the currently fitted tree, one per training column.
# NOTE: run this before clf is refit on a column subset -- once clf has been
# trained on fewer columns, the scores no longer line up with X.columns and
# building the frame below fails.
feature_importances = clf.feature_importances_

# Pair every column name with its score, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print("Feature Importances:")
print(feature_importance_df)

Feature Importances:
                 Feature  Importance
9         General_health    0.260761
2                    BMI    0.244747
1                    Age    0.229568
10       Income_category    0.083379
14      Alcohol_consumed    0.051703
7          Average_drink    0.029638
6          Food_shortage    0.020608
13  HeartDiseaseorAttack    0.017109
0                    Sex    0.016997
4                 Smoker    0.016362
3               COVIDPOS    0.012840
12    Walking_difficulty    0.010451
5      Physical_activity    0.003688
8               MEDCOST1    0.002150
11                Stroke    0.000000


In [30]:
# Keep the k highest-ranked features from the importance table
k = 5
selected_features = feature_importance_df['Feature'].head(k).tolist()

# Restrict both splits to the selected columns
X_train_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

# Retrain on the reduced feature set.
# NOTE: this refits the existing clf object in place, so the full-feature
# model is gone after this line and clf now expects only the selected
# columns. confusion_matrix_train / confusion_matrix_test are overwritten
# below as well.
clf.fit(X_train_selected, y_train)

# Score both splits with the reduced model
y_train_pred_new = clf.predict(X_train_selected)
y_test_pred_new = clf.predict(X_test_selected)

# Tabulate true vs. predicted labels for each split
confusion_matrix_train = confusion_matrix(y_train, y_train_pred_new)
confusion_matrix_test = confusion_matrix(y_test, y_test_pred_new)

print("Confusion Matrix (Training) with Selected Features:", confusion_matrix_train, sep="\n")
print("Confusion Matrix (Testing) with Selected Features:", confusion_matrix_test, sep="\n")


Confusion Matrix (Training) with Selected Features:
[[1014  304  228]
 [ 338  786  458]
 [ 216  534  833]]
Confusion Matrix (Testing) with Selected Features:
[[223 109  85]
 [ 96 163 122]
 [ 55 156 169]]


In [31]:
# Accuracy of the reduced-feature model on each split
accuracy_train = accuracy_score(y_train, y_train_pred_new)
print("Accuracy (Training): {:.2f}%".format(100 * accuracy_train))

accuracy_test = accuracy_score(y_test, y_test_pred_new)
print("Accuracy (Testing): {:.2f}%".format(100 * accuracy_test))

# Per-class precision / recall / F1 for the reduced-feature model; each report
# is printed right after it is built
classification_report_train = classification_report(
    y_train, y_train_pred_new,
    target_names=["No Diabetes", "Pre-Diabetes", "Diabetes"],
)
classification_report_test = classification_report(
    y_test, y_test_pred_new,
    target_names=["No Diabetes", "Pre-Diabetes", "Diabetes"],
)
print("Classification Report (Training):\n", classification_report_train)
print("Classification Report (Testing):\n", classification_report_test)


Accuracy (Training): 55.89%
Accuracy (Testing): 47.11%
Classification Report (Training):
               precision    recall  f1-score   support

 No Diabetes       0.65      0.66      0.65      1546
Pre-Diabetes       0.48      0.50      0.49      1582
    Diabetes       0.55      0.53      0.54      1583

    accuracy                           0.56      4711
   macro avg       0.56      0.56      0.56      4711
weighted avg       0.56      0.56      0.56      4711

Classification Report (Testing):
               precision    recall  f1-score   support

 No Diabetes       0.60      0.53      0.56       417
Pre-Diabetes       0.38      0.43      0.40       381
    Diabetes       0.45      0.44      0.45       380

    accuracy                           0.47      1178
   macro avg       0.48      0.47      0.47      1178
weighted avg       0.48      0.47      0.47      1178

