In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, balanced_accuracy_score

import numpy as np

# Baseline: Gini decision tree on the cleaned multiclass dataset.
# Trains on an 80/20 split and reports per-class precision, recall, F1 and
# specificity, plus one-vs-rest AUC-ROC, balanced accuracy and the confusion
# matrix.

# Load the cleaned dataset
file_path = 'cleaned_diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(file_path)

# Features are every column except the 'Diabetes_012' label
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified. If the classes are imbalanced,
# consider passing stratify=y so the train and test class proportions match.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a decision tree that splits on Gini impurity
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
clf.fit(X_train, y_train)

# Hard label predictions for the held-out rows
y_pred = clf.predict(X_test)

# average=None returns one score per class instead of a single aggregate
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)

# One-vs-rest counts per class k:
#   FP_k = column-k total minus the diagonal (predicted k, truly another class)
#   TN_k = everything outside row k and column k
#        = total - (row_k + col_k - diag_k); the diagonal is subtracted once
#          because it is counted in both the row and the column total.
tn = np.sum(cm) - (np.sum(cm, axis=1) + np.sum(cm, axis=0) - np.diag(cm))
fp = np.sum(cm, axis=0) - np.diag(cm)

# Per-class specificity = TN / (TN + FP)
specificity = tn / (tn + fp)

# Class-probability estimates, one column per class, for the AUC computation
y_prob = clf.predict_proba(X_test)

# Multiclass AUC-ROC using the one-vs-rest scheme
auc_roc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Balanced accuracy of the hard predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Display results. The matrix starts on its own line so its rows line up,
# matching the format used by the other evaluation cells.
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Specificity: {specificity}")
print(f"AUC-ROC: {auc_roc}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix: \n{cm}")

Precision: [0.86046826 0.01755786 0.28328393]
Recall: [0.83403295 0.02428256 0.31694304]
F1-score: [0.8470444  0.02037981 0.29916973]
Specificity: [0.34255835 0.97267541 0.85749065]
AUC-ROC: 0.5578653617781775
Balanced Accuracy: 0.39175285176179697
Confusion Matrix: [[31790   967  5359]
 [  682    22   202]
 [ 4473   264  2198]]


In [30]:
# Decision tree on the oversampled dataset (binary 'Diabetes_01' label).
# NOTE(review): the train/test split below is taken from a file that, per its
# name, was already oversampled. If oversampling duplicated or synthesized
# rows, held-out rows may not be independent of training rows and the scores
# may be optimistic -- confirm how the file was produced.
file_path = 'overesampled_diabetes_data.csv'
df = pd.read_csv(file_path)

# Label column on one side, all remaining columns as features on the other
y = df['Diabetes_01']
X = df.drop(columns='Diabetes_01')

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gini decision tree; fit() returns the fitted estimator itself
clf = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# Hard predictions, plus the predicted probability in column 1 of
# predict_proba (used as the ranking score for AUC-ROC)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 (average=None keeps one value per class)
precision, recall, f1 = (
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

# 2x2 confusion matrix unpacked row by row: first row is (TN, FP), second
# row is (FN, TP), i.e. the first label is treated as the negative class
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm
specificity = tn / (tn + fp)

# Threshold-free and threshold-based summary scores
auc_roc = roc_auc_score(y_test, y_prob)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Display results
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"Specificity: {specificity}",
    f"AUC-ROC: {auc_roc}",
    f"Balanced Accuracy: {balanced_accuracy}",
    f"Confusion Matrix: \n{cm}",
    sep="\n",
)


Precision: [0.98228861 0.84674087]
Recall: [0.82116606 0.98523578]
F1-score: [0.8945299  0.91075331]
Specificity: 0.8211660563268962
AUC-ROC: 0.9056776993161643
Balanced Accuracy: 0.9032009186139932
Confusion Matrix: 
[[31169  6788]
 [  562 37503]]


In [31]:
# Decision tree on the undersampled dataset (binary 'Diabetes_binary' label).
file_path = 'undersampled_diabetes_data.csv'
df = pd.read_csv(file_path)

# Label column on one side, all remaining columns as features on the other
y = df['Diabetes_binary']
X = df.drop(columns='Diabetes_binary')

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Gini decision tree; fit() returns the fitted estimator itself
clf = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# Hard predictions, plus the predicted probability in column 1 of
# predict_proba (used as the ranking score for AUC-ROC)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Per-class precision / recall / F1 (average=None keeps one value per class)
precision, recall, f1 = (
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

# 2x2 confusion matrix unpacked row by row: first row is (TN, FP), second
# row is (FN, TP), i.e. the first label is treated as the negative class
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm
specificity = tn / (tn + fp)

# Threshold-free and threshold-based summary scores
auc_roc = roc_auc_score(y_test, y_prob)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Display results
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"Specificity: {specificity}",
    f"AUC-ROC: {auc_roc}",
    f"Balanced Accuracy: {balanced_accuracy}",
    f"Confusion Matrix: \n{cm}",
    sep="\n",
)

Precision: [0.65134371 0.64846513]
Recall: [0.64950635 0.65030501]
F1-score: [0.65042373 0.64938377]
Specificity: 0.6495063469675599
AUC-ROC: 0.6501439750479266
Balanced Accuracy: 0.6499056773850426
Confusion Matrix: 
[[4605 2485]
 [2465 4584]]


In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
import pandas as pd

# Random forest on the oversampled dataset (binary 'Diabetes_01' label).
# NOTE(review): the train/test split below is taken from a file that, per its
# name, was already oversampled. If oversampling duplicated or synthesized
# rows, held-out rows may not be independent of training rows and the scores
# may be optimistic -- confirm how the file was produced.
file_path = 'overesampled_diabetes_data.csv'
df = pd.read_csv(file_path)

# Label column on one side, all remaining columns as features on the other
y = df['Diabetes_01']
X = df.drop(columns='Diabetes_01')

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100-tree random forest; fit() returns the fitted estimator itself
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard predictions for the held-out rows
y_pred = rf_clf.predict(X_test)

# Per-class precision / recall / F1 (average=None keeps one value per class)
precision, recall, f1 = (
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

# 2x2 confusion matrix unpacked row by row: first row is (TN, FP), second
# row is (FN, TP), i.e. the first label is treated as the negative class
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm
specificity = tn / (tn + fp)

# Predicted probability in column 1 of predict_proba, used as the AUC score
y_prob = rf_clf.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_prob)

# Balanced accuracy of the hard predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Display results
print(
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-score: {f1}",
    f"Specificity: {specificity}",
    f"AUC-ROC: {auc_roc}",
    f"Balanced Accuracy: {balanced_accuracy}",
    f"Confusion Matrix: \n{cm}",
    sep="\n",
)

Precision: [0.98458909 0.8920518 ]
Recall: [0.88031193 0.98626034]
F1-score: [0.92953515 0.93679351]
Specificity: 0.8803119319229654
AUC-ROC: 0.9887891844927525
Balanced Accuracy: 0.9332861380355666
Confusion Matrix: 
[[33414  4543]
 [  523 37542]]


In [56]:
# numpy is used below for the per-class TN/FP arithmetic; import it here so
# this cell does not depend on an earlier cell having imported it.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, balanced_accuracy_score

# Random forest on the cleaned multiclass dataset.
# Trains on an 80/20 split and reports per-class precision, recall, F1 and
# specificity, plus one-vs-rest AUC-ROC, balanced accuracy and the confusion
# matrix.

# Load the cleaned dataset
file_path = 'cleaned_diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(file_path)

# Features are every column except the 'Diabetes_012' label
X = df.drop('Diabetes_012', axis=1)
y = df['Diabetes_012']

# Hold out 20% of the rows for testing; random_state pins the shuffle.
# NOTE(review): the split is not stratified. If the classes are imbalanced,
# consider passing stratify=y so the train and test class proportions match.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)

# Hard label predictions for the held-out rows
y_pred = rf_clf.predict(X_test)

# average=None returns one score per class instead of a single aggregate
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Confusion matrix (computed once): rows are true labels, columns are
# predicted labels
cm = confusion_matrix(y_test, y_pred)

# One-vs-rest counts per class k:
#   FP_k = column-k total minus the diagonal (predicted k, truly another class)
#   TN_k = everything outside row k and column k
#        = total - (row_k + col_k - diag_k); the diagonal is subtracted once
#          because it is counted in both the row and the column total.
tn = np.sum(cm) - (np.sum(cm, axis=1) + np.sum(cm, axis=0) - np.diag(cm))
fp = np.sum(cm, axis=0) - np.diag(cm)

# Per-class specificity = TN / (TN + FP)
specificity = tn / (tn + fp)

# Class-probability estimates, one column per class, for the AUC computation
y_prob = rf_clf.predict_proba(X_test)

# Multiclass AUC-ROC using the one-vs-rest scheme
auc_roc = roc_auc_score(y_test, y_prob, multi_class='ovr')

# Balanced accuracy of the hard predictions
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Display results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print(f"Specificity: {specificity}")
print(f"AUC-ROC: {auc_roc}")
print(f"Balanced Accuracy: {balanced_accuracy}")
print(f"Confusion Matrix: \n{cm}")

Precision: [0.85133436 0.         0.45966102]
Recall: [0.95912478 0.         0.19552992]
F1-score: [0.90202078 0.         0.27435508]
Specificity: [0.18581814 0.99855719 0.95915125]
AUC-ROC: 0.7193883687602435
Balanced Accuracy: 0.38488489922955943
Confusion Matrix: 
[[36558    57  1501]
 [  813     0    93]
 [ 5571     8  1356]]
