# Coding Block 1 - Random Forests (and XGBoost)

### Load the packages

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from xgboost import XGBClassifier


### Read the dataset 
You can also compare processed and non-processed data.

In [45]:
df = pd.read_csv("../data/diabetes.csv")
df_cleaned = pd.read_csv("../data/df_imputed_clean.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [46]:
data = df_cleaned.drop(["Mahalanobis_Distance","Outlier","Multivariate_Outlier"],axis = 1)


### Split the data and train a Random Forest model

In [47]:

# Split the data into features (X) and target (y)
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 0.79
Precision: 0.77
Recall: 0.55
F1-Score: 0.64


### Evaluate the prediction models using a classification report

In [48]:
from sklearn.metrics import classification_report

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Generate the classification report
report = classification_report(y_test, y_pred, target_names=["No Diabetes", "Diabetes"])
print(report)

              precision    recall  f1-score   support

 No Diabetes       0.80      0.92      0.86        97
    Diabetes       0.77      0.55      0.64        49

    accuracy                           0.79       146
   macro avg       0.79      0.73      0.75       146
weighted avg       0.79      0.79      0.78       146



### Print the feature importances of the random forest

In [49]:
# Get feature importances from the trained Random Forest model
importances = rf_classifier.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": importances
})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)

# Print the feature importances
print(feature_importance_df)

                    Feature  Importance
2                   Glucose    0.231341
5                   Insulin    0.132579
6                       BMI    0.125183
8                       Age    0.113238
7  DiabetesPedigreeFunction    0.104067
0                Unnamed: 0    0.090704
4             SkinThickness    0.087994
3             BloodPressure    0.060949
1               Pregnancies    0.053946


In [50]:
# Split the data into features (X) and target (y)
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Train XGBoost model
xgb_classifier = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric="logloss")
xgb_classifier.fit(X_train, y_train)

# Evaluate Random Forest model
y_pred_rf = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, rf_classifier.predict_proba(X_test)[:, 1])

# Evaluate XGBoost model
y_pred_xgb = xgb_classifier.predict(X_test)
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_roc_auc = roc_auc_score(y_test, xgb_classifier.predict_proba(X_test)[:, 1])

# Print results for Random Forest
print("Random Forest Results:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1-Score: {rf_f1:.4f}")
print(f"ROC-AUC: {rf_roc_auc:.4f}")
print("\nClassification Report for Random Forest:")
print(classification_report(y_test, y_pred_rf, target_names=["No Diabetes", "Diabetes"]))

# Print results for XGBoost
print("\nXGBoost Results:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")
print(f"ROC-AUC: {xgb_roc_auc:.4f}")
print("\nClassification Report for XGBoost:")
print(classification_report(y_test, y_pred_xgb, target_names=["No Diabetes", "Diabetes"]))

# Compare results
print("\nModel Comparison:")
comparison_df = pd.DataFrame({
    "Model": ["Random Forest", "XGBoost"],
    "Accuracy": [rf_accuracy, xgb_accuracy],
    "Precision": [rf_precision, xgb_precision],
    "Recall": [rf_recall, xgb_recall],
    "F1-Score": [rf_f1, xgb_f1],
    "ROC-AUC": [rf_roc_auc, xgb_roc_auc]
})
print(comparison_df)

Random Forest Results:
Accuracy: 0.7945
Precision: 0.7714
Recall: 0.5510
F1-Score: 0.6429
ROC-AUC: 0.8690

Classification Report for Random Forest:
              precision    recall  f1-score   support

 No Diabetes       0.80      0.92      0.86        97
    Diabetes       0.77      0.55      0.64        49

    accuracy                           0.79       146
   macro avg       0.79      0.73      0.75       146
weighted avg       0.79      0.79      0.78       146


XGBoost Results:
Accuracy: 0.7808
Precision: 0.7429
Recall: 0.5306
F1-Score: 0.6190
ROC-AUC: 0.8167

Classification Report for XGBoost:
              precision    recall  f1-score   support

 No Diabetes       0.79      0.91      0.85        97
    Diabetes       0.74      0.53      0.62        49

    accuracy                           0.78       146
   macro avg       0.77      0.72      0.73       146
weighted avg       0.78      0.78      0.77       146


Model Comparison:
           Model  Accuracy  Precision    R