**Import Libraries and Load Cleaned Data**

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import class_weight

# Read the pre-cleaned diabetes CSV (path is relative to the notebook's working directory)
df_cleaned = pd.read_csv(filepath_or_buffer='../data/cleaned_diabetes_dataset.csv')

**Split Data and Handle Class Imbalance**

In [7]:
# Separate the feature columns from the target column 'Diabetes_012'
X = df_cleaned.drop('Diabetes_012', axis=1)
y = df_cleaned['Diabetes_012']

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified, so the class mix of train and test
# may differ slightly — consider stratify=y for this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compute 'balanced' class weights from the training labels only.
# compute_class_weight returns one weight per entry of `classes`, in that order.
class_labels = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=y_train)

# Key each weight by its actual class label rather than by its position in the
# array: a positional key (0, 1, 2, ...) is only correct when the labels happen
# to be exactly 0..n-1. tolist() converts the labels to native Python scalars.
class_weights_dict = dict(zip(class_labels.tolist(), class_weights))

**Train Random Forest Model**

In [8]:
# Build the class-weighted Random Forest and fit it on the training split in a
# single expression (fit returns the fitted estimator).
rf_model = RandomForestClassifier(
    class_weight=class_weights_dict,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the training rows (used for the training-accuracy figure)
# and on the held-out test rows
y_train_pred = rf_model.predict(X_train)
y_pred = rf_model.predict(X_test)


**Model Evaluation**

In [9]:
# Accuracy on the training rows and on the held-out rows; a large gap between
# the two indicates the model fits the training data much better than unseen data.
training_accuracy = accuracy_score(y_train, y_train_pred)
accuracy = accuracy_score(y_test, y_pred)

# Macro-averaged precision, recall and F1 on the test split, computed in that order
precision, recall, f1 = (
    scorer(y_test, y_pred, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the metrics
print(f"Training Accuracy: {training_accuracy}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Training Accuracy: 0.9910615736360769
Accuracy: 0.8390689057079785
Precision: 0.4398733472394805
Recall: 0.37845264291854663
F1 Score: 0.3857267024801713
