In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
import time
import numpy as np
import pandas as pd

# Read Excel file
# Load dataset
X = pd.read_excel("../coffeeDataSynthesized.xlsx", "dataset")
y = np.where(X["type"] == "robusta", 0, 1)
y = pd.Series(y)

X = X[['width', 'height', 'depth', 'weight']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

# (a) Determine the number of trees using 5-fold cross-validation
param_grid = {'n_estimators': [10, 50, 100, 200, 500]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=44), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print cross-validation accuracy for each configuration
print("Cross-validation accuracies for each configuration:")
cv_results = pd.DataFrame(grid_search.cv_results_)
for param, mean_score in zip(cv_results['param_n_estimators'], cv_results['mean_test_score']):
    print(f"n_estimators={param}: Accuracy={mean_score:.4f}")

# Best number of trees
best_n_estimators = grid_search.best_params_['n_estimators']
print(f"\nOptimal number of trees: {best_n_estimators}")

# (b) Train final model
start_time = time.time()
final_model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=44)
final_model.fit(X_train, y_train)
train_time = time.time() - start_time

# Evaluate performance on training set
train_preds = final_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
print(f"\nTraining Accuracy: {train_accuracy:.4f}")

# Evaluate performance on test set
test_preds = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Detailed classification report
print("\nTest Classification Report:")
print(classification_report(y_test, test_preds))

# Runtime summary
print(f"\nTraining time: {train_time:.2f} seconds")


Cross-validation accuracies for each configuration:
n_estimators=10: Accuracy=0.8668
n_estimators=50: Accuracy=0.8709
n_estimators=100: Accuracy=0.8802
n_estimators=200: Accuracy=0.8771
n_estimators=500: Accuracy=0.8781

Optimal number of trees: 100

Training Accuracy: 1.0000
Test Accuracy: 0.9050

Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       120
           1       0.89      0.93      0.91       122

    accuracy                           0.90       242
   macro avg       0.91      0.90      0.90       242
weighted avg       0.91      0.90      0.90       242


Training time: 0.15 seconds


In [2]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, test_preds)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[106  14]
 [  9 113]]


In [3]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score, precision_score, f1_score

# Assuming y_true and y_pred are your true labels and predicted labels
# For probabilities, use y_pred_proba for AUROC and average precision.
model_scratch = RandomForestClassifier(random_state=42)

# Example: Classification accuracy
accuracy = accuracy_score(y_test, test_preds)

# Example: Recall, Precision, and F1-score
recall = recall_score(y_test, test_preds)
precision = precision_score(y_test, test_preds)
f1 = f1_score(y_test, test_preds)
auroc = roc_auc_score(y_test, test_preds)

# Print all metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUROC: {auroc:.4f}")


Accuracy: 0.9050
Recall: 0.9262
Precision: 0.8898
F1-score: 0.9076
AUROC: 0.9048
