In [1]:
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# `Load datasets`

In [2]:
# Load the pre-computed arrays from the Kaggle input directory, one
# (features, labels) pair per split.
# NOTE(review): judging by the file names the features are distance vectors;
# their shape and dtype are not visible here -- confirm against the dataset.
X_train, y_train = (
    np.load('/kaggle/input/pit-classification/distance_vectors_train.npy'),
    np.load('/kaggle/input/pit-classification/train_labels.npy'),
)
X_val, y_val = (
    np.load('/kaggle/input/pit-classification/distance_vectors_val.npy'),
    np.load('/kaggle/input/pit-classification/val_labels.npy'),
)
X_test, y_test = (
    np.load('/kaggle/input/pit-classification/distance_vectors_test.npy'),
    np.load('/kaggle/input/pit-classification/test_labels.npy'),
)

In [3]:
def _as_class_indices(labels):
    """Return 1-D class indices for ``labels``.

    Arrays with two or more dimensions are reduced with ``np.argmax`` along
    axis 1 (presumably one-hot rows or per-class scores -- confirm against the
    dataset). Arrays that are already 1-D are returned unchanged, so running
    this cell a second time is a no-op instead of raising an AxisError on the
    already-converted labels.
    """
    labels = np.asarray(labels)
    if labels.ndim < 2:
        # Already class indices (e.g. this cell was executed before).
        return labels
    return np.argmax(labels, axis=1)


y_train = _as_class_indices(y_train)
y_test = _as_class_indices(y_test)
y_val = _as_class_indices(y_val)

# `Random Forest`

In [4]:
# Random forest with fixed, hand-picked hyperparameters (no search is run in
# this notebook); random_state pins the seed for reproducible fits.
# NOTE(review): the recorded run reports 36.35% train vs ~13% validation/test
# accuracy, so these settings may deserve tuning -- confirm before reuse.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
)

In [5]:
# Fit the forest on the training split; as the last expression of the cell the
# fitted estimator is also displayed.
rf_model.fit(X=X_train, y=y_train)

# `Testing performance`

In [6]:
# Predict class labels for the train, validation and test features (in that
# order) with the fitted forest.
y_train_pred, y_val_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_val, X_test)
)

In [7]:
# Accuracy (fraction of matching labels) for each split, pairing the true
# labels with the corresponding predictions.
train_accuracy, val_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

In [8]:
# Report the three accuracies as percentages with two decimals, one per line.
print(
    f"Training Accuracy: {train_accuracy * 100:.2f}%",
    f"Validation Accuracy: {val_accuracy * 100:.2f}%",
    f"Test Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)

Training Accuracy: 36.35%
Validation Accuracy: 12.98%
Test Accuracy: 13.21%


In [9]:
# Persist the fitted random forest to the Kaggle working directory.
# (Only this single model is trained in the notebook; no model selection
# precedes the save.) joblib.dump returns the list of written file names,
# which the cell displays.
joblib.dump(value=rf_model, filename='/kaggle/working/random_forest.pkl')

['/kaggle/working/random_forest.pkl']