In [None]:
import joblib
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# `Load datasets`

In [2]:
# Directory that holds the pre-computed feature and label arrays.
# Change this single constant to run the notebook outside Kaggle.
DATA_DIR = '/kaggle/input/pit-classification'

# Feature matrices (one file per split).
X_train = np.load(f'{DATA_DIR}/distance_vectors_train.npy')
X_test = np.load(f'{DATA_DIR}/distance_vectors_test.npy')
X_val = np.load(f'{DATA_DIR}/distance_vectors_val.npy')

# Label arrays (one file per split).
y_train = np.load(f'{DATA_DIR}/train_labels.npy')
y_test = np.load(f'{DATA_DIR}/test_labels.npy')
y_val = np.load(f'{DATA_DIR}/val_labels.npy')

# Fail fast if a feature file and its label file are out of sync, instead of
# surfacing the mismatch much later inside a fit/score call.
assert X_train.shape[0] == y_train.shape[0], "train features/labels have different row counts"
assert X_val.shape[0] == y_val.shape[0], "val features/labels have different row counts"
assert X_test.shape[0] == y_test.shape[0], "test features/labels have different row counts"

# `Convert one-hot encoded labels to class indices`

In [None]:
# Collapse each 2-D label matrix to a 1-D vector of class indices (argmax over axis 1).
#
# The cell overwrites its own inputs, so without the ndim guard a second run would
# call np.argmax(..., axis=1) on an already 1-D array and raise AxisError. With the
# guard, re-running the cell is a no-op.
y_train = np.argmax(y_train, axis=1) if y_train.ndim > 1 else y_train
y_test = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test
y_val = np.argmax(y_val, axis=1) if y_val.ndim > 1 else y_val

# `Standardize features`

In [None]:
# Fit the scaler on the training split only, then push every split through the
# same fitted transformation so validation/test data never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# `Apply PCA for dimensionality reduction`

In [None]:
# Reduce the standardized features to 500 principal components.
# The projection is fitted on the training split only and reused unchanged for
# the validation and test splits.
#
# random_state is pinned because PCA's default svd_solver='auto' may pick the
# randomized solver, which would otherwise yield slightly different components
# (and therefore different downstream results) on every run. The seed matches
# the one used for SMOTE in this notebook.
pca = PCA(n_components=500, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
X_val_pca = pca.transform(X_val_scaled)

In [None]:
# Oversample the PCA-reduced training split with SMOTE; the validation and test
# splits are left untouched.
# NOTE(review): the resampled arrays contain synthetic rows derived from the whole
# training split, so they are appropriate for a final fit but presumably not for
# cross-validation (held-out folds would share information with training folds).
smote = SMOTE(
    sampling_strategy='auto',
    random_state=42,
)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train_pca, y=y_train)

In [None]:
# Hyper-parameter search for the k-NN classifier.
#
# SMOTE is placed *inside* the cross-validated pipeline so that, for every fold,
# synthetic samples are generated from that fold's training part only.
# Cross-validating on data that was oversampled up front lets synthetic points
# interpolated from held-out rows end up in the training folds, which inflates
# the CV accuracy and biases the choice of hyper-parameters.
#
# NOTE(review): SMOTE's default k_neighbors=5 needs more than five samples of each
# class in every training fold; if some class is rarer than that, individual fits
# will fail and be scored as NaN -- confirm against the class counts.
param_grid = {
    'n_neighbors': [3, 5, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

search_pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    # Parallelism is handled by GridSearchCV below; giving the inner estimator
    # n_jobs=-1 as well would oversubscribe the CPU.
    ('knn', KNeighborsClassifier()),
])

grid_search = GridSearchCV(
    search_pipeline,
    # Route each k-NN parameter to the 'knn' step of the pipeline.
    {f'knn__{name}': values for name, values in param_grid.items()},
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Fit on the *un-resampled* training data; the pipeline oversamples per fold.
grid_search.fit(X_train_pca, y_train)

# Strip the 'knn__' step prefix so later cells can keep reading plain
# KNeighborsClassifier keys ('n_neighbors', 'weights', 'p').
best_params = {
    key.split('__', 1)[1]: value
    for key, value in grid_search.best_params_.items()
}

In [9]:
# Report the hyper-parameter combination selected by the grid search.
print(
    f"Best K: {best_params['n_neighbors']}, Best Weight: {best_params['weights']}, Best p: {best_params['p']}"
)

Best K: 3, Best Weight: distance, Best p: 1


In [None]:
# Build the final (still unfitted) classifier from the tuned hyper-parameters.
# Only the three searched keys are forwarded; n_jobs=-1 is set explicitly.
knn_model = KNeighborsClassifier(
    n_jobs=-1,
    **{name: best_params[name] for name in ('n_neighbors', 'weights', 'p')},
)

In [11]:
# Fit the tuned classifier on the SMOTE-resampled training data.
knn_model.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Persist the fitted classifier together with the scaler and PCA objects that
# produced its input features, so the same preprocessing can be replayed later.
joblib.dump(value=knn_model, filename="knn_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")
joblib.dump(value=pca, filename="pca.pkl")

['pca.pkl']

# `Testing performance`

In [None]:
# Predict class indices for every split from its PCA-reduced features.
# The training predictions use the original (non-resampled) training rows.
y_train_pred, y_val_pred, y_test_pred = (
    knn_model.predict(features)
    for features in (X_train_pca, X_val_pca, X_test_pca)
)

In [None]:
# Fraction of correctly classified samples per split (true labels vs. predictions).
train_accuracy, val_accuracy, test_accuracy = (
    accuracy_score(y_true, y_pred)
    for y_true, y_pred in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

In [None]:
# Print the three accuracies as percentages with two decimals, one per line.
# NOTE(review): the recorded run shows a training accuracy far below what a k-NN
# model usually reaches on rows it was (presumably) fitted on -- worth checking
# that labels line up with features and that the saved outputs are not stale.
print(
    f"Training Accuracy: {train_accuracy * 100:.2f}%",
    f"Validation Accuracy: {val_accuracy * 100:.2f}%",
    f"Test Accuracy: {test_accuracy * 100:.2f}%",
    sep="\n",
)

Training Accuracy: 31.67%
Validation Accuracy: 18.88%
Test Accuracy: 19.13%
