In [None]:
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA


# `Load datasets`

In [None]:
X_train = np.load('/kaggle/input/pit-classification/distance_vectors_train.npy')
X_test = np.load('/kaggle/input/pit-classification/distance_vectors_test.npy')
X_val = np.load('/kaggle/input/pit-classification/distance_vectors_val.npy')
y_train = np.load('/kaggle/input/pit-classification/train_labels.npy')
y_test = np.load('/kaggle/input/pit-classification/test_labels.npy')
y_val = np.load('/kaggle/input/pit-classification/val_labels.npy')

# `Convert one-hot encoded labels to class indices`

In [None]:
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
y_val_labels = np.argmax(y_val, axis=1)

# `Standardize features`

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

# `Apply PCA for dimensionality reduction`

In [None]:
pca = PCA(n_components=2000)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
X_val_pca = pca.transform(X_val_scaled)

# `Logistic Regression`

In [5]:
logreg_model = LogisticRegression(multi_class='multinomial', max_iter=500, solver='lbfgs', random_state=42)
logreg_model.fit(X_train_pca, y_train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# `Save model, scaler, and PCA`

In [None]:
joblib.dump(logreg_model, "/kaggle/working/logreg.pkl")
joblib.dump(scaler, "/kaggle/working/scaler.pkl")
joblib.dump(pca, "/kaggle/working/pca.pkl")

['/kaggle/working/pca.pkl']

# `Testing performance`

In [None]:
# Predictions for each dataset
y_train_pred = logreg_model.predict(X_train_pca)
y_val_pred = logreg_model.predict(X_val_pca)
y_test_pred = logreg_model.predict(X_test_pca)

In [None]:
# Compute accuracy for train, validation, and test sets
train_accuracy = accuracy_score(y_train_labels, y_train_pred)
val_accuracy = accuracy_score(y_val_labels, y_val_pred)
test_accuracy = accuracy_score(y_test_labels, y_test_pred)

In [None]:
# Print accuracies
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Training Accuracy: 74.80%
Validation Accuracy: 33.13%
Test Accuracy: 33.10%
