In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/machine-learning-project/data/data_original.csv')

# drop id column from df
df = df.drop(columns=['id'])

# Separate features and target variable
X = df.drop(columns=['label'])  # All columns except 'label'
y = df['label']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Initialize KNN with default parameters
knn = KNeighborsClassifier()

# Fit the model to the training data
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5062111801242236
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.57      0.53       139
           1       0.00      0.00      0.00        27
           2       0.53      0.54      0.54       156

    accuracy                           0.51       322
   macro avg       0.34      0.37      0.36       322
weighted avg       0.47      0.51      0.49       322



In [3]:
# Set the parameters for GridSearchCV
param_grid = {
    'n_neighbors': range(1, 21),  # Test values for k from 1 to 20
    'weights': ['uniform', 'distance'],  # Two options for weights
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metrics
}

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)

# Fit the GridSearchCV to find the best parameters
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters
print("Best Parameters from Grid Search:", grid_search.best_params_)

# Get the best estimator (the best KNN model with the optimal parameters)
best_knn = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_knn.predict(X_test_scaled)
print("Optimized Accuracy:", accuracy_score(y_test, y_pred_best))
print("Optimized Classification Report:\n", classification_report(y_test, y_pred_best))


Best Parameters from Grid Search: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'distance'}
Optimized Accuracy: 0.5124223602484472
Optimized Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.47      0.48       139
           1       0.00      0.00      0.00        27
           2       0.54      0.63      0.58       156

    accuracy                           0.51       322
   macro avg       0.34      0.37      0.35       322
weighted avg       0.47      0.51      0.49       322



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
