# KNN - K Nearest Neighbour
* In this notebook we are going to train a model using KNN, experiment with various params using grid search and try to come up with best `KNN` version.

## Install Libraries

In [1]:
# %pip install scikit-learn

## Import Libraries

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,recall_score,precision_score,precision_recall_curve
import seaborn as sns


# Build an absolute path from this notebook's parent directory
module_path = os.path.abspath(os.path.join('..'))

# Add to sys.path if not already present
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.utils import preprocessing
from src.utils import common
from src.utils.training import refit_strategy

## Initialize Directories

In [3]:
data_root_dir = Path("..", "data/")
models_root_dir = Path("..", "models/")

## Read Data

In [4]:
X_train = pd.read_csv(Path(data_root_dir,"X_train.csv"))
y_train = pd.read_csv(Path(data_root_dir,"y_train.csv"))

## Training Default Model

In [5]:
# import sklearn


# sklearn.metrics.get_scorer_names() 

In [7]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.discriminant_analysis import StandardScaler



default_knn_model = KNeighborsClassifier()

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", MinMaxScaler()),
    ("prediction", default_knn_model)
])

scoring = ["recall", "precision", "f1"]

default_knn_scores = cross_validate(
    estimator=model_pipeline, 
    X=X_train, 
    y=y_train.values.ravel(), 
    cv=3, scoring=scoring,
    n_jobs=-1, verbose=2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


[CV] END .................................................... total time=   2.4s
[CV] END .................................................... total time=   2.3s
[CV] END .................................................... total time=   2.4s


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    4.4s finished


In [8]:
default_knn_scores

{'fit_time': array([1.14114475, 1.03985214, 1.04305696]),
 'score_time': array([1.20265126, 1.31944776, 1.32730246]),
 'test_recall': array([0.85697888, 0.84090909, 0.85146924]),
 'test_precision': array([0.79055485, 0.78336185, 0.8033355 ]),
 'test_f1': array([0.82242785, 0.81111603, 0.82670233])}

In [9]:
cv_scores = default_knn_scores


In [10]:
mean_recall,mean_precision,mean_f1 = common.calculate_mean_from_cv(default_knn_scores)

Mean Recall: 0.8497857361493725, Mean Precision: 0.8497857361493725,Mean F1: 0.820082069546663


In [12]:
# commenting this code out to avoid overwriting the metrics file. 
common.update_models_metrics("KNN", "v0", mean_recall,mean_precision,mean_f1)

Unnamed: 0,model,version,recall,precision,f1,file
0,Logistic Regression,v0,0.885369,0.885369,0.870934,
1,Logistic Regression,v1,0.885675,0.85711,0.87114,logistic_regression_v1.joblib
2,Logistic Regression,v2,0.886746,0.857321,0.871766,logistic_regression_v2.joblib
3,Linear SVC,v0,0.887511,0.887511,0.871667,
4,Linear SVC,v1,0.887435,0.8564,0.871625,linear_svc_v1.joblib
5,Linear SVC,v2,0.887894,0.856845,0.872076,linear_svc_v2.joblib
6,Random Forest,v0,0.87787,0.87787,0.862945,
7,Random Forest,v1,0.881772,0.848675,0.864893,random_forest_v1.joblib
8,Random Forest,v0,0.877946,0.877946,0.862862,
9,Random Forest,v1,0.882844,0.849042,0.865594,random_forest_v1.joblib


Observations:
* With average recall of `0.84` and average precision of `0.84`, the default KNN is under performing compared to all the other models. 

## GridSearch CV v1

In [13]:
## checking params
# preprocessing.pipeline.get_params()

In [15]:
from sklearn.model_selection import GridSearchCV


model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", KNeighborsClassifier())
])

scoring = ["recall", "precision", "f1"]

param_grid = {
    "prediction__n_neighbors": [3, 5, 10, 15, 20],
    "prediction__weights": ["uniform", "distance"],
    "prediction__algorithm": ["ball_tree","kd_tree","brute"]    
}

grid_search = GridSearchCV(model_pipeline, param_grid,
                           scoring=scoring, cv=3, n_jobs=-1, refit=refit_strategy)
grid_search.fit(X_train, y_train.values.ravel())

In [16]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Scores : {grid_search.best_index_}")
average_recall,average_precision,average_f1 = common.read_best_mean_grid_search_metrics(grid_search.cv_results_,grid_search.best_index_)

Best Parameters: {'prediction__algorithm': 'brute', 'prediction__n_neighbors': 20, 'prediction__weights': 'uniform'}
Best Scores : 28
Mean Recall: 0.8990664217936946, Mean Precision: 0.8024758004402274,Mean F1: 0.8480084476468495


In [17]:
# commenting this code out to avoid overwriting the metrics file.
_, file_name = common.save_model(
    "KNN", "v1", grid_search.best_estimator_)
common.update_models_metrics("KNN", "v1", average_recall,
                             average_precision, average_f1, file_name=file_name)
common.update_model_params(
    "KNN", "v1", grid_search.best_params_)

[{'name': 'LogisticRegression',
  'version': 'v1',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l1',
   'prediction__solver': 'liblinear'}},
 {'name': 'LogisticRegression',
  'version': 'v2',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l2',
   'prediction__solver': 'saga',
   'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal',
   'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal',
   'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal',
   'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal',
   'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot',
   'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}},
 {'name': 'LinearSVC',
  'version': 'v1',
  'params': {'prediction__C': 10, 'prediction__penalty': 'l1'}},
 {'name': 'LinearSVC',
  'versi

* We accidently created another experiment here, instead of using MinMaxScaller we ended up using StandardScaller, so lets try to run the same experiment but with min max scaler and see if it affects the metrics. 

## GridSearch CV V2

In [18]:
from sklearn.model_selection import GridSearchCV


model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", MinMaxScaler()),
    ("prediction", KNeighborsClassifier())
])

scoring = ["recall", "precision", "f1"]

param_grid = {
    "prediction__n_neighbors": [3, 5, 10, 15, 20],
    "prediction__weights": ["uniform", "distance"],
    "prediction__algorithm": ["ball_tree","kd_tree","brute"]    
}

grid_search = GridSearchCV(model_pipeline, param_grid,
                           scoring=scoring, cv=3, n_jobs=-1, refit=refit_strategy)
grid_search.fit(X_train, y_train.values.ravel())

In [19]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Scores : {grid_search.best_index_}")
average_recall,average_precision,average_f1 = common.read_best_mean_grid_search_metrics(grid_search.cv_results_,grid_search.best_index_)

Best Parameters: {'prediction__algorithm': 'brute', 'prediction__n_neighbors': 20, 'prediction__weights': 'uniform'}
Best Scores : 28
Mean Recall: 0.8764156718702174, Mean Precision: 0.8117210677437717,Mean F1: 0.8428168318383106


* Slight difference, but nothing significant. We'll still save it for records and may be use it in ensemble. 

In [20]:
# commenting this code out to avoid overwriting the metrics file.
_, file_name = common.save_model(
    "KNN", "v2", grid_search.best_estimator_)
common.update_models_metrics("KNN", "v2", average_recall,
                             average_precision, average_f1, file_name=file_name)
common.update_model_params(
    "KNN", "v2", grid_search.best_params_)

[{'name': 'LogisticRegression',
  'version': 'v1',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l1',
   'prediction__solver': 'liblinear'}},
 {'name': 'LogisticRegression',
  'version': 'v2',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l2',
   'prediction__solver': 'saga',
   'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal',
   'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal',
   'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal',
   'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal',
   'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot',
   'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}},
 {'name': 'LinearSVC',
  'version': 'v1',
  'params': {'prediction__C': 10, 'prediction__penalty': 'l1'}},
 {'name': 'LinearSVC',
  'versi

## GridSearch CV v3

In [None]:
preprocessing.pipeline.get_params(deep=True)

In [21]:

model_pipeline = Pipeline([
    ("preprocessing", preprocessing.pipeline),
    ("normalizing", StandardScaler()),
    ("prediction", KNeighborsClassifier())
])

scoring = ["recall", "precision", "f1"]

## experiment between onehot and ordinal encoding of various features.

param_grid = {
    "preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding":["onehot", "ordinal"],
    "preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding":["onehot", "ordinal"],
    "prediction__n_neighbors": [3, 5, 10, 15, 20],
    "prediction__weights": ["uniform", "distance"],
    "prediction__algorithm": ["ball_tree","kd_tree","brute"]
}

grid_search = GridSearchCV(model_pipeline, param_grid, scoring=scoring, cv=3,n_jobs=-1,refit=refit_strategy)
grid_search.fit(X_train, y_train.values.ravel())


In [22]:
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Scores : {grid_search.best_index_}")
average_recall,average_precision,average_f1 = common.read_best_mean_grid_search_metrics(grid_search.cv_results_,grid_search.best_index_)

Best Parameters: {'prediction__algorithm': 'ball_tree', 'prediction__n_neighbors': 20, 'prediction__weights': 'distance', 'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'onehot', 'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'onehot', 'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal', 'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal', 'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'ordinal', 'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}
Best Scores : 591
Mean Recall: 0.9012090602999695, Mean Precision: 0.8046516521086527,Mean F1: 0.8501819441954491


In [23]:
# commenting this code out to avoid overwriting the metrics file.
_, file_name = common.save_model(
    "KNN", "v3", grid_search.best_estimator_)
common.update_models_metrics("KNN", "v3", average_recall,
                             average_precision, average_f1, file_name=file_name)
common.update_model_params(
    "KNN", "v3", grid_search.best_params_)

[{'name': 'LogisticRegression',
  'version': 'v1',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l1',
   'prediction__solver': 'liblinear'}},
 {'name': 'LogisticRegression',
  'version': 'v2',
  'params': {'prediction__C': 1,
   'prediction__penalty': 'l2',
   'prediction__solver': 'saga',
   'preprocessing__age_pipeline__age_encoding__age_range_encoding__encoding': 'ordinal',
   'preprocessing__cgpa_pipeline__cgpa_encoding__cgpa_range_encoding__encoding': 'ordinal',
   'preprocessing__degree_pipeline__degree_encoding__degree_level_encoding__encoding': 'ordinal',
   'preprocessing__dietary_habits_pipeline__dietary_habits_encoding__encoding': 'ordinal',
   'preprocessing__hours_pipeline__hours_encoding__hours_range_encoding__encoding': 'onehot',
   'preprocessing__sleep_duration_pipeline__sleep_duration_encoding__encoding': 'ordinal'}},
 {'name': 'LinearSVC',
  'version': 'v1',
  'params': {'prediction__C': 10, 'prediction__penalty': 'l1'}},
 {'name': 'LinearSVC',
  'versi

Observations:
* V3 of KNN seems to be the best model we got so far. The recall seems to be significantly higher, and precision seems to better than models with high recals. 