In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score, log_loss

In [2]:
# Load the HR dataset from the notebook's working directory, then preview the
# first two rows as the cell's displayed output.
df = pd.read_csv(filepath_or_buffer='HR_comma_sep.csv')
df.head(n=2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium


In [3]:
# One-hot encode the string-valued columns; drop_first=True omits one dummy
# column per encoded feature.
dum_hr = pd.get_dummies(df, drop_first=True)

# Target is the `left` column; every remaining column is a predictor.
y = dum_hr['left']
X = dum_hr.drop(columns='left')

# 70/30 hold-out split, stratified on the target, with a fixed seed so the
# same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [9]:
# Base KNN estimator shared by the cells below. n_neighbors=5 is scikit-learn's
# documented default, spelled out here; the grid searches override it per candidate.
knn = KNeighborsClassifier(n_neighbors=5)

## Without Scaling

In [10]:
# Tune k for KNN on the raw (unscaled) features using 5-fold stratified,
# shuffled cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# Candidate values k = 1 .. 39 (the stop value 40 is exclusive).
params = {'n_neighbors': np.arange(1, 40)}

# Candidates are ranked by negative log loss, so scores closer to 0 are better.
gcv = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Winning parameter setting and its cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'n_neighbors': 38}
-0.2855897663566085


## With StandardScaler

In [6]:
# --- Manual route -----------------------------------------------------------
# Fit the standardiser on the training split only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler()
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)
knn.fit(X_train_scl, y_train)

# --- Pipeline route ---------------------------------------------------------
# Same scaler and classifier objects bundled into one estimator and fitted on
# the raw training split (Pipeline.fit returns the pipeline itself).
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)]).fit(X_train, y_train)

# Rebind `knn` to a fresh estimator; `pipe` keeps the instance it was built with.
knn = KNeighborsClassifier()

# --- Grid search over k on the scaled pipeline -------------------------------
# NOTE(review): GridSearchCV is documented to clone the estimator it is given,
# so the fits above should not influence the search result — confirm whether
# they are still needed by later cells.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# The 'KNN__' prefix routes the parameter to the pipeline step named 'KNN'.
params = {'KNN__n_neighbors': np.arange(1, 40)}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Winning parameter setting and its cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'KNN__n_neighbors': 39}
-0.3329779348050666


## With MinMaxScaler

In [7]:
# Min-max scaling + KNN bundled into a single estimator.
mnx_scaler = MinMaxScaler()
pipe = Pipeline(steps=[('SCL', mnx_scaler), ('KNN', knn)])

# Fit on the training split and keep the class probabilities predicted for the
# hold-out split (Pipeline.fit returns the pipeline, so the calls can be chained).
# NOTE(review): y_pred_prob is not read again in this cell — confirm it is used later.
y_pred_prob = pipe.fit(X_train, y_train).predict_proba(X_test)

# Grid search over k = 1 .. 39 with 5-fold stratified, shuffled CV (fixed seed),
# ranked by negative log loss. The 'KNN__' prefix targets the 'KNN' pipeline step.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
params = {'KNN__n_neighbors': np.arange(1, 40)}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

# Winning parameter setting and its cross-validated score, one per line.
print(gcv.best_params_, gcv.best_score_, sep='\n')

{'KNN__n_neighbors': 39}
-0.35124560125812215
