In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score, log_loss

In [18]:
# Read HR_comma_sep.csv (path is relative to the notebook's working directory)
# and preview the first two rows to confirm the columns parsed as expected.
csv_path = 'HR_comma_sep.csv'
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium


In [19]:
# One-hot encode the non-numeric columns of `df`. drop_first=True omits the
# first level of each encoded column, so k categories become k-1 indicator
# columns and the dummies are not perfectly collinear.
dum_hr = pd.get_dummies(data=df, drop_first=True)
dum_hr.head(2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,1,0,False,False,False,False,False,False,True,False,False,True,False
1,0.8,0.86,5,262,6,0,1,0,False,False,False,False,False,False,True,False,False,False,True


In [20]:
# Separate the target from the predictors: `left` is the label, every other
# column of the encoded frame is used as a feature.
y = dum_hr['left']
X = dum_hr.drop(columns='left')

In [21]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=24,
)

In [22]:
# Tune k (1..10) for KNN on the unscaled features using 5-fold stratified CV.
# Scoring is negative log loss, so values closer to zero are better.
# Note that the search is fitted on the full X, y rather than on the train split.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
knn = KNeighborsClassifier()
params = {'n_neighbors': list(range(1, 11))}

gcv = GridSearchCV(
    estimator=knn,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print(gcv.best_params_, gcv.best_score_, sep='\n')

{'n_neighbors': 10}
-0.468508329142402


## Scaling

In [37]:
# Baseline: standardise the features, fit a 5-neighbour KNN on the training
# rows, and report log loss on the held-out test rows.
#
# The classifier is instantiated here so the cell does not depend on whichever
# `knn` object happens to be bound from an earlier cell in the kernel.
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows so no test-set statistics enter training.
scaler = StandardScaler()
X_train_scl = scaler.fit_transform(X_train)
X_test_scl = scaler.transform(X_test)

knn.fit(X_train_scl, y_train)
y_pred_prob = knn.predict_proba(X_test_scl)

# Column 1 is the probability of the second class (left == 1 for a 0/1
# target); log_loss accepts this single positive-class column for binary y.
print(log_loss(y_test, y_pred_prob[:,1]))

0.7175772218036754


## Using Pipeline

In [39]:
# Same scale-then-KNN model expressed as one Pipeline: fit() fits the scaler
# on the training rows and then the classifier on the scaled output, and
# predict_proba() re-applies the fitted scaler before predicting.
pipe = Pipeline(steps=[('SCL', scaler), ('KNN', knn)])
pipe.fit(X_train, y_train)

y_pred_prob = pipe.predict_proba(X_test)
pos_prob = y_pred_prob[:, 1]
print(log_loss(y_test, pos_prob))

0.7175772218036754


## Pipeline with GridSearchCV

In [44]:
# Tune k for the StandardScaler -> KNN pipeline with 5-fold stratified CV,
# scored by negative log loss (closer to zero is better).
#
# The classifier and the pipeline are both built here, before the search, so
# the `knn` bound in this cell is the instance that sits inside `pipe`.
knn = KNeighborsClassifier()
pipe = Pipeline([('SCL', scaler), ('KNN', knn)])

# Parameters of a pipeline step are addressed as <step name>__<parameter>,
# hence the 'KNN__' prefix matching the step name given above.
params = {'KNN__n_neighbors': np.arange(1, 40)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv.fit(X,y)

# NOTE(review): the recorded run selected k=39, the upper edge of this grid;
# a wider range may be worth checking.
print(gcv.best_params_)
print(gcv.best_score_)

{'KNN__n_neighbors': 39}
-0.3329779348050666


## Using MinMaxScaler

In [47]:
# Swap the standardiser for min-max scaling, refit on the training rows and
# report log loss on the held-out test rows.
mnx_scaler = MinMaxScaler()
pipe = Pipeline(steps=[('SCL', mnx_scaler), ('KNN', knn)])
pipe.fit(X_train, y_train)

y_pred_prob = pipe.predict_proba(X_test)
pos_prob = y_pred_prob[:, 1]
print(log_loss(y_test, pos_prob))

0.688465995800685


## Using MinMaxScaler with GridSearchCV

In [50]:
# Tune k for the MinMaxScaler -> KNN pipeline with the same 5-fold stratified
# CV and negative-log-loss scoring used for the other searches.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

mnx_scaler = MinMaxScaler()
pipe = Pipeline(steps=[('SCL', mnx_scaler), ('KNN', knn)])

# 'KNN__' prefix = name of the pipeline step whose parameter is being tuned.
params = {'KNN__n_neighbors': np.arange(1, 40)}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print(gcv.best_params_, gcv.best_score_, sep='\n')

{'KNN__n_neighbors': 39}
-0.35124560125812215


## Comparing all scaling options in one grid search

In [49]:
# Treat the scaling step itself as a hyper-parameter: the 'SCL' slot is
# searched over no scaling (None), the StandardScaler and the MinMaxScaler,
# crossed with every k in 1..39.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)

# The 'SCL' step starts as None (no transform); the grid fills it in.
pipe = Pipeline(steps=[('SCL', None), ('KNN', knn)])
params = {
    'KNN__n_neighbors': np.arange(1, 40),
    'SCL': [None, scaler, mnx_scaler],
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
)
gcv.fit(X, y)

print(gcv.best_params_, gcv.best_score_, sep='\n')

{'KNN__n_neighbors': 38, 'SCL': None}
-0.2855897663566085
