# Cross-Validation

#### Sample KNN Model to demonstrate Cross-Validation techniques below

In [1]:
# Imports needed by the baseline KNN cell, grouped up front.
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Load the sample file and separate the ten feature columns from the label column.
feature_columns = ['XVPM', 'GWYH', 'TRAT', 'TLLZ', 'IGGA', 'HYKR', 'EDFS', 'GUUB', 'MGJM', 'JHZC']
data = pd.read_csv("./Data Files/KNN_Test_File.csv")
X = data[feature_columns]
y = data["TARGET CLASS"]
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1111
)
# Standardize float64/int64 columns using statistics computed on the training split only,
# then apply those same statistics to the test split so no test information leaks in.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
for col in X_train.columns:
    if X_train[col].dtype not in ("float64", "int64"):
        continue  # leave columns of any other dtype unscaled
    col_mean, col_std = X_train[col].mean(), X_train[col].std()
    X_train_scaled[col] = (X_train[col] - col_mean) / col_std
    X_test_scaled[col] = (X_test[col] - col_mean) / col_std
# Fit the baseline classifier on the scaled training data and predict the hold-out rows.
knn_model = KNeighborsClassifier(n_neighbors=5)  # Can increase the number of neighbors
knn_model.fit(X_train_scaled, y_train)
y_pred = knn_model.predict(X_test_scaled)
# Report the confusion matrix, overall accuracy, and per-class metrics on the hold-out set.
print("Base KNN Model", "\n")
base_confusion = pd.DataFrame(confusion_matrix(y_test, y_pred))
base_accuracy = round(accuracy_score(y_test, y_pred), 3)
print(base_confusion, "      Accuracy:", base_accuracy, "\n")
print(classification_report(y_test, y_pred))

Base KNN Model 

     0    1
0  113   36
1   28  123       Accuracy: 0.787 

             precision    recall  f1-score   support

          0       0.80      0.76      0.78       149
          1       0.77      0.81      0.79       151

avg / total       0.79      0.79      0.79       300



#### Traditional Cross-Validation for Model Validation

In [6]:
from sklearn.model_selection import cross_val_score

# Evaluate the baseline KNN model with 5-fold cross-validation on the training split only.
# scoring="accuracy" selects the metric; n_jobs=-1 lets scikit-learn use all available cores.
cv_settings = {"cv": 5, "scoring": "accuracy", "n_jobs": -1}
cv_scores = cross_val_score(knn_model, X_train_scaled, y_train, **cv_settings)

# Summarize the per-fold scores as a single mean accuracy, rounded to four decimals.
print("Cross Validation Results", "\n")
avg_cv_accuracy = round(cv_scores.mean(), 4)
print("Avg. Accuracy:", avg_cv_accuracy)

Cross Validation Results 

Avg. Accuracy: 0.8043


#### Using Grid Search to tune parameters

In [8]:
from sklearn.model_selection import GridSearchCV
# Candidate neighborhood sizes to try for the KNN model.
parameters = {"n_neighbors": [5, 10, 15, 20]}  # Adjust these values over and over
# Pin the number of folds explicitly: the default used when cv is omitted changed from
# 3-fold to 5-fold in scikit-learn 0.22, so leaving it out makes the search (and the
# selected n_neighbors) depend on the installed version. cv = 3 matches the
# "Fitting 3 folds for each of 4 candidates" run recorded in this cell's output.
grid = GridSearchCV(knn_model, parameters, cv = 3, verbose = 2)
# Fit on the scaled training split only; the test split stays untouched until final evaluation.
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] n_neighbors=5 ...................................................
[CV] .................................... n_neighbors=5, total=   0.0s
[CV] n_neighbors=5 ...................................................
[CV] .................................... n_neighbors=5, total=   0.0s
[CV] n_neighbors=5 ...................................................
[CV] .................................... n_neighbors=5, total=   0.0s
[CV] n_neighbors=10 ..................................................
[CV] ................................... n_neighbors=10, total=   0.0s
[CV] n_neighbors=10 ..................................................
[CV] ................................... n_neighbors=10, total=   0.0s
[CV] n_neighbors=10 ..................................................
[CV] ................................... n_neighbors=10, total=   0.0s
[CV] n_neighbors=15 ..................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.2s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [9]:
# Show the parameter setting that scored best during the grid search above.
grid.best_params_

{'n_neighbors': 15}

In [12]:
# Predict the hold-out rows with the fitted grid search object, then report the same
# metrics as for the baseline model so the two runs can be compared side by side.
grid_y_pred = grid.predict(X_test_scaled)
print("Optimized KNN Model using Grid Search", "\n")
grid_confusion = pd.DataFrame(confusion_matrix(y_test, grid_y_pred))
grid_accuracy = round(accuracy_score(y_test, grid_y_pred), 4)
print(grid_confusion, "      Accuracy:", grid_accuracy, "\n")
print(classification_report(y_test, grid_y_pred))

Optimized KNN Model using Grid Search 

     0    1
0  120   29
1   26  125       Accuracy: 0.8167 

             precision    recall  f1-score   support

          0       0.82      0.81      0.81       149
          1       0.81      0.83      0.82       151

avg / total       0.82      0.82      0.82       300

