# KNN CLASSIFIER - RANDOM SEARCH

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
import pandas as pd
from datetime import datetime
from time import time

### GLOBAL VARIABLES

In [13]:
DATAPATH = 'data/train_test/'
SEED = 47
NITER = 500
CV = 3
SCORE = 'balanced_accuracy'
usenull = False
NJOBS = -1

### LOAD DATASET

In [3]:
train_features = pd.read_pickle(DATAPATH+'X_train.pkl').values

In [6]:
train_features.shape

(148865, 1770)

In [4]:
train_labels = pd.read_pickle(DATAPATH+'y_train.pkl')['target'].values

#### Input Null values

In [5]:
if usenull == False:
    train_features[np.isnan(train_features)] = 0

### TRAIN MODEL

#### Set hyperparameters

In [7]:
# ======== General Parameters ======= #

# Number of neighbors to use.
n_neighbors  = [i for i in range(3,21,2)] 

# Weight function used in prediction. Possible values:
# uniform : uniform weights. All points in each neighborhood are weighted equally.
# distance : weight points by the inverse of their distance. in this case, closer neighbors of a query point 
#            will have a greater influence than neighbors which are further away.
weights = ['uniform', 'distance']

# the distance metric to use for the tree.
metric = ['euclidean', 'minkowski']

# Algorithm used to compute the nearest neighbors:
# - ball_tree will use BallTree
# - kd_tree will use KDTree
# - brute will use a brute-force search.
# - auto will attempt to decide the most appropriate algorithm based on the values passed to fit method.
algorithm = ['auto']

[KNN params](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [8]:
# Create the random grid
random_grid = {
    'n_neighbors' : n_neighbors,
    'weights' : weights,
    'metric' : metric,
    'algorithm': algorithm
}

In [9]:
random_grid

{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
 'weights': ['uniform', 'distance'],
 'metric': ['euclidean', 'minkowski'],
 'algorithm': ['auto']}

#### Training

In [14]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
model = KNeighborsClassifier()

In [15]:
# Random search of parameters, using CV fold cross validation, 
# search across NITER different combinations, and use all available cores
knn_rsearch = RandomizedSearchCV(estimator = model, param_distributions = random_grid, scoring=SCORE, n_iter = NITER, cv = CV, verbose=2, random_state=SEED, n_jobs = NJOBS)# Fit the random search model


In [16]:
start = time()
knn_rsearch.fit(train_features, train_labels)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), NITER))

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 1138.7min


KeyboardInterrupt: 

#### Saving results

In [None]:
cv_results = pd.DataFrame(knn_rsearch.cv_results_)

In [None]:
cv_results.to_csv('../models/rsearch_knn_classifier_d' + str(datetime.now().date()) + '.csv',sep=';',index=False)

#### Best estimator

In [None]:
knn_rsearch.best_estimator_

#### Best parameter

In [None]:
knn_rsearch.best_params_

#### Best Score

In [None]:
print(SCORE,' : ', knn_rsearch.best_score_)

#### Saving best hyperparameters

In [77]:
np.save('../models/knn_classifier_bestparams_d' + str(datetime.now().date()) + '.npy', knn_rsearch.best_params_)

In [3]:
import numpy as np

In [8]:

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

TypeError: 'rv_frozen' object is not iterable