### Import

In [41]:
seed = 17
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from scipy import stats
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import pickle

### Load Data

In [42]:
# LOAD DATA
# Lists of observations and targets
Xs = []
ys = []

Xs.append(pd.read_csv("../data/credit_X.csv"))
Xs.append(pd.read_csv("../data/raisin_X.csv"))
Xs.append(pd.read_csv("../data/alzheimer_X.csv"))
Xs.append(pd.read_csv("../data/salary_X.csv"))

ys.append(pd.read_csv("../data/credit_y.csv").iloc[:, 0])
ys.append(pd.read_csv("../data/raisin_y.csv").iloc[:, 0])
ys.append(pd.read_csv("../data/alzheimer_y.csv").iloc[:, 0])
ys.append(pd.read_csv("../data/salary_y.csv").iloc[:, 0])

### Functions

In [43]:
def tune_with_random_search(model, search_grid, n_iter, scoring):
    opt_results = []
    for i in range(4):
        # in my case works faster with n_jobs=-1
        opt = RandomizedSearchCV(model, search_grid, n_iter=n_iter, cv=5, scoring=scoring, random_state=seed, n_jobs=-1)
        opt_results.append(opt.fit(Xs[i], ys[i]))
    return opt_results

def tune_with_bayes_search(model, search_grid, n_iter, scoring):
    opt_results = []
    for i in range(4):
        opt = BayesSearchCV(model, search_grid, n_iter=n_iter, cv=5, scoring=scoring, random_state=seed)
        opt_results.append(opt.fit(Xs[i], ys[i]))
    return opt_results

def get_tunability(model_class, default_grid, tuned_grids, scoring):
    tunabilities = []
    for i in range(4):
        default_score = cross_val_score(model_class(**default_grid), Xs[i], ys[i], cv=5, scoring=scoring, n_jobs=-1).mean()
        tuned_score = cross_val_score(model_class(**tuned_grids[i].best_params_), Xs[i], ys[i], cv=5, scoring=scoring, n_jobs=-1).mean()
        tunabilities.append(tuned_score - default_score)
    return tunabilities

def save_result(result, path_to_save):
    with open(path_to_save, 'wb') as file:
        pickle.dump(result, file)

def load_result(path):
    with open(path, 'rb') as file:
        return pickle.load(file)

### KNN with MinMaxScaler

In [57]:
random_seed = 17
pipe_knn = Pipeline(
    [
        ('scaler', MinMaxScaler()),
        ('knn', KNeighborsClassifier())
    ]

)

search_grid = {
    'knn__n_neighbors': [i for i in range(1, 100)],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [i for i in range(1,4)], 
    'knn__leaf_size': [i for i in range(10, 51, 2)], 
    'knn__n_jobs': [-1],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

In [58]:
%%time

knn_opt_results_rs = tune_with_random_search(pipe_knn, search_grid,100, 'roc_auc')

CPU times: user 731 ms, sys: 356 ms, total: 1.09 s
Wall time: 16.2 s


In [None]:
#save_result(knn_opt_results_rs,'./saved_results/knn_opt_results_rs.pkl')

In [59]:
for i in range(4):
    print(knn_opt_results_rs[i].best_params_)

{'knn__weights': 'uniform', 'knn__p': 1, 'knn__n_neighbors': 97, 'knn__n_jobs': -1, 'knn__leaf_size': 24, 'knn__algorithm': 'kd_tree'}
{'knn__weights': 'uniform', 'knn__p': 1, 'knn__n_neighbors': 27, 'knn__n_jobs': -1, 'knn__leaf_size': 30, 'knn__algorithm': 'kd_tree'}
{'knn__weights': 'distance', 'knn__p': 1, 'knn__n_neighbors': 94, 'knn__n_jobs': -1, 'knn__leaf_size': 18, 'knn__algorithm': 'ball_tree'}
{'knn__weights': 'distance', 'knn__p': 1, 'knn__n_neighbors': 66, 'knn__n_jobs': -1, 'knn__leaf_size': 32, 'knn__algorithm': 'kd_tree'}


In [60]:
knn_tunabilities_rs = get_tunability(pipe_knn, {'knn__n_jobs': -1}, knn_opt_results_rs, 'roc_auc')
knn_tunabilities_rs

TypeError: 'Pipeline' object is not callable