# KNN Nutcracker
Conny Lin | June 11, 2020 | update June 16, 2020

In [1]:
# import local libraries using host specific paths
import socket, sys, time, datetime, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Resolve where the project code is checked out on this machine.
# Each known host keeps the repository in a different folder, so the path is
# looked up by short hostname (the part before the first dot).
hostname = socket.gethostname().split('.')[0]
_PYLIBRARY_BY_HOST = {
    'PFC': '/Users/connylin/Dropbox/Code/proj',
    'Angular-Gyrus': '/Users/connylin/Code/proj',
}
if hostname not in _PYLIBRARY_BY_HOST:
    # Raise explicitly instead of `assert False`: assert statements are removed
    # when Python runs with -O, which would leave `pylibrary` undefined and
    # surface later as an unrelated NameError.
    raise RuntimeError(
        f'host computer not recognized: {hostname!r} '
        f'(known hosts: {sorted(_PYLIBRARY_BY_HOST)})'
    )
pylibrary = _PYLIBRARY_BY_HOST[hostname]

# import local variables
# Make the project checkout importable. Inserting at index 1 (not 0) leaves the
# first sys.path entry where it is, and the membership test stops repeated runs
# of this cell from adding the same path again.
if pylibrary not in sys.path:
    sys.path.insert(1, pylibrary)
# Project helpers: test_model and ml_timer drive the tuning loop further down.
from brainstation_capstone.ml.toolbox.mlSOP import test_model
from brainstation_capstone.ml.toolbox.mlSOP import ml_timer
# NOTE(review): ModelEvaluation is imported but not used in the cells visible here.
from brainstation_capstone.ml.toolbox.mlSOP import ModelEvaluation
from brainstation_capstone.system import host_paths
# Host-specific path lookup; the result is indexed with the 'Capstone' key below.
localpaths = host_paths.get(hostname)
# NOTE(review): data_dir is assigned here but not read by any cell visible here.
data_dir = os.path.join(localpaths['Capstone'], 'data')

# report latest run
# NOTE(review): datetime.now() returns naive local time and the 'PT' suffix is
# hard-coded, so the label is only right when the host clock is on Pacific time.
print(f'last ran on: {datetime.datetime.now()} PT')

getting host computer specific paths
None
None


In [None]:

# import data
from brainstation_capstone.etl.loaddata import nutcracker
DATA_NAMES = ['X_train', 'X_test', 'y_train', 'y_test']
data = nutcracker(localpaths, 'nutcracker', DATA_NAMES)
# The rough-tune cells read X_train / X_test / y_train / y_test as plain
# variables, but no cell defined them, so a fresh "Restart & Run All" stopped
# with a NameError. Bind them here, right after loading.
# NOTE(review): assumes `data` can be indexed by the requested names
# (dict-like) - confirm against loaddata.nutcracker.
X_train, X_test, y_train, y_test = (data[name] for name in DATA_NAMES)

## rough tune - takes forever to run. Discard this option

In [None]:
# rough tune
# Baseline: one 3-nearest-neighbour classifier, scored on both splits.
from sklearn.neighbors import KNeighborsClassifier
KNN_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
print('finished fitting model')
# Scoring a KNN model runs a neighbour query for every row, so each score is
# computed once and kept in a name before it is printed.
train_score = KNN_model.score(X_train, y_train)
print(f'train score: {train_score}')
test_score = KNN_model.score(X_test, y_test)
print(f'test score: {test_score}')

finished fitting model


In [None]:
# rough tune
# Grid search over the key hyperparameters of KNeighborsClassifier.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# define models and parameters
model = KNeighborsClassifier()
n_neighbors = list(range(1, 21, 2))  # odd k only: 1, 3, ..., 19
weights = ['uniform', 'distance']
# 'minkowski' is left out on purpose: KNeighborsClassifier defaults to p=2, and
# Minkowski distance with p=2 is the Euclidean distance, so listing both refits
# an identical model for a third of the grid.
metric = ['euclidean', 'manhattan']
# define grid search
grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
# Plain 5-fold CV keeps the run time down; a more thorough but much slower
# alternative is RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1).
cv = 5
# error_score=0: a parameter combination whose fit fails is recorded with
# score 0 instead of aborting the whole search.
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                           scoring='accuracy', error_score=0, verbose=5)
grid_result = grid_search.fit(X_train, y_train)
# summarize results: mean and std of the CV accuracy for every combination
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

## tune n_neighbors

In [None]:

# Sweep n_neighbors for KNeighborsClassifier with the project tuning harness.
# This cell previously held a GradientBoostingClassifier learning_rate sweep,
# pasted twice back to back. That matched neither this KNN notebook nor the
# "tune n_neighbors" section, ran every fit twice for identical results, and
# called a plotting helper (hyperparameterplot) that no cell imports.
from sklearn.neighbors import KNeighborsClassifier

# hyperparameters - testing
# Odd k from 1 to 19, the same candidates the grid-search cell uses.
hyperparameter_list = list(range(1, 21, 2))
hyperparameter_name = 'n_neighbors'
# hyperparameters - determined - can tune further
knn_weights = 'uniform'   # sklearn default; 'distance' is the alternative
knn_metric = 'minkowski'  # sklearn default; with p=2 this is Euclidean distance
# hyperparameters - determined - no further tuning
n_jobs = -1  # run neighbour queries on every core; does not change the scores

# test hyperparameter
model_acc = test_model()
timer = ml_timer()
for parameter in hyperparameter_list:
    print(f'running: {hyperparameter_name} = {parameter}')
    timer.param_start()
    model = KNeighborsClassifier(n_neighbors=parameter, weights=knn_weights,
                                 metric=knn_metric, n_jobs=n_jobs)
    # The unfitted model is handed to the harness, exactly as the previous
    # version of this cell did.
    model_acc.score_data(model, data)
    timer.param_end()
    print(model)
# end time
timer.session_end()
time_per_session = timer.get_time()

# graph
# Plot with matplotlib directly so the cell only needs names the notebook
# already imports.
# NOTE(review): assumes train_acc / test_acc hold one value per score_data
# call, in call order - confirm against mlSOP.test_model.
fig, ax = plt.subplots()
ax.plot(hyperparameter_list, model_acc.train_acc, marker='o', label='train')
ax.plot(hyperparameter_list, model_acc.test_acc, marker='o', label='test')
ax.set(xlabel=hyperparameter_name, ylabel='accuracy',
       title=f'KNN accuracy vs {hyperparameter_name}')
ax.set_xticks(hyperparameter_list)
ax.legend()
plt.show()

print(f'{hyperparameter_name} = {hyperparameter_list}')
print(f'train_acc = {model_acc.train_acc}\ntest_acc = {model_acc.test_acc}')
print(f'time per param = {time_per_session}')
