In [68]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import ensemble
from sklearn import neighbors
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [69]:
# Load the pre-split training and test sets (paths are relative to the
# notebook's working directory); training file first, then test file.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'online_shoppers_intention_train.csv',
        'online_shoppers_intention_test.csv',
    )
)


In [70]:
def analyze_algo(algo_name, classifier, train, test,
                 average='binary', cv=10, cv_scoring='f1_macro'):
    """Fit a classifier and print its test-set F1 and cross-validated score.

    The last column of ``train`` and ``test`` is used as the label (y); all
    preceding columns are used as features (X).

    ``classifier`` is fitted in place on the training data, so pass a freshly
    created (or otherwise disposable) estimator.

    With the default arguments the two printed numbers use different F1
    averaging (``average='binary'`` for the test set, ``'f1_macro'`` for the
    cross-validation), so they are not directly comparable. Pass
    ``average='macro'`` to report both with macro averaging.

    Parameters
    ----------
    algo_name : str
        Heading printed above the scores.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    train, test : pandas.DataFrame
        Training and held-out data; label in the last column.
    average : str, default 'binary'
        ``average`` argument forwarded to ``f1_score`` for the test-set score.
    cv : int, default 10
        ``cv`` argument forwarded to ``cross_val_score``.
    cv_scoring : str, default 'f1_macro'
        ``scoring`` argument forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Split features / label: the last column is the label (y).
    X_train, y_train = train.iloc[:, :-1].values, train.iloc[:, -1].values
    X_test, y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values

    # Training: fit the classifier to the full training set.
    classifier.fit(X_train, y_train)

    # Predict the held-out test set.
    y_pred = classifier.predict(X_test)

    # Report the test-set F1 first, then the mean cross-validated score
    # computed on the training set only.
    print('')
    print(algo_name)
    print('F1 Score:', f1_score(y_test, y_pred, average=average))
    # cross_val_score is imported by name: reaching it through the bare
    # `sklearn` package only works if the submodule was already loaded.
    cross_val = np.mean(
        cross_val_score(classifier, X_train, y_train, cv=cv, scoring=cv_scoring)
    )
    print('Cross Validation:', cross_val)

In [71]:
# k-nearest-neighbours classifier created with the library's default
# hyper-parameters; it is the estimator handed to the grid search below.
classifier = neighbors.KNeighborsClassifier()

In [72]:
# Display the classifier's current hyper-parameters; the keys shown here are
# the names used in the tuning grid.
classifier.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [73]:
# Hyper-parameter grid for the KNeighborsClassifier: every combination of the
# listed values is evaluated (2 x 2 x 3 = 12 candidates).
# NOTE(review): 'n_neighbors': [1, 15] tries only the two values 1 and 15. If a
# range was intended, use list(range(1, 16)) -- confirm first, since that
# changes the search cost and possibly the selected model.
# NOTE(review): the classifier's default p is 2, so 'minkowski' presumably
# duplicates 'euclidean' -- verify against the KNeighborsClassifier docs.
param_grid = {
    'n_neighbors': [1, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

# 10-fold cross-validated grid search scored with macro-averaged F1.
# GridSearchCV is referenced by its imported name rather than through the bare
# `sklearn` package, which only works if the submodule was already loaded.
search = GridSearchCV(classifier, param_grid, cv=10, scoring='f1_macro')

In [74]:
# Split both frames into features (all columns but the last) and label (last
# column), as plain NumPy arrays.
X_train, y_train = train.iloc[:, :-1].values, train.iloc[:, -1].values
X_test, y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values

# Run the grid search on the training data only; the fitted search object is
# the cell's displayed value.
search.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'minkowski'],
                         'n_neighbors': [1, 15],
                         'weights': ['uniform', 'distance']},
             scoring='f1_macro')

In [75]:
# Inspect the raw per-candidate results of the grid search (a dict of arrays,
# e.g. fit/score timings and the parameter values tried).
# NOTE(review): this dumps a large dict; wrapping it as
# pd.DataFrame(search.cv_results_) would presumably give a readable table.
search.cv_results_

{'mean_fit_time': array([1.54685974e-03, 3.08048725e-03, 1.56507492e-03, 5.96451759e-04,
        3.93199921e-04, 4.83710766e-03, 0.00000000e+00, 1.74746513e-03,
        9.86099243e-05, 0.00000000e+00, 1.50241852e-03, 0.00000000e+00]),
 'std_fit_time': array([0.00464058, 0.00616266, 0.00469522, 0.00066181, 0.00048252,
        0.01205852, 0.        , 0.00460884, 0.00029583, 0.        ,
        0.00450726, 0.        ]),
 'mean_score_time': array([0.10005813, 0.05003598, 0.10466244, 0.08671501, 0.09332602,
        0.07287657, 0.12046878, 0.09824233, 0.07614913, 0.05004168,
        0.10624871, 0.08281741]),
 'std_score_time': array([0.03731034, 0.00597267, 0.00747286, 0.00915617, 0.00550237,
        0.01407242, 0.00720174, 0.00642944, 0.00461653, 0.00612613,
        0.00637651, 0.00729053]),
 'param_metric': masked_array(data=['euclidean', 'euclidean', 'euclidean', 'euclidean',
                    'manhattan', 'manhattan', 'manhattan', 'manhattan',
                    'minkowski', 'minkowsk

In [76]:
# Show the estimator the grid search selected (scoring='f1_macro', cv=10).
search.best_estimator_

KNeighborsClassifier(metric='euclidean', n_neighbors=15, weights='distance')

In [77]:
# Keep a handle on the selected estimator. This is the same object held by
# `search`, not a copy, so fitting `est` again also refits
# `search.best_estimator_`.
est = search.best_estimator_

In [78]:
# Evaluate the tuned KNN: analyze_algo refits `est` on the training set, then
# prints its test-set F1 and its cross-validated score on the training set.
# The "F1 Score" line uses f1_score's default averaging while the
# "Cross Validation" line uses 'f1_macro', so the two are not directly comparable.
analyze_algo('KNeighbors custom', est, train, test)


KNeighbors custom
F1 Score: 0.6016559337626496
Cross Validation: 0.7686270735101891
