## Machine Learning competition
#### Logistic Regression Notebook
##### Loading all the dependencies

In [14]:
# REQUIRED IMPORTS FROM STANDARD PACKAGES

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import random
import scipy
import pandas as pd
import sklearn as sk
from os.path import join as pjoin
from glob import glob
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.externals.joblib import parallel_backend
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import StandardScaler, RobustScaler
# IMPORTS FROM THE UTIL LIBRARY PROVIDED BY US

import util.vis as V
import util.helpers as H

# Normally, all libraries are loaded only once, 
# even if you execute the import code multiple times
# This code is helpful if you make your own helper libraries 
%load_ext autoreload
%autoreload 1
# list your libraries below with aimport: should ensure 
#they are reloaded each time without having to restart your kernel
# in this case, our libraries are used as an example

%aimport util.helpers, util.vis
%aimport features_extraction
%aimport augmentation
%aimport validation
%aimport preprocessing
%aimport upsampling

# seed random generator such that this notebook always returns the same values 
# (this is by no means necessary, but it is useful for reproducability of results)
rng = np.random.RandomState(42)
print("Environment Ready")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Environment Ready


##### Loading the data

In [15]:
# PATHS

DATA_DIR = '../data'
POSE_DIR = '../data/pose'

##Loading all the training data
dataset_file=pjoin(DATA_DIR,'labels.csv')

train_samples=[]
train_labels=[]
train_persons = []
train_personlabels = []

with open(dataset_file) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    sample_index = 0
    for row in reader: 
        name, _gloss, label, _person = row
        sample = np.load(pjoin(POSE_DIR, 'train', name+'.npy'))
        if upsampling.keep_sample(sample):
            train_samples.append(sample)
            train_labels.append(int(label))
            train_persons.append(_person)
            train_personlabels.append((label, _person))
        sample_index += 1

##Loading all the test data
all_test_files = sorted(glob(pjoin(POSE_DIR, 'test', '*.npy')))  

test_samples = []
for numpy_file in all_test_files:
    sample = np.load(numpy_file)
    test_samples.append(sample)
test_samples = np.array(test_samples)

print("Data Ready")

Data Ready


##### Preprocessing

In [16]:
train_samples = preprocessing.centering(train_samples)
test_samples = preprocessing.centering(test_samples)

--- Centering finished ---
--- Centering finished ---


##### Extracting features

In [17]:
y_train = np.array(train_labels)
X_train = features_extraction.extract_features(train_samples).values
X_test = features_extraction.extract_features(test_samples).values
num_features = X_train.shape[1]
print("Number of features: ", num_features)

  r = func(a, **kwargs)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  vector_hand_fingers = vector_hand_fingers / np.linalg.norm(vector_hand_fingers)


Number of features:  869


##### Custom scoring functions

In [7]:
##Custom scoring functions

def map3_score(y_true, proba):
    return H.top3_accuracy(proba, y_true)
    
map3 = make_scorer(map3_score, needs_proba=True)

##### Validation Strategy

In [22]:
##Train validate data splitter for Cross Validation
seed = np.random.randint(1,999)
print("Seed: ", seed)
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

Seed:  28


##### Pipeline model & training

In [35]:
# the function below generates our rescaled pipeline model
# with optimized hyperparameters (e.g.: regularisation parameter)
def tune_pipeline(x_data,r_data,verbose=0):
    pipe = Pipeline([
        ('scale', RobustScaler()),
        ('selectkbest', SelectKBest(f_classif)), 
        ('logreg', LogisticRegression(multi_class='multinomial', class_weight='balanced'))
    ])

    # Set the parameters by cross-validation
    tuned_parameters = {'logreg__C': np.logspace(-1, 0, 2),
                        'logreg__solver': ['sag'], 
                        'selectkbest__k': np.arange(220, 230, 5)}

    print("------ Start tuning hyperparameters ------")
    CV = GridSearchCV(pipe, tuned_parameters, n_jobs=-1, scoring=map3, pre_dispatch='n_jobs', cv=sgkf, verbose=2, return_train_score=True)
    with parallel_backend('threading'):
        CV.fit(x_data, r_data)
    print("------ Tuning hyperparameters finished ------")
    
    bestC = CV.best_params_['logreg__C']
    bestSolver = CV.best_params_['logreg__solver']
    bestK = CV.best_params_['selectkbest__k']
    print("Optimal regularisation value: ", bestC)
    print("Optimal solver: ", bestSolver)
    print("Optimal k value: ", bestK)
    
    optimal_pipe = Pipeline([
        ('scale', RobustScaler()),
        ('selectkbest', SelectKBest(f_classif, k=bestK)),
        ('logreg', LogisticRegression(C=bestC, multi_class='multinomial', solver=bestSolver, class_weight='balanced'))
        ])
    
    if verbose>0:
        print("Grid validation scores on training data set:")
        cv_means = CV.cv_results_['mean_test_score']
        print(cv_means)
        cv_stds = CV.cv_results_['std_test_score']
        for mean, std, params in zip(cv_means, cv_stds, CV.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return optimal_pipe, CV.cv_results_

##### Training model

In [36]:
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)
optimal_pipe, res = tune_pipeline(X_train, y_train, 1)

##Train validate data splitter for Cross Validation
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

with parallel_backend('threading'):
    scores = cross_val_score(optimal_pipe, X_train, y_train, scoring=map3, cv=sgkf, n_jobs=-1, pre_dispatch='n_jobs')
print(scores)
print("Average (cross validated) map@3 score: ",scores.mean(),", stdev: ",scores.std())

optimal_pipe.fit(X_train, y_train)

------ Start tuning hyperparameters ------
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=220 ...........
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=220 ...........
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=220 ...........
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=220 ...........[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=220 ...........

[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=225 ...........[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=225 ...........[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=225 ...........




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=220, total=  21.1s
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=225 ...........
[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=220, total=  21.1s
[CV] logreg__C=0.1, logreg__solver=sag, selectkbest__k=225 ...........




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=220, total=  21.4s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=220 ...........
[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=220, total=  21.6s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=220 ...........




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=225, total=  22.3s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=220 ...........




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=225, total=  24.2s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=220 ...........
[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=220, total=  24.2s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=220 ...........




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=225, total=  27.3s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=225 ...........




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=220, total=  19.2s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=225 ...........
[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=220, total=  18.4s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=225 ...........




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=220, total=  19.6s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=225 ...........




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=220, total=  17.1s
[CV] logreg__C=1.0, logreg__solver=sag, selectkbest__k=225 ...........




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=225, total=  20.8s




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=220, total=  20.3s




[CV]  logreg__C=0.1, logreg__solver=sag, selectkbest__k=225, total=  24.1s




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=225, total=  18.5s




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=225, total=  14.6s




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=225, total=  15.0s




[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=225, total=  15.2s


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   57.1s finished


[CV]  logreg__C=1.0, logreg__solver=sag, selectkbest__k=225, total=  15.1s




------ Tuning hyperparameters finished ------
Optimal regularisation value:  0.1
Optimal solver:  sag
Optimal k value:  225
Grid validation scores on training data set:
[0.79016854 0.79269663 0.78876404 0.79185393]
0.790 (+/-0.033) for {'logreg__C': 0.1, 'logreg__solver': 'sag', 'selectkbest__k': 220}
0.793 (+/-0.035) for {'logreg__C': 0.1, 'logreg__solver': 'sag', 'selectkbest__k': 225}
0.789 (+/-0.037) for {'logreg__C': 1.0, 'logreg__solver': 'sag', 'selectkbest__k': 220}
0.792 (+/-0.039) for {'logreg__C': 1.0, 'logreg__solver': 'sag', 'selectkbest__k': 225}




[0.77382646 0.78393352 0.81420765 0.81163435 0.77973568]
Average (cross validated) map@3 score:  0.7926675316331205 , stdev:  0.016865459586132298




Pipeline(memory=None,
     steps=[('scale', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('selectkbest', SelectKBest(k=225, score_func=<function f_classif at 0x000002A1CE37BEA0>)), ('logreg', LogisticRegression(C=0.1, class_weight='balanced', dual=False,
          fit_i...l2',
          random_state=None, solver='sag', tol=0.0001, verbose=0,
          warm_start=False))])

##### Kaggle submission file

In [37]:
test_probas = optimal_pipe.predict_proba(X_test)
H.create_submission(test_probas, 'LogisticRegressionTuned.csv')