## Machine Learning competition
#### AdaBoost Classifier Ensemble Notebook
##### Loading all the dependencies

In [1]:
# REQUIRED IMPORTS FROM STANDARD PACKAGES

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import random
import scipy
import pandas as pd
from os.path import join as pjoin
from glob import glob
import sklearn as sk
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import make_scorer
from sklearn.externals.joblib import parallel_backend
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# IMPORTS FROM THE UTIL LIBRARY PROVIDED BY US

import util.vis as V
import util.helpers as H

# Normally, all libraries are loaded only once, 
# even if you execute the import code multiple times
# This code is helpful if you make your own helper libraries 
%load_ext autoreload
%autoreload 1
# list your libraries below with aimport: should ensure 
#they are reloaded each time without having to restart your kernel
# in this case, our libraries are used as an example

%aimport util.helpers, util.vis
%aimport features_extraction
%aimport augmentation
%aimport Analysis
%aimport validation
%aimport preprocessing
%aimport upsampling

# seed random generator such that this notebook always returns the same values 
# (this is by no means necessary, but it is useful for reproducability of results)
rng = np.random.RandomState(42)

##### Loading all the training and test data

In [2]:
# PATHS

DATA_DIR = '../data'
POSE_DIR = '../data/pose'

##Loading all the training data
dataset_file=pjoin(DATA_DIR,'labels.csv')

train_samples=[]
train_labels=[]
train_persons = []
train_personlabels = []

with open(dataset_file) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    sample_index = 0
    for row in reader: 
        name, _gloss, label, _person = row
        sample = np.load(pjoin(POSE_DIR, 'train', name+'.npy'))
        if upsampling.keep_sample(sample):
            train_samples.append(sample)
            train_labels.append(int(label))
            train_persons.append(_person)
            train_personlabels.append((label, _person))
        sample_index += 1

train_samples=np.array(train_samples)
train_labels=np.array(train_labels)

##Loading all the test data
all_test_files = sorted(glob(pjoin(POSE_DIR, 'test', '*.npy')))  

test_samples = []
for numpy_file in all_test_files:
    sample = np.load(numpy_file)
    test_samples.append(sample)
test_samples = np.array(test_samples)

print("Data Ready")

Data Ready


##### Data preprocessing

In [3]:
train_samples = preprocessing.rotate(train_samples)
train_samples = preprocessing.scale(train_samples)
train_samples = preprocessing.centering(train_samples)

test_samples = preprocessing.rotate(test_samples)
test_samples = preprocessing.scale(test_samples) 
test_samples = preprocessing.centering(test_samples)

--- Rotating finished ---
--- Scaling finished ---
--- Centering finished ---
--- Rotating finished ---
--- Scaling finished ---
--- Centering finished ---


##### Extracting features

In [4]:
y_train = np.array(train_labels)
X_train = features_extraction.extract_features(train_samples).values
X_test = features_extraction.extract_features(test_samples).values
num_features = X_train.shape[1]
print("Number of features: ", num_features)

1.5319662831085168
1.8028555226407619
3.254082070769698
3.4255289195997847
1.1987622488576193
1.3526317918584638
2.596744844128551
2.6369036165192625
1.2613305322221038
1.4865728822358903
2.586740156882101
2.6540213250189097
2.1123107912999757
2.3694609666377744
4.070980236089483
4.130809395521922
Time elapsed for feature extraction:  156.53652501106262  seconds


  vector_hand_fingers = vector_hand_fingers / np.linalg.norm(vector_hand_fingers)


1.4608693481576722
1.3579723098566774
3.7267269965119993
3.877593510209344
0.9657814162469975
0.9212098143516698
2.634307970990534
2.639544709803104
1.0733810572700975
1.0517133385588409
2.6726756453192024
2.9142313057033533
1.779889711452853
1.6599256119167336
4.093239890310912
4.0673065305853555
Time elapsed for feature extraction:  44.14937901496887  seconds
Number of features:  550


##### Custom scoring functions

In [5]:
##Custom scoring functions

def map3_score(y_true, proba):
    return H.top3_accuracy(proba, y_true)
    
map3 = make_scorer(map3_score, needs_proba=True)

##### Validation Strategy

In [6]:
##Train validate data splitter for Cross Validation
seed = np.random.randint(1,999)
print("seed: ", seed)
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

seed:  572


##### Pipeline & model training

In [7]:
# the function below generates our rescaled pipeline model
# with optimized hyperparameters (e.g.: regularisation parameter)
def tune_adaBoost(x_data,r_data,verbose=0):
    pipe = Pipeline([
        ('scale', StandardScaler()), 
        ('selectkbest', SelectKBest(f_classif)), 
        ('adaboost', AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=6,min_samples_split=0.01, 
           min_samples_leaf = 0.01, max_features=80, class_weight='balanced', random_state=24)))
        #('tree', DecisionTreeClassifier(random_state=24, class_weight='balanced'))
    ])
    
    # Set the parameters by cross-validation
    tuned_parameters_tree = {'tree__max_depth': np.arange(1, 20, 1), #6 
                             'tree__min_samples_split': np.arange(0.01, 0.5, 0.05),#0.01
                             'tree__min_samples_leaf': np.arange(0.01, 0.5, 0.05),#0.01
                             'tree__max_features': np.arange(50, 200, 30),#80
                             'selectkbest__k': np.arange(200, 350, 20)#280
                            }
    tuned_parameters_ensemble = {'adaboost__n_estimators':np.arange(50, 300, 10),#220
                                 'adaboost__learning_rate':np.logspace(-3, 2, 10), #0.003
                                 'selectkbest__k': np.arange(80, 350, 20) #310
                                }

    print("------ Start tuning hyperparameters ------")
    CV = GridSearchCV(pipe, tuned_parameters_ensemble, n_jobs=-1, scoring=map3, pre_dispatch='n_jobs', cv=sgkf, verbose=2)
    print(CV)
    with parallel_backend('threading'):
        CV.fit(x_data, r_data)
    print("------ Tuning hyperparameters finished ------")
    
    #Tuning ensemble
    
    bestEstimator = CV.best_params_['adaboost__n_estimators']
    bestLR = CV.best_params_['adaboost__learning_rate']
    bestK = CV.best_params_['selectkbest__k']
    print("Optimal n estimators: ", bestEstimator)
    print("Optimal Learning Rate: ", bestLR)
    print("Optimal k value: ", bestK)
    """
    #Tuning tree
    bestDepth = CV.best_params_['tree__max_depth']
    bestSplit = CV.best_params_['tree__min_samples_split']
    bestLeaf = CV.best_params_['tree__min_samples_leaf']
    bestFeatures = CV.best_params_['tree__max_features']
    bestK = CV.best_params_['selectkbest__k']
    print("Optimal Depth: ", bestDepth)
    print("Optimal Split: ", bestSplit)
    print("Optimal Leaf: ", bestLeaf)
    print("Optimal Features: ", bestFeatures)
    print("Optimal K: ", bestK)
    """
    optimal_pipe = Pipeline([
        ('scale', StandardScaler()), 
        ('selectkbest', SelectKBest(f_classif,k=bestK)),
        ('adaboost',AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=6,class_weight='balanced', min_samples_split=0.01,
                    min_samples_leaf=0.01, max_features=80),n_estimators=bestEstimator,learning_rate=bestLR,random_state=24))
        #('tree', DecisionTreeClassifier(random_state=24,class_weight='balanced',max_depth=bestDepth, min_samples_split=bestSplit, min_samples_leaf=bestLeaf, max_features=bestFeatures))
        ])
    
    optimal_pipe.fit(x_data,r_data)  
    
    if verbose>0:
    
        print("Grid scores on training data set:")
        print()
        cv_means = CV.cv_results_['mean_test_score']
        print(cv_means)
        cv_stds = CV.cv_results_['std_test_score']
        for mean, std, params in zip(cv_means, cv_stds, CV.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return optimal_pipe, CV.cv_results_

##### Training model

In [8]:
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)
optimal_pipe, res = tune_adaBoost(X_train, y_train, 1)
print('---------Optimal pipe finished -----------')

##Train validate data splitter for Cross Validation
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

with parallel_backend('threading'):
    scores = cross_val_score(optimal_pipe, X_train, y_train, scoring=map3, cv=sgkf, n_jobs=-1, pre_dispatch='n_jobs')
print(scores)
print("Average (cross validated) map@3 score: ",scores.mean(),", stdev: ",scores.std())

optimal_pipe.fit(X_train, y_train)

------ Start tuning hyperparameters ------
GridSearchCV(cv=<generator object stratified_group_k_fold at 0x000001CD2D110ED0>,
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('selectkbest', SelectKBest(k=10, score_func=<function f_classif at 0x000001CD22DF7840>)), ('adaboost', AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight='balanced', criteri...24,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'adaboost__n_estimators': array([ 50,  60,  70,  80,  90, 100, 110, 120, 130, 140, 150, 160, 170,
       180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290]), 'adaboost__learning_rate': array([1.00000e-03, 3.59381e-03, 1.29155e-02, 4.64159e-02, 1.66810e-01,
       5.99484e-01, 2.15443e+00, 7.74264e+00, 2.78256

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=80, total=  10.4s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=100 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=80, total=  10.5s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=100 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=80, total=  10.7s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=120 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=100, total=  10.8s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=120 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=100, total=  10.8s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=120 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=80, total=  10.9s
[

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   55.4s


[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=220, total=  10.6s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=240 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=200, total=  10.9s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=240 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=220, total=  10.9s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=240 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=220, total=  10.6s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=260 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=220, total=  10.5s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=260 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=50, selectkbest__k=220, total=  11.

[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=120, total=  12.2s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=140 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=120, total=  12.2s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=140 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=120, total=  12.0s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=120, total=  12.3s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=120, total=  12.8s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=140, total=  12.

[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=300, total=  12.1s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=320 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=300, total=  12.2s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=340 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=300, total=  11.9s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=340 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=300, total=  12.5s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=340 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=320, total=  11.9s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=340 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=60, selectkbest__k=320, total=  12.

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.1min


[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=120, total=  14.6s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=140, total=  14.1s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=140, total=  14.3s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=160 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=140, total=  14.5s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=180 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=140, total=  14.4s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=180 
[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=140, total=  14.

[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=320, total=  14.4s
[CV] adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=340 


KeyboardInterrupt: 

[CV]  adaboost__learning_rate=0.001, adaboost__n_estimators=70, selectkbest__k=320, total=  14.0s


##### Generate kaggle submission file

In [None]:
test_probas = optimal_pipe.predict_proba(X_test)
H.create_submission(test_probas, 'AdaBoostTuned.csv')
print("Submission created")