In [4]:
# REQUIRED IMPORTS FROM STANDARD PACKAGES

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import random
import pandas as pd
from os.path import join as pjoin
from glob import glob
import sklearn as sk
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
import scipy
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import make_scorer, accuracy_score
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.externals.joblib import parallel_backend
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# IMPORTS FROM THE UTIL LIBRARY PROVIDED BY US

import util.vis as V
import util.helpers as H

# Normally, all libraries are loaded only once, 
# even if you execute the import code multiple times
# This code is helpful if you make your own helper libraries 
%load_ext autoreload
%autoreload 1
# list your libraries below with aimport: should ensure 
#they are reloaded each time without having to restart your kernel
# in this case, our libraries are used as an example

%aimport util.helpers, util.vis
%aimport features_extraction
%aimport augmentation
%aimport Analysis
%aimport validation
%aimport preprocessing
%aimport upsampling

# seed random generator such that this notebook always returns the same values 
# (this is by no means necessary, but it is useful for reproducability of results)
rng = np.random.RandomState(42)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# PATHS

DATA_DIR = '../data'
POSE_DIR = '../data/pose'

## Loading data

In [6]:
def keep_sample(sample):
    counter = 0
    no_hands_present = False
    for frame in sample:
        if (frame[4][0] == 0 or frame[7][0] == 0):
            counter += 1
        if (frame[4][0] == 0 and frame[7][0] == 0):
            no_hands_present = True
    if counter/len(sample) > 0.5 or no_hands_present:
        return False
    else:
        return True
##Loading all the training data
dataset_file=pjoin(DATA_DIR,'labels.csv')

train_samples=[]
train_labels=[]
train_persons = []
train_personlabels = []

with open(dataset_file) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    sample_index = 0
    preprocessing_outliers = [229, 297, 1316, 1780, 1817, 49, 495, 541, 746, 878, 2910]
    outliers_jens_fixing_gap1 = [85 , 398 , 717 , 766 , 923 , 979 ,1111 ,1280 ,1671 ,1778 ,1960 ,1993 ,2038 ,2089, 2097, 2135 ,2140, 2145, 2417 ,2463 ,2554 ,2579, 2594, 2794, 2832 ,2975, 3138, 3211, 3215, 3325 ,3341 ,3396 ,3399 ,3482 ,3566, 3649, 3702]
    outliers_jens_fixing_gap2 = [264,322,578,592,663,717,766,770,885,888,894,901,906,917,923,933,949,954,964,979,986,1117,1151,1152,1159,1161,1171,1199,1227,1257,1297,1573,1600,1607,1665,1671,1684,1701,1767,1778,1831,1859,1883,1930,1931,1932,1943,1960,1972,1993,1999,2037,2050,2056,2060,2089,2091,2097,2106,2119,2135,2140,2167,2191,2209,2231,2232,2235,2260,2268,2289,2334,2336,2538,2551,2554,2579,2602,2614,2637,2683,2688,2694,2699,2726,2741,2787,2803,2810,2841,2844,2845,2848,2887,2888,2893,2902,2943,2947,2954,2967,2975,2984,2990,3037,3064,3073,3078,3087,3103,3118,3119,3136,3138,3149,3170,3283,3293,3298,3305,3332,3342,3435,3475,3482,3561,3589,3590,3621,3629,3641,3649,3658,3672,3693,3697,3699,3702,3709,3710]
    outliers_robbe_fixing_gap = [2538, 1753, 1327, 2954, 2956, 2958, 4, 1111, 1117, 1118, 2167, 1992, 1993, 1573, 1575, 1576, 977, 245, 1846, 1909, 2435, 2545, 2590, 2815, 3515, 396, 421, 434, 494, 676, 718, 795, 815, 821, 882, 977, 991, 1016, 1035, 1111, 1295, 1308, 1524, 1564, 1586, 1633, 1634, 1804]
    #add [2538, 1753, 1327, 2954, 2956, 2958, 4, 1111, 1117, 1118]
    for row in reader: 
        name, _gloss, label, _person = row
        sample = np.load(pjoin(POSE_DIR, 'train', name+'.npy'))
        #if (sample_index not in preprocessing_outliers) and (sample_index not in outliers_jens_fixing_gap1) and (sample_index not in outliers_jens_fixing_gap2) and (sample_index not in outliers_robbe_fixing_gap):            
        if keep_sample(sample):
            train_samples.append(sample)
            train_labels.append(int(label))
            train_persons.append(_person)
            train_personlabels.append((label, _person))
        sample_index += 1

train_samples=np.array(train_samples)
train_labels=np.array(train_labels)

##Loading all the test data
all_test_files = sorted(glob(pjoin(POSE_DIR, 'test', '*.npy')))  

test_samples = []
for numpy_file in all_test_files:
    sample = np.load(numpy_file)
    test_samples.append(sample)
    
test_samples = np.array(test_samples)

In [7]:
#Code to determine the outliers
temp = []
for i in range(len(train_samples)):
    teller = 0
    for frame in train_samples[i]:
        if (frame[4][0] == 0 or frame[7][0] == 0) and not(frame[4][0] == 0 and frame[7][0] == 0):
            teller += 1
    if teller/len(train_samples[i]) > 0.5:
        temp.append(i)
temp=np.unique(temp)
print((temp))
print(len(train_samples))

[]
3560


## Upsampling

In [None]:
upsampled_samples, upsampled_labels, upsampled_label_list, upsampled_persons = upsampling.upsample(train_samples, train_labels, train_persons, train_personlabels)
print(len(upsampled_samples), len(upsampled_labels), upsampled_label_list, len(upsampled_persons))
train_samples = np.array(upsampled_samples)
train_labels = np.array(upsampled_labels)
train_persons = np.array(upsampled_persons)

## Data augmentation

In [None]:
#train_samples = augmentation.augment_data(train_samples)
#train_labels = np.concatenate(( train_labels,train_labels, train_labels))
#train_persons = np.concatenate(( train_persons,train_persons, train_persons))
#train_personlabels = np.concatenate(( train_personlabels,train_personlabels, train_personlabels))

## Data preprocessing

In [8]:
#train_samples = preprocessing.rotate(train_samples)
#train_samples = preprocessing.scale(train_samples)
train_samples = preprocessing.centering(train_samples)

#test_samples = preprocessing.rotate(test_samples)
#test_samples = preprocessing.scale(test_samples)
test_samples = preprocessing.centering(test_samples)

--- Centering finished ---
--- Centering finished ---


## Extracting features
Here we use the features from the example notebook

In [9]:
y_train = np.array(train_labels)
X_train = features_extraction.extract_features(train_samples).values
X_test = features_extraction.extract_features(test_samples).values
num_features = X_train.shape[1]
print("Number of features : ", num_features)

  r = func(a, **kwargs)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)
  vector_hand_fingers = vector_hand_fingers / np.linalg.norm(vector_hand_fingers)


Number of features :  869


## Loss functions
Custom scoring functions

In [10]:
##Custom scoring functions

def top3_acc_score(y_true, proba):
    return H.mapk(proba,y_true)

def map3_score(y_true, proba):
    return H.top3_accuracy(proba, y_true)
    

acc = make_scorer(accuracy_score)
top3_acc = make_scorer(top3_acc_score, needs_proba=True) 
map3 = make_scorer(map3_score, needs_proba=True)
scoring_functions = {"acc": acc, "top3_acc": top3_acc, "map3": map3}

## Validation strategy

In [11]:
##Train validate data splitter for Cross Validation
seed = np.random.randint(1,999)
print("seed: ", seed)
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

seed:  94


## Pipeline & model training

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
# the function below generates our rescaled pipeline model
# with optimized hyperparameters (e.g.: regularisation parameter)
def tune_svm_rbf(x_data,r_data,verbose=0):
    pipe = Pipeline([
        ('scale', StandardScaler()),
        #('pca', PCA(n_components=400)),
        ('selectfrommodel',SelectFromModel(ExtraTreesClassifier(n_estimators=100))),
        #('selectkbest', SelectKBest(chi2)),
        ('minmaxscaler', MinMaxScaler(feature_range=(0,1))),
        ('svm', svm.SVC(kernel='rbf', probability = True))
    ])

    # Set the parameters by cross-validation
    tuned_parameters = {'svm__C': np.logspace(4, 7, 3) ,'svm__gamma': np.logspace(-8, -5, 3)}

    print("------ Start tuning hyperparameters ------")
    CV = GridSearchCV(pipe, tuned_parameters, n_jobs=-1, scoring=map3, pre_dispatch='n_jobs', cv=sgkf, verbose=2)
    with parallel_backend('threading'):
        CV.fit(x_data, r_data)
    results = CV.cv_results_
    params = CV.best_params_
    print("------ Tuning hyperparameters finished ------")
    
    bestC = params['svm__C']
    bestGamma = params['svm__gamma']
    
    #bestN = CV.best_params_['selectfrommodel__n_estimators']
    print("Optimal regularisation value: ", bestC)
    print("Optimal gamma value: ", bestGamma)
    #print("Optimal k value: ", bestN)
    
    optimal_pipe = Pipeline([
        ('scale', StandardScaler()),
        #('pca', PCA(n_components=400)),
        ('selectfrommodel',SelectFromModel(ExtraTreesClassifier(n_estimators=100))),
       # ('selectkbest', SelectKBest(chi2, k=bestK)),
        ('minmaxscaler', MinMaxScaler(feature_range=(0,1))),
        ('svm', svm.SVC(kernel='rbf', probability = True, C=bestC, gamma=bestGamma))
    ])
    
    
    if verbose>0:
    
        print("Grid scores on training data set:")
        print()
        cv_means = results['mean_test_score']
        print(cv_means)
        cv_stds = results['std_test_score']
        for mean, std, params in zip(cv_means, cv_stds, results['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return optimal_pipe

In [None]:
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

optimal_pipe = tune_svm_rbf(X_train, y_train, 1)

##Train validate data splitter for Cross Validation
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5, seed)

with parallel_backend('threading'):
    scores = cross_val_score(optimal_pipe, X_train, y_train, scoring=map3, cv=sgkf, n_jobs=-1, pre_dispatch='n_jobs')
print(scores)
print("Average (cross validated) map@3 score: ",scores.mean(),", stdev: ",scores.std())

optimal_pipe.fit(X_train, y_train)

------ Start tuning hyperparameters ------
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


[CV] svm__C=10000.0, svm__gamma=1e-08 ................................[CV] svm__C=10000.0, svm__gamma=1e-08 ................................

[CV] svm__C=10000.0, svm__gamma=1e-08 ................................
[CV] svm__C=10000.0, svm__gamma=1e-08 ................................[CV] svm__C=10000.0, svm__gamma=1e-08 ................................
[CV] svm__C=10000.0, svm__gamma=3.162277660168379e-07 ................

[CV] svm__C=10000.0, svm__gamma=3.162277660168379e-07 ................[CV] svm__C=10000.0, svm__gamma=3.162277660168379e-07 ................

[CV] ................. svm__C=10000.0, svm__gamma=1e-08, total=  35.1s
[CV] svm__C=10000.0, svm__gamma=3.162277660168379e-07 ................
[CV] . svm__C=10000.0, svm__gamma=3.162277660168379e-07, total=  35.2s
[CV] svm__C=10000.0, svm__gamma=3.162277660168379e-07 ................
[CV] ................. svm__C=10000.0, svm__gamma=1e-08, total=  36.7s
[CV] svm__C=10000.0, svm__gamma=1e-05 ................................
[CV] .

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min


[CV] .............. svm__C=10000000.0, svm__gamma=1e-08, total=  19.8s
[CV] svm__C=10000000.0, svm__gamma=1e-05 .............................
[CV] .............. svm__C=10000000.0, svm__gamma=1e-08, total=  19.4s
[CV] svm__C=10000000.0, svm__gamma=1e-05 .............................
[CV]  svm__C=10000000.0, svm__gamma=3.162277660168379e-07, total=  18.0s
[CV] svm__C=10000000.0, svm__gamma=1e-05 .............................
[CV]  svm__C=10000000.0, svm__gamma=3.162277660168379e-07, total=  20.0s
[CV] svm__C=10000000.0, svm__gamma=1e-05 .............................


## Generate kaggle submission file

In [None]:
test_probas = optimal_pipe.predict_proba(X_test)
H.create_submission(test_probas, 'submission_svc_slectfrommoel.csv')

## Model analysis
Generate confusion matrix on validation data

In [None]:
sgkf = validation.stratified_group_k_fold(train_samples, train_labels, train_persons, 5)
with parallel_backend('threading'):
    y_pred = cross_val_predict(optimal_pipe, X_train, y_train, cv=sgkf, n_jobs=-1, pre_dispatch='n_jobs')
Analysis.plot_confusion_matrix(confusion_matrix(y_train, y_pred))

Calculate the precision, recall, F1 score, TP, FP and NP

In [None]:
##ANALYSIS

#PLOTS
P,R,F1 = None,None,None # compute these for class c on the TEST SET

#ORDER PROBABILITIES
prob_order = H.get_ordered_predictions(train_probas)
# Get the top prediction per sample.
top_prob = prob_order[:,:1]
top_prob_2= prob_order[:,1:2]
top_prob_3= prob_order[:,2:3]

#PRECISION,RECALL,F1 PER CLASS
print("--- TEST SET ---")
macro_f1 = 0 
for c in range(0,18):
    print("Class :{}".format(c))
    print('------')
    for k in range(1,4):
        print("k :{}".format(k))
        P,R,F1 = None,None,None # compute these for class c on the TEST SET
        # YOUR CODE HERE
        P,R,F1,TP,FP,FN = Analysis.compute_precision_recall_F1_label(train_probas, y_train, k,c)
        #raise NotImplementedError()
        if F1!=0.0:
            macro_f1 += F1
        
        print("Number of TP,FP,FN:")
        print('TP in k{} for class {}:{}'.format(k,c,TP))
        print('FP in k{} for class {}:{}'.format(k,c,FP))
        print('FN in k{} for class {}:{}'.format(k,c,FN))
        print("------")
        print('Precision K:{} for class {}: {}'.format(k,c,P))
        print('Recall K:{} for class    {}: {}'.format(k,c,R))
        print('F1 K:{} for class        {}: {}'.format(k,c,F1))
        
    print('-----------------------------')
        
        
macro_f1 /= 3        

print(f'F1: {macro_f1}')