In [1]:
# REQUIRED IMPORTS FROM STANDARD PACKAGES

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import random
import pandas as pd
from os.path import join as pjoin
from glob import glob
import sklearn as sk
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import scipy
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import make_scorer, accuracy_score
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# IMPORTS FROM THE UTIL LIBRARY PROVIDED BY US

import util.vis as V
import util.helpers as H

# Normally, all libraries are loaded only once,
# even if you execute the import code multiple times.
# The autoreload extension lifts that restriction, which is helpful
# while you are still editing your own helper libraries.
%load_ext autoreload
# Mode 1: only the modules registered with %aimport are reloaded
# (see the IPython autoreload documentation).
%autoreload 1
# List your libraries below with %aimport: this should ensure
# they are reloaded each time without having to restart your kernel.
# NOTE(review): `%aimport preprocessing` appears to bind the project-local
# `preprocessing` module in the notebook namespace (later cells call
# `preprocessing.centering`), which would shadow the earlier
# `from sklearn import preprocessing` -- confirm.

%aimport util.helpers, util.vis
%aimport features_extraction
%aimport augmentation
%aimport Analysis
%aimport validation
%aimport preprocessing

# Seed a random generator such that this notebook always returns the same values
# (this is by no means necessary, but it is useful for reproducibility of results).
# NOTE(review): `rng` is not referenced by any later cell visible in this notebook;
# the splitters below use their own `random_state=0`.
rng = np.random.RandomState(42)

In [2]:
# PATHS
# DATA_DIR is the single root for all inputs; POSE_DIR is derived from it so
# that relocating the data only requires changing one constant.

DATA_DIR = '../data'
POSE_DIR = pjoin(DATA_DIR, 'pose')

## Loading data
Loading all the training and test data

In [3]:
## Training data: one pose array per row of labels.csv
dataset_file = pjoin(DATA_DIR, 'labels.csv')

train_samples, train_labels = [], []
train_persons, train_personlabels = [], []

with open(dataset_file) as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # the first row is a header, not a sample
    # Every row is unpacked into exactly four fields; the second one is not used.
    for name, _gloss, label, _person in reader:
        train_samples.append(np.load(pjoin(POSE_DIR, 'train', name + '.npy')))
        train_labels.append(int(label))
        train_persons.append(_person)
        # NOTE: `label` is stored here as the raw CSV string, unlike train_labels.
        train_personlabels.append((label, _person))

train_samples = np.array(train_samples)
train_labels = np.array(train_labels)

## Test data: every .npy file in the test folder, in sorted filename order
all_test_files = sorted(glob(pjoin(POSE_DIR, 'test', '*.npy')))
test_samples = np.array([np.load(numpy_file) for numpy_file in all_test_files])

## Data augmentation

In [4]:
# Augment the training samples and replicate the per-sample metadata so that
# labels / persons stay aligned with the enlarged sample array.
# The replication factor is derived from the array lengths instead of being
# hard-coded, so a change inside augmentation.augment_data cannot silently
# misalign samples and labels.
# NOTE(review): this assumes augment_data returns whole copies of the input
# set stacked one after another (original order within each copy), which is
# what the original fixed three-fold concatenation relied on -- confirm.
n_before_augmentation = len(train_samples)
if n_before_augmentation == 0:
    raise ValueError("Cannot augment an empty training set")

train_samples = augmentation.augment_data(train_samples)

n_copies, leftover = divmod(len(train_samples), n_before_augmentation)
if leftover or n_copies < 1:
    raise ValueError(
        "augment_data returned %d samples, which is not a whole multiple of the %d input samples"
        % (len(train_samples), n_before_augmentation)
    )

train_labels = np.concatenate([train_labels] * n_copies)
train_persons = np.concatenate([train_persons] * n_copies)
train_personlabels = np.concatenate([train_personlabels] * n_copies)

## Data preprocessing

In [5]:
# Apply the project's centering step to the training set first, then the test set.
train_samples, test_samples = map(preprocessing.centering, (train_samples, test_samples))

## Extracting features

In [7]:
# Feature table of the training samples; kept under the name `df` because the
# PCA cell further down fits on it directly.
df = features_extraction.extract_features(train_samples)
#Analysis.plot_correlation_matrix(df)
test_features = features_extraction.extract_features(test_samples)

# The models work on plain numpy arrays, so take .values of each feature table.
X_train, y_train = df.values, train_labels
X_test = test_features.values

## PCA Analysis
This cell does not need to be run: PCA is no longer part of our pipeline. The code is kept only for reference, in case we want to use PCA again.

In [None]:
##PCA
# Standardise the features once and use the same scaled matrix for both the
# variance curve and the final projection, so the plot describes the data that
# is actually transformed. StandardScaler is used through its direct import:
# the name `preprocessing` refers to the project-local module in this notebook
# (it provides `centering`), not to sklearn.preprocessing.
scaled_features = StandardScaler().fit_transform(df)

# Fit a full PCA to see how the explained variance accumulates per component
pca_full = PCA().fit(scaled_features)

#Plotting variance
plt.figure()
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel(' Number of components ')
plt.ylabel(' Cumulative explained variance (ratio) ')
plt.title('Explained Variance ')
plt.show()

# Keep as many components as are needed to explain 99% of the variance
# (a float n_components in (0, 1) selects the count by variance fraction).
pca_1 = PCA(0.99)
new_matrix = pca_1.fit_transform(scaled_features)

print(new_matrix.shape)

## Loss functions
Custom scoring functions (accuracy, top-3 accuracy and MAP@3) used for cross-validation and model selection.

In [8]:
##Custom scoring functions
# Both wrappers adapt the util.helpers metrics, which are called as
# (probabilities, labels), to the (y_true, y_score) order that make_scorer uses.

def top3_acc_score(y_true, proba):
    """Top-3 accuracy of the predicted class probabilities `proba` against `y_true`."""
    return H.top3_accuracy(proba, y_true)

def map3_score(y_true, proba):
    """MAP@3 (via H.mapk) of the predicted class probabilities `proba` against `y_true`."""
    return H.mapk(proba, y_true)


# NOTE: `needs_proba=True` is deprecated in scikit-learn >= 1.4 in favour of
# `response_method='predict_proba'`; it is kept here to stay compatible with
# the scikit-learn version this notebook was written for.
acc = make_scorer(accuracy_score)
top3_acc = make_scorer(top3_acc_score, needs_proba=True)
map3 = make_scorer(map3_score, needs_proba=True)
scoring_functions = {"acc": acc, "top3_acc": top3_acc, "map3": map3}

## Validation strategy

In [17]:
##Train validate data splitter for Cross Validation
# Splits are grouped by person, so the same person never appears in both the
# training and the validation part of a split.
# The (train_idx, val_idx) pairs are materialised into a list: the generator
# returned by .split() can be consumed only once, which would leave `gss`
# empty after its first use as a `cv=` argument.
gss = list(
    GroupShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
    .split(train_samples, train_labels, groups=train_persons)
)

## Pipeline & model training

In [22]:
# the function below generates our rescaled pipeline model
# with optimized regularisation parameter
def tune_C_logreg(x_data, r_data, verbose=0, cv=None):
    """Grid-search the logistic-regression C of a scale -> LDA -> logreg pipeline.

    Parameters
    ----------
    x_data : feature matrix passed to GridSearchCV.fit.
    r_data : target labels passed to GridSearchCV.fit.
    verbose : int, default 0
        When > 0, print mean and 2*std of the validation score per candidate C.
    cv : optional
        Anything GridSearchCV accepts as `cv` (e.g. an iterable of
        (train_idx, val_idx) pairs). When None, the notebook-level `gss`
        is used, which is what this function always did before.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, UNFITTED pipeline configured with the best C found; the caller
        is responsible for fitting it.

    Candidates are ranked with the notebook-level `map3` scorer.
    """
    def build_pipe(**logreg_kwargs):
        # Single definition of the model, shared by the search and the result.
        return Pipeline([
            ('scale', StandardScaler()),
            ('lda', LDA()),
            ('logreg', LogisticRegression(tol=1e-3, class_weight='balanced',
                                          solver='liblinear', **logreg_kwargs))
        ])

    if cv is None:
        cv = gss  # backward-compatible default: splits defined at notebook level

    # Set the parameters by cross-validation
    tuned_parameters = [{'logreg__C': [1.0e-5,1.0e-4,1.0e-3,1.0e-2,1.0e-1,1.0,
                                       10.0, 100.0, 1000.0,10000.0,100000]}]

    CV = GridSearchCV(build_pipe(), tuned_parameters, cv=cv, scoring=map3)
    CV.fit(x_data, r_data)
    bestC = CV.best_params_['logreg__C']
    print("Optimal regularisation value: ",bestC)

    if verbose>0:

        print("Grid scores on training data set:")
        print()
        cv_means = CV.cv_results_['mean_test_score']
        cv_stds = CV.cv_results_['std_test_score']
        for mean, std, params in zip(cv_means, cv_stds, CV.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return build_pipe(C=bestC)


In [23]:
##Train validate data splitter for Cross Validation
# 4 person-grouped splits for the hyper-parameter search. tune_C_logreg reads
# the notebook-level `gss` by default, so it must be (re)defined right before
# the call. The splits are materialised into a list because the generator
# returned by .split() can only be consumed once.
gss = list(
    GroupShuffleSplit(n_splits=4, test_size=0.2, random_state=0)
    .split(train_samples, train_labels, groups=train_persons)
)

##Create Optimal Pipeline and train the model

# tune_C_logreg returns an unfitted pipeline configured with the best C.
optimal_pipe = tune_C_logreg(X_train, y_train)
#optimal_pipe.fit(X_train, y_train)

##Train validate data splitter for Cross Validation
# 10 person-grouped splits for estimating the performance of the tuned pipeline.
gss = list(
    GroupShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
    .split(train_samples, train_labels, groups=train_persons)
)

# No `scoring` is given, so cross_val_score uses the estimator's default score
# (accuracy for this classifier pipeline).
scores = cross_val_score(optimal_pipe, X_train, y_train, cv=gss)
print(scores)
print("Average accuracy: ",scores.mean(),", stdev: ",scores.std())






Optimal regularisation value:  1e-05




[0.38314653 0.37028658 0.32734531 0.39150508 0.34172662 0.34948097
 0.3619211  0.33968254 0.34424242 0.41682723]
Average accuracy:  0.3626164378981739 , stdev:  0.026402099839656884




In [None]:

# NOTE(review): this whole cell is commented-out code from an earlier
# RandomizedSearchCV experiment, kept for reference only. It refers to `pipe1`,
# `X_val` and `y_val`, none of which are defined in the cells visible in this
# notebook, so it cannot simply be un-commented and run.
#C_range = [ 1.0e-5,1.0e-4,1.0e-3,1.0e-2,1.0e-1,1.0, 10.0, 100.0, 1000.0,10000.0]
#tuned_parameters = {'logreg__C': C_range}

#CV=RandomizedSearchCV(pipe1, tuned_parameters, verbose = 1, scoring =map3, cv=gss, n_iter=10, n_jobs=-1,pre_dispatch='n_jobs', iid=False, return_train_score=True)
#CV.fit(X_train, y_train)

#print("Best parameters set found on development set: ",CV.best_params_)

#print("Best parameters set found on development set: ",CV.best_params_)
## store the best optimization parameter for later reuse
#bestC2 = CV.best_params_['logreg__C']

#print("Grid scores on training data set:")
#print()
#cv_means = CV.cv_results_['mean_test_score']
#cv_stds = CV.cv_results_['std_test_score']


#use this for random search
#train_probas = CV.predict_proba(X_train)
#print('Training set accuracy:', CV.score(X_train, y_train))
#print('Training set score (map@3):', H.mapk(train_probas,y_train))
#print('Training set top-3 accuracy:', H.top3_accuracy(train_probas, y_train))

#print(CV.cv_results_['mean_train_score'])
#print(CV.cv_results_['mean_test_score'])
#till here




#val_probas = pipe1.predict_proba(X_val)
#print('Validation set accuracy:', pipe1.score(X_val, y_val))
#print('Validation set score (map@3):', H.mapk(val_probas,y_val))
#print('Validation set top-3 accuracy:', H.top3_accuracy(val_probas, y_val))
                  
#print('Training set accuracy:', CV.cv_results_['mean_train_acc'])
#print('Training set score (map@3):', CV.cv_results_['mean_train_map3'])
#print('Training set top-3 accuracy:', CV.cv_results_['mean_train_top3acc'])

#print('Training set accuracy:', CV.cv_results_['mean_test_acc'])
#print('Validation set score (map@3):', CV.cv_results_['mean_test_map3'])
#print('Training set top-3 accuracy:', CV.cv_results_['mean_test_top3acc'])

## Generate kaggle submission file

In [None]:
# `pipe1` only exists in commented-out code, so this cell used to fail on a
# fresh kernel. Use the tuned pipeline instead: it is returned unfitted by
# tune_C_logreg (cross_val_score only fits clones), so fit it on the full
# training set before predicting the test probabilities.
optimal_pipe.fit(X_train, y_train)
test_probas = optimal_pipe.predict_proba(X_test)
H.create_submission(test_probas, 'submission_11_lda_map3.csv')

## Model analysis
Generate the confusion matrices and calculate the precision, recall, F1 score, TP, FP and FN per class.

In [None]:
##ANALYSIS
# NOTE: every metric in this cell is computed on the TRAINING set
# (train_probas / y_train), so the numbers are optimistic compared with the
# cross-validation scores.

N_CLASSES = 18       # class ids 0..17, the range the per-class loop iterates over
TOP_K = (1, 2, 3)    # rank cut-offs that are evaluated

# `train_probas` was previously assigned only inside commented-out code, so
# this cell failed on a fresh kernel. (Re)fit the tuned pipeline on the full
# training set and predict on that same set; fit() returns the pipeline itself.
train_probas = optimal_pipe.fit(X_train, y_train).predict_proba(X_train)

#ORDER PROBABILITIES
prob_order = H.get_ordered_predictions(train_probas)
# First, second and third column of the ordered predictions; slicing (rather
# than indexing) keeps each one as a single-column 2-D array.
top_prob = prob_order[:,:1]
top_prob_2= prob_order[:,1:2]
top_prob_3= prob_order[:,2:3]

#PRECISION,RECALL,F1 PER CLASS
print("--- TRAINING SET ---")
# F1 is accumulated separately per cut-off k so that each total can be
# averaged over the classes; the previous single accumulator mixed all k and
# was divided by 3 only, which is not an average over classes.
f1_total_per_k = {k: 0.0 for k in TOP_K}
for c in range(N_CLASSES):
    print("Class :{}".format(c))
    print('------')
    for k in TOP_K:
        print("k :{}".format(k))
        P,R,F1,TP,FP,FN = Analysis.compute_precision_recall_F1_label(train_probas, y_train, k,c)
        f1_total_per_k[k] += F1

        print("Number of TP,FP,FN:")
        print('TP in k{} for class {}:{}'.format(k,c,TP))
        print('FP in k{} for class {}:{}'.format(k,c,FP))
        print('FN in k{} for class {}:{}'.format(k,c,FN))
        print("------")
        print('Precision K:{} for class {}: {}'.format(k,c,P))
        print('Recall K:{} for class    {}: {}'.format(k,c,R))
        print('F1 K:{} for class        {}: {}'.format(k,c,F1))

    print('-----------------------------')


# Macro F1 = unweighted mean of the per-class F1, reported for each k ...
macro_f1_per_k = {k: total / N_CLASSES for k, total in f1_total_per_k.items()}
for k, value in macro_f1_per_k.items():
    print(f'Macro F1 @k={k}: {value}')

# ... and averaged over the cut-offs as a single summary number.
macro_f1 = sum(macro_f1_per_k.values()) / len(TOP_K)

print(f'F1: {macro_f1}')


#CONFUSION MATRICES
##Top 1 (1)
Analysis.plot_confusion_matrix(top_prob, y_train.astype(np.int32) ,1)
##Top 2 (1/2)
Analysis.plot_confusion_matrix(top_prob_2, y_train.astype(np.int32) , '1/2')
##Top 3 (1/3)
Analysis.plot_confusion_matrix(top_prob_3, y_train.astype(np.int32) , '1/3')