## 1. Basic import

In [46]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import *
import pickle

## 2. Creation of dataset : features and labels 

In [47]:
tracking = pd.read_pickle('MLbodyFace').dropna()
database = pd.read_pickle('database').reset_index()

In [48]:
tracking.drop(['seq', 'id', 'traSeq', 'trackId', 'rgbSeq'], axis=1, inplace=True)

In [49]:
database.rename(columns={'index': 'name'}, inplace=True)

### Height normalization

As max height equal 2.3 and min height is 1.3, we just have to shift it to have value in [-0.5, 0.5] and multiply the value by 2 to have value in [-1, 1] 

In [50]:
tracking.height = tracking.height.apply(lambda x : (x - 1.8)*2)
database.height = database.height.apply(lambda x : (x - 1.8)*2)

### Extraction of embeddings information into different column to feed model

In [51]:
#Extract embedding informations and rearrange it into columns 
#(one column for each features - 128 columns in total)
em = [[] for i in range(len(tracking.iloc[0].embedding))]

for index, r in tracking.iterrows():
    for column in range (len(tracking.iloc[0].embedding)):
         em[column].append(r.embedding[column])

In [52]:
#Insert nex feature's columns into dataframe
for column in range (len(tracking.iloc[0].embedding)):
    tracking[column] = em[column]

In [53]:
# Drop unuseful column
tracking.drop('embedding', 1, inplace=True)

### Creation of dataset for label 1
Creation of dataframe with the same person information/features for two different detection.
Will be labeled as 1 (label for the same person)

In [54]:
# Group by name - then work on one group to create same person dataset
groupBy = tracking.groupby('GroundTrue')

In [55]:
# Return DF of pair of embeddings + bluriness + distance
# Data augmentation between 
def same(name, group, oneLenght):
    listToConcatSame = []
    
    for index, r in group.reset_index().drop('index', 1).iterrows():
        base = database[database.name == name]
        base.drop('name', 1, inplace=True)
        
        # Need to modify columns order to have the same for every sample (from DB and from detection)
        cols = base.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        base = base[cols]
        
        # Print percentage since could be long:
        percentage = index * 100 / len(group)
        if percentage % 10 == 0 :
            print str(percentage) + "%"

        # Cosine similarity
        temp = r.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance', 'blur'])
        cosines = cosine_similarity(np.asarray(base), np.asarray(temp.reshape(1, -1)))
        cosines = cosines.reshape(1, -1).squeeze()
        listToConcatSame.append(cosines)
        oneLenght = oneLenght + len(cosines)
    
    return np.asarray(listToConcatSame).reshape(1, -1).squeeze(), oneLenght

In [56]:
# Same person dataset
listToConcat = []
oneLenght = 0
for name, group in groupBy:
    print name
    cosines, oneLenght = same(name, group, oneLenght)
    listToConcat.append(cosines)

Alberto
0%
0%
0%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


10%
10%
20%
20%
30%
30%
40%
40%
40%
50%
50%
50%
60%
60%
60%
70%
70%
70%
80%
80%
80%
90%
90%
90%
Lucas
0%
0%
0%
10%
10%
20%
20%
20%
30%
30%
30%
40%
40%
50%
50%
50%
60%
60%
60%
70%
70%
80%
80%
80%
90%
90%
90%
Oliver
0%
0%
0%
10%
10%
10%
20%
20%
20%
30%
30%
30%
40%
40%
40%
50%
50%
50%
60%
60%
60%
70%
70%
70%
80%
80%
80%
90%
90%
90%
Sabrine
0%
10%
20%
30%
40%
50%
60%
70%
80%
90%


### Creation of dataset for label 0
Creation of dataframe of pair of different person information/features.
Will be labeled as 0 (label for two different persons)

In [57]:
def different(name, group, zeroLenght):
    listToConcatDiff = []
    
    for index, r in group.reset_index().drop('index', 1).iterrows():
        base = database[database.name != name]
        base.drop('name', 1, inplace=True)
        
        # Display percentage ince could be long
        percentage = index * 100 / len(group)
        if percentage % 20 == 0 :
            print str(percentage) + "%"
            
        # Need to modify columns order to have the same for every sample (from DB and from detection)
        cols = base.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        base = base[cols]
            
        # Cosine similarity
        temp = r.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance', 'blur'])
        cosines = cosine_similarity(np.asarray(base), np.asarray(temp.reshape(1, -1)))
        cosines = cosines.reshape(1, -1).squeeze()
        listToConcatDiff.append(cosines)
        zeroLenght = zeroLenght + len(cosines)
        
    return np.asarray(listToConcatDiff).reshape(1, -1).squeeze(), zeroLenght

In [58]:
# Same person dataset
zeroLenght = 0
for name, group in groupBy:
    print name
    cosines, zeroLenght = different(name, group, zeroLenght)
    listToConcat.append(cosines)

Alberto
0%
0%
0%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


20%
20%
40%
40%
40%
60%
60%
60%
80%
80%
80%
Lucas
0%
0%
0%
20%
20%
20%
40%
40%
60%
60%
60%
80%
80%
80%
Oliver
0%
0%
0%
20%
20%
20%
40%
40%
40%
60%
60%
60%
80%
80%
80%
Sabrine
0%
20%
40%
60%
80%


### Creation of the features matrix and label vector

In [59]:
listToConcat

[array([ 0.67863152,  0.82064115,  0.77565398,  0.67867596,  0.82067093,
         0.7756884 ,  0.67675685,  0.81937404,  0.7741944 ,  0.67799567,
         0.82021378,  0.77516056,  0.67911381,  0.82096363,  0.77602708,
         0.68014848,  0.82165039,  0.77682406,  0.68115054,  0.82230875,
         0.7775913 ,  0.68144424,  0.82250042,  0.77781529,  0.68310478,
         0.8235726 ,  0.77907385,  0.68320349,  0.8236357 ,  0.77914824,
         0.68381159,  0.82402285,  0.77960538,  0.68248465,  0.82317451,
         0.77860543,  0.68315203,  0.82360282,  0.77910947,  0.68371   ,
         0.82395837,  0.77952915,  0.68431134,  0.82433891,  0.77997963,
         0.68481869,  0.82465779,  0.78035822,  0.68303339,  0.82352691,
         0.77902003,  0.68359137,  0.82388296,  0.77944005,  0.68190004,
         0.82279668,  0.77816209,  0.68038036,  0.82180334,  0.77700201,
         0.67799498,  0.82021331,  0.77516003,  0.6792292 ,  0.82104056,
         0.7761162 ,  0.68040454,  0.82181926,  0.7

In [60]:
X = np.hstack(listToConcat)

In [61]:
y = np.concatenate([np.ones(oneLenght), np.zeros(zeroLenght)])

### Train SVM classifier

In [62]:
svm = linear_model.SGDClassifier(n_iter=100, alpha=0.01, class_weight='balanced')

In [63]:
cross = cross_val_score(svm, X.reshape(-1, 1), y, cv=10, verbose=100)

[CV]  ................................................................
[CV] ................................. , score=0.798629, total=   0.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.794412, total=   0.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.749605, total=   0.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.740116, total=   0.1s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.3s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , 

In [64]:
cross.mean()

0.74417195945149106

Not bad, so I will test it

In [65]:
svm.fit(X.reshape(-1, 1), y)

SGDClassifier(alpha=0.01, average=False, class_weight='balanced', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=100, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [66]:
# save the model to disk
filename = 'svm.sav'
pickle.dump(svm, open(filename, 'wb'))