## 1. Basic import

In [1]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import *
import pickle

## 2. Creation of dataset : features and labels 

In [4]:
tracking = pd.read_pickle('MLbodyFace').dropna()
database = pd.read_pickle('database').reset_index()

In [5]:
tracking.drop(['seq', 'id', 'traSeq', 'trackId', 'rgbSeq'], axis=1, inplace=True)

In [6]:
tracking.head()

Unnamed: 0,trackX,trackY,trackW,trackH,height,distance,blur,embedding,GroundTrue
0,326.4,124.0,387.6,458.0,1.571027,2.248792,367.316834,"[-0.110296, 0.0504573, 0.00468473, -0.0105098,...",Oliver
1,802.4,150.0,442.0,520.0,1.519628,1.882822,790.153174,"[0.0554316, 0.0860607, -0.00389893, 0.0320879,...",Alberto
2,1135.6,108.0,309.4,364.0,1.475796,2.76029,600.989129,"[-0.0656685, 0.047579, -0.100778, 0.00661445, ...",Lucas
4,323.0,124.0,387.6,456.0,1.569231,2.25334,367.316834,"[-0.110296, 0.0504573, 0.00468473, -0.0105098,...",Oliver
5,805.8,152.0,438.6,518.0,1.519507,1.894786,790.153174,"[0.0554316, 0.0860607, -0.00389893, 0.0320879,...",Alberto


In [7]:
database.rename(columns={'index': 'name'}, inplace=True)

In [8]:
database.head()

Unnamed: 0,name,0,1,2,3,4,5,6,7,8,...,119,120,121,122,123,124,125,126,127,height
0,Laurene,-0.124272,0.110877,0.044338,0.036444,0.238794,-0.062442,-0.048617,-0.043027,0.094638,...,0.04319,0.058516,0.047929,-0.04402,0.020436,0.101279,0.096265,0.105147,0.067171,1.8
1,Laurene,-0.081603,0.029872,0.132522,0.101111,0.063786,-0.066584,-0.080945,-0.094811,0.063537,...,0.034718,-0.029186,0.088114,-0.025882,0.0288,0.11226,0.090284,0.08023,0.092933,1.8
2,Laurene,-0.122956,0.06021,0.037656,0.038232,0.181885,-0.031491,-0.047854,-0.006947,0.026738,...,0.131896,0.064968,0.014392,-0.069568,0.036996,0.183133,0.12324,0.12812,0.074121,1.8
3,Wissem,0.005441,-0.016771,-0.009609,-0.084184,-0.064907,0.13924,0.001245,0.082115,-0.085787,...,-0.083136,-0.157419,-0.036077,-0.128765,0.00596,-0.039848,-0.177919,-0.01044,-0.087472,1.83
4,Wissem,0.05932,-0.026325,0.066739,-0.106427,0.017275,0.079638,0.104448,0.143851,0.02888,...,-0.0981,-0.027099,-0.079001,-0.111031,0.051414,-0.04158,-0.184687,-0.048371,-0.140847,1.83


### Height normalization

As max height equal 2.3 and min height is 1.3, we just have to shift it to have value in [-0.5, 0.5] and multiply the value by 2 to have value in [-1, 1] 

In [9]:
tracking.height = tracking.height.apply(lambda x : (x - 1.8)*2)

In [10]:
database.height = database.height.apply(lambda x : (x - 1.8)*2)

### Extraction of embeddings information into different column to feed model

In [11]:
#Extract embedding informations and rearrange it into columns 
#(one column for each features - 128 columns in total)
em = [[] for i in range(len(tracking.iloc[0].embedding))]

for index, r in tracking.iterrows():
    for column in range (len(tracking.iloc[0].embedding)):
         em[column].append(r.embedding[column])

In [12]:
#data = np.asarray(em).T

In [13]:
#Insert nex feature's columns into dataframe
for column in range (len(tracking.iloc[0].embedding)):
    tracking[column] = em[column]

In [14]:
# Drop unuseful column
tracking.drop('embedding', 1, inplace=True)

In [15]:
tracking.head()

Unnamed: 0,trackX,trackY,trackW,trackH,height,distance,blur,GroundTrue,0,1,...,118,119,120,121,122,123,124,125,126,127
0,326.4,124.0,387.6,458.0,-0.457947,2.248792,367.316834,Oliver,-0.110296,0.050457,...,-0.001748,-0.08656,0.028729,0.0669,-0.051501,-0.029163,0.052923,-0.105883,0.100579,-0.002058
1,802.4,150.0,442.0,520.0,-0.560744,1.882822,790.153174,Alberto,0.055432,0.086061,...,-0.033809,-0.125701,0.021227,0.139772,-0.022684,-0.037909,0.186199,-0.067317,0.077795,0.044126
2,1135.6,108.0,309.4,364.0,-0.648407,2.76029,600.989129,Lucas,-0.065669,0.047579,...,-0.025146,-0.081062,0.052214,0.186648,-0.156097,0.029255,0.069186,-0.105391,-0.135352,-0.108049
4,323.0,124.0,387.6,456.0,-0.461538,2.25334,367.316834,Oliver,-0.110296,0.050457,...,-0.001748,-0.08656,0.028729,0.0669,-0.051501,-0.029163,0.052923,-0.105883,0.100579,-0.002058
5,805.8,152.0,438.6,518.0,-0.560986,1.894786,790.153174,Alberto,0.055432,0.086061,...,-0.033809,-0.125701,0.021227,0.139772,-0.022684,-0.037909,0.186199,-0.067317,0.077795,0.044126


### Creation of dataset for label 1
Creation of dataframe with the same person information/features for two different detection.
Will be labeled as 1 (label for the same person)

In [16]:
# Group by name - then work on one group to create same person dataset
groupBy = tracking.groupby('GroundTrue')

In [17]:
# Return DF of pair of embeddings + bluriness + distance
# Data augmentation between 
def same(name, group, oneLenght):
    listToConcatSame = []
    
    for index, r in group.reset_index().drop('index', 1).iterrows():
        base = database[database.name == name]
        base.drop('name', 1, inplace=True)
        
        # Print percentage since could be long:
        percentage = index * 100 / len(group)
        if percentage % 10 == 0 :
            print str(percentage) + "%"
        
        # NumpyArray/Dataframe with same information for same detection repeated enought
        data = np.asarray([r for i in range(len(base))])
        left = pd.DataFrame(data, columns=group.columns)
        left.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH'], 1, inplace=True)
        
        # Create a copy of the group (return a dataframe of all the detection for the same person)
        #copy = group.copy()
        # Set index to avoid non compatibility between the 2 dataframes
        base.set_index(left.index, inplace=True)
        
        #concate the 2 dateframes with a suffix "_right" for the dataframe of all the detection for the same person
        for col in base.columns:
            left[str(col) + '_right'] = base[col]
        
        # Concate left dataframe tot he list of all possible combinaison
        listToConcatSame.append(left)
        
        # Creation of a new sample with the embedding swap
        cols = left.columns.tolist()
        leftFeatures = cols[3:3+127+1]
        rightFeatures = cols[3+127+1:]

        newCols = cols[:3] + rightFeatures + leftFeatures
        right = left[newCols].copy()

        # Concatenate dataframe to return all possible combination
        listToConcatSame.append(right)
        oneLenght = oneLenght + len(left) + len(right)
        
    return pd.concat(listToConcatSame, ignore_index=True), oneLenght

In [18]:
# Same person dataset
listToConcat = []
oneLenght = 0
for name, group in groupBy:
    print name
    df, oneLenght = same(name, group, oneLenght)
    listToConcat.append(df)

Alberto
0%
0%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0%
10%
10%
20%
20%
30%
30%


KeyboardInterrupt: 

### Creation of dataset for label 0
Creation of dataframe of pair of different person information/features.
Will be labeled as 0 (label for two different persons)

In [None]:
def different(name, group, zeroLenght):
    listToConcatDiff = []
    
    for index, r in group.reset_index().drop('index', 1).iterrows():
        base = database[database.name != name]
        
        # Display percentage ince could be long
        percentage = index * 100 / len(group)
        if percentage % 20 == 0 :
            print str(percentage) + "%"
        
        # NumpyArray/Dataframe with same information for same detection repeated enought
        data = np.asarray([r for i in range(len(base))])
        left = pd.DataFrame(data, columns=group.columns)
        
        # Create a copy of the group (return a dataframe of all the detection for the same person)
        #copy = group.copy()
        # Set index to avoid non compatibility between the 2 dataframes
        base.set_index(left.index, inplace=True)
        
        #concate the 2 dateframes with a suffix "_right" for the dataframe of all the detection for the same person
        for col in base.columns:
            left[str(col) + '_right'] = base[col]
        
        # Concate left dataframe to the list of all possible combinaison
        listToConcatDiff.append(left)
        
        # Creation of a new sample with the embedding swap
        cols = left.columns.tolist()
        leftFeatures = cols[3:3+127+1]
        rightFeatures = cols[3+127+1:]

        newCols = cols[:3] + rightFeatures + leftFeatures
        right = left[newCols].copy()

        # Concatenate dataframe to return all possible combination
        listToConcatDiff.append(right)
        zeroLenght = zeroLenght + len(left) + len(right)
        
    return pd.concat(listToConcatDiff, ignore_index=True), zeroLenght

In [None]:
# different person dataset
zeroLenght = 0
for name, group in groupBy:
    print name
    df, zeroLenght = different(name, group, zeroLenght)
    listToConcat.append(df)


### Creation of the features matrix and label vector

In [None]:
big = pd.concat(listToConcat, ignore_index=True)

In [None]:
del listToConcat, 
del tracking, database

In [None]:
big = np.load('bigML')

In [243]:
X = big.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'name_right', 'label'], 1)

In [244]:
label = np.concatenate([np.ones(oneLenght), np.zeros(zeroLenght)])

In [245]:
big['label'] = label
big.to_pickle('bigML')

In [246]:
Y = label

In [179]:
bigOne = big[:oneLenght]

In [180]:
bigZero = big[oneLenght:]

In [181]:
smallZero = bigZero.sample(frac=0.2)

In [182]:
len(smallZero)

6502

In [183]:
float(len(smallZero)) / len(bigOne)

1.2000738279808048

In [184]:
smallX = pd.concat([bigOne, smallZero], axis=0, ignore_index=True)

In [185]:
smallX.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'name_right', 'label'], 1, inplace=True)

In [186]:
YOne = np.ones(len(bigOne))
smallYZero = np.zeros(len(smallZero))

In [187]:
smallY = np.concatenate((YOne, smallYZero))

In [188]:
X.isnull().values.any()

False

In [203]:
del big

In [212]:
svm = linear_model.SGDClassifier(loss = 'squared_loss', n_iter=100, alpha=0.01, class_weight='balanced')

In [213]:
cross = cross_val_score(svm, smallX, smallY, cv=10, verbose=100)

[CV]  ................................................................
[CV] ................................. , score=0.545683, total=   1.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.413244, total=   1.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.3s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.454698, total=   1.1s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.4s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.454698, total=   1.1s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.5s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , 

In [214]:
cross.mean()

0.48625217077283694

In [197]:
for c in smallX.columns:
    print c

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
0_right
100_right
101_right
102_right
103_right
104_right
105_right
106_right
107_right
108_right
109_right
10_right
110_right
111_right
112_right
113_right
114_right
115_right
116_right
117_right
118_right
119_right
11_right
120_right
121_right
122_right
123_right
124_right
125_right
126_right
127_right
12_right
13_right
14_right
15_right
16_right
17_right
18_right
19_right
1_right
20_right
21_right
22_right
23_right
24_right
25_right
26_right
27_right
28_right
29_right
2_right
30_right
31_right
32_right
33_right
34_right
35_right
36_right
37_right
38_right
39_right
3_right
40_right
41_righ

In [199]:
smallX.shape

(11920, 260)

In [205]:
X.shape

(37926, 260)

In [204]:
for c in X.columns:
    print c

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
0_right
100_right
101_right
102_right
103_right
104_right
105_right
106_right
107_right
108_right
109_right
10_right
110_right
111_right
112_right
113_right
114_right
115_right
116_right
117_right
118_right
119_right
11_right
120_right
121_right
122_right
123_right
124_right
125_right
126_right
127_right
12_right
13_right
14_right
15_right
16_right
17_right
18_right
19_right
1_right
20_right
21_right
22_right
23_right
24_right
25_right
26_right
27_right
28_right
29_right
2_right
30_right
31_right
32_right
33_right
34_right
35_right
36_right
37_right
38_right
39_right
3_right
40_right
41_righ

In [206]:
svm.fit(smallX, smallY)
confusion_matrix(Y, svm.predict(X))

array([[32508,     0],
       [ 5418,     0]])

In [152]:
svm.fit(smallX, smallY)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [269]:
clf = MLPClassifier(solver='adam', activation='relu', shuffle=True, alpha=0, learning_rate='adaptive',
                     hidden_layer_sizes=(100, 200, 50), random_state=1, max_iter= 500)

In [279]:
clf.fit(smallX, smallY)

MLPClassifier(activation='relu', alpha=0, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 200, 50), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [271]:
cross = cross_val_score(clf, smallX, smallY, cv=10, verbose=100)

[CV]  ................................................................
[CV] ................................. , score=0.799665, total=   3.4s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.723386, total=   2.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.5s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.746644, total=   3.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.5s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , score=0.821309, total=   3.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.5s remaining:    0.0s
[CV]  ................................................................
[CV] ................................. , 

In [272]:
cross.mean()

0.65918846368050243

In [280]:
confusion_matrix(Y, clf.predict(X))

array([[19116, 13392],
       [  390,  5028]])

In [281]:
f1_score(Y, clf.predict(X))

0.4218474704253713

In [282]:
accuracy_score(Y, clf.predict(X))

0.63660813162474295

In [283]:
# save the model to disk
filename = 'MLP.sav'
pickle.dump(clf, open(filename, 'wb'))

In [120]:
Y.iloc[1]

0.0

In [158]:
X.iloc[1]

height             1.51963
distance           1.88282
blur               790.153
0                0.0554316
1                0.0860607
2              -0.00389893
3                0.0320879
4                0.0109683
5                0.0210551
6               -0.0581478
7                0.0384976
8               0.00455054
9                 0.127191
10              -0.0890631
11               -0.228727
12              -0.0571598
13               0.0849429
14             -0.00518724
15                0.136657
16               0.0777629
17              -0.0440824
18                0.121872
19              -0.0569496
20             -0.00699838
21              -0.0778498
22                 0.07773
23              -0.0354916
24               0.0847794
25               -0.064217
26               -0.021229
                   ...    
99_right         0.0461792
100_right       -0.0465649
101_right        0.0163632
102_right        -0.206328
103_right      -0.00792275
104_right       -0.0365712
1

In [121]:
clf.predict(X.iloc[1].reshape(1, -1))

  if __name__ == '__main__':


array([ 1.])