## 1. Basic import

In [None]:
from sklearn import svm
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pickle

## 2. Creation of dataset : features and labels 

In [None]:
# Detection samples; any row containing a missing value is discarded.
tracking = pd.read_pickle('MLbodyFace').dropna()
# Reference database; reset_index() turns its index into a regular column.
database = pd.read_pickle('database').reset_index()

In [None]:
# Remove the bookkeeping columns that are not used as features (in place).
tracking.drop(['seq', 'id', 'traSeq', 'trackId', 'rgbSeq'], axis=1, inplace=True)

In [None]:
# Rename the 'index' column (presumably the one created by reset_index() when
# the database was loaded) to 'name'; later cells filter the database on it.
database.rename(columns={'index': 'name'}, inplace=True)

#### Prepare the second dataset so that it can be appended to the main one

In [None]:
# Additional set of samples that is appended to `tracking` further down.
sabrine = pd.read_pickle('sabrine')

In [None]:
# Give `sabrine` the same columns as `tracking`: drop the unused bookkeeping
# columns and rename 'name' to 'GroundTrue', the column used later for grouping.
sabrine.drop(['seq', 'id', 'traSeq', 'trackId', 'rgbSeq'], axis=1, inplace=True)
sabrine.rename(columns={'name': 'GroundTrue'}, inplace=True)

### Height normalization

The maximum height is 2.3 and the minimum is 1.3, so subtracting the midpoint (1.8) shifts the values into [-0.5, 0.5], and multiplying by 2 rescales them to [-1, 1].

In [None]:
# Re-centre every height on 1.8 and double it, which maps the expected
# 1.3 to 2.3 range onto [-1, 1]. The same transform is applied to all frames.
for frame in (tracking, database, sabrine):
    frame.height = frame.height.apply(lambda h: (h - 1.8) * 2)

### Extraction of the embedding values into separate columns to feed the model

In [None]:
# Extract the embedding values and rearrange them into columns
# (one list per feature - 128 lists in total).
# The embedding width is computed once: `tracking.iloc[0]` materialises a
# whole row, so evaluating it again for every row is wasted work.
n_features = len(tracking.iloc[0].embedding)
em = [[] for _ in range(n_features)]

# Walk the embedding column directly instead of iterrows(), which would build
# a full Series for each row just to read this one field.
for embedding in tracking.embedding:
    for column in range(n_features):
        em[column].append(embedding[column])

In [None]:
# Insert the new feature columns into the dataframe: column `k` receives the
# k-th value of every embedding.
for column, values in enumerate(em):
    tracking[column] = values

In [None]:
# Drop the raw embedding column now that its values live in their own columns.
# `axis` is passed by keyword: the positional form is deprecated and rejected
# by recent pandas releases.
tracking.drop('embedding', axis=1, inplace=True)

In [None]:
# Stack both DataFrames - there is a good amount of information for Sabrine
# as well. pd.concat is used because DataFrame.append has been removed from
# recent pandas releases; the row order (tracking first) is unchanged.
tracking = pd.concat([tracking, sabrine])

### Creation of dataset for label 1
Creation of a dataset that pairs the features of the same person taken from two different detections.
These samples are labeled 1 (the label for the same person).

In [None]:
# Group the samples by person label ('GroundTrue'); each group is then
# processed on its own to build the same-person and different-person samples.
groupBy = tracking.groupby('GroundTrue')

In [None]:
# Build the label-1 ("same person") samples for one identity.
def same(name, group, oneLenght):
    """Subtract each detection of `name` from that person's database rows.

    Parameters
    ----------
    name : label of the person; selects the rows of the module-level
        `database` whose `name` column equals it.
    group : DataFrame with the detections of that person. The columns
        'GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance'
        and 'blur' are ignored; the remaining ones are the features.
    oneLenght : running count of label-1 samples produced so far.

    Returns
    -------
    (differences, oneLenght) : `differences` is an array with one entry per
        detection, each holding "database features minus detection
        features"; `oneLenght` is the input count increased by the number
        of (detection, database row) pairs that were created.
    """
    # The reference rows depend only on `name`, so they are built once rather
    # than once per detection. A non in-place drop also avoids mutating a
    # slice of `database`.
    base = database[database.name == name].drop('name', axis=1)

    # Move the last column to the front so that database samples use the same
    # column order as the detection samples.
    cols = base.columns.tolist()
    base = base[cols[-1:] + cols[:-1]]

    # Every detection yields one sample per reference row. The rows are
    # counted here because squeeze() below collapses a single-row result to
    # 1-D, whose shape[0] is the number of features, not of samples.
    pairsPerDetection = base.shape[0]

    listToConcatSame = []
    for _, r in group.iterrows():
        temp = r.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance', 'blur'])
        # Element-wise difference aligned on column labels (this is not a
        # cosine similarity).
        difference = np.asarray(base.subtract(temp)).squeeze()
        listToConcatSame.append(difference)
        oneLenght = oneLenght + pairsPerDetection

    return np.asarray(listToConcatSame), oneLenght

In [None]:
# Same person dataset: one array of differences per identity, plus the
# running number of label-1 samples.
listToConcat = []
oneLenght = 0
for name, group in groupBy:
    # Progress output; print() with one argument works on Python 2 and 3.
    print(name)
    differences, oneLenght = same(name, group, oneLenght)
    listToConcat.append(differences)

### Creation of dataset for label 0
Creation of a dataset that pairs the features of two different persons.
These samples are labeled 0 (the label for two different persons).

In [None]:
def different(name, group, zeroLenght):
    """Subtract each detection of `name` from every other person's database rows.

    Parameters
    ----------
    name : label of the person the detections belong to; the rows of the
        module-level `database` whose `name` column differs from it are
        used as references.
    group : DataFrame with the detections of that person. The columns
        'GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance'
        and 'blur' are ignored; the remaining ones are the features.
    zeroLenght : running count of label-0 samples produced so far.

    Returns
    -------
    (differences, zeroLenght) : `differences` is the squeezed array of
        "database features minus detection features", one entry per
        detection; `zeroLenght` is the input count increased by the number
        of (detection, database row) pairs that were created.
    """
    # The reference rows depend only on `name`, so they are built once rather
    # than once per detection. A non in-place drop also avoids mutating a
    # slice of `database`.
    base = database[database.name != name].drop('name', axis=1)

    # Move the last column to the front so that database samples use the same
    # column order as the detection samples.
    cols = base.columns.tolist()
    base = base[cols[-1:] + cols[:-1]]

    # Every detection yields one sample per reference row. The rows are
    # counted here because squeeze() below collapses a single-row result to
    # 1-D, whose shape[0] is the number of features, not of samples.
    pairsPerDetection = base.shape[0]

    listToConcatDiff = []
    for _, r in group.iterrows():
        temp = r.drop(['GroundTrue', 'trackX', 'trackY', 'trackW', 'trackH', 'distance', 'blur'])
        # Element-wise difference aligned on column labels (this is not a
        # cosine similarity).
        difference = np.asarray(base.subtract(temp)).squeeze()
        listToConcatDiff.append(difference)
        zeroLenght = zeroLenght + pairsPerDetection

    return np.asarray(listToConcatDiff).squeeze(), zeroLenght

In [None]:
# Different person dataset: the label-0 arrays are appended after the label-1
# arrays already stored in listToConcat.
zeroLenght = 0
for name, group in groupBy:
    # Progress output; print() with one argument works on Python 2 and 3.
    print(name)
    cosines, zeroLenght = different(name, group, zeroLenght)
    listToConcat.append(cosines)

### Creation of the features matrix and label vector

In [None]:
# Flatten every stored array into individual rows of 129 values
# (presumably the 128 embedding features plus the height).
newList = [
    row
    for chunk in listToConcat
    for row in chunk.reshape(-1, 129)
]

In [None]:
# Stack the individual rows into the feature matrix.
X = np.asarray(newList)

In [None]:
# Sanity check: (number of samples, number of features).
X.shape

In [None]:
# Labels follow the order in which listToConcat was filled: the same-person
# samples (1) first, then the different-person samples (0).
y = np.concatenate([np.ones(oneLenght), np.zeros(zeroLenght)])

In [None]:
# Sanity check: should equal the number of rows of X.
len(y)

### Train SVM classifier

In [None]:
# SVC with the default (RBF) kernel; class_weight='balanced' reweights the
# classes inversely to their frequency in y.
# NOTE: `degree` is only used by the 'poly' kernel, so it has no effect here.
svc = SVC(C=100, gamma=0.001, degree=3, class_weight='balanced')

In [None]:
# 10-fold cross-validation; yields one score per fold.
cross = cross_val_score(svc, X, y, cv=10, verbose=100)

In [None]:
# Average score over the folds.
cross.mean()

### Grid search to tune SVC

In [None]:
# Search grid for the RBF kernel. `degree` is only used by the 'poly' kernel,
# so listing several degrees here would just refit identical models; it is
# therefore left out of the grid.
parameters = [
    {'C': [100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

In [None]:
# Exhaustive search over `parameters`, using `svc` as the base estimator.
clf = GridSearchCV(svc, parameters)

In [None]:
# Run the grid search on the full dataset.
clf.fit(X, y)

In [None]:
# Shows the settings of the GridSearchCV object itself.
# NOTE(review): the tuned values are exposed as clf.best_params_ - confirm
# which of the two was meant to be inspected here.
clf.get_params(deep=False)

In [None]:
# Cross-validation results for every parameter combination.
clf.cv_results_

Not bad, so I will test it

In [None]:
# Final model, same settings as the classifier cross-validated above.
# NOTE: `degree` is only used by the 'poly' kernel, so it has no effect here.
svc = SVC(C=100, gamma=0.001, degree=3, class_weight='balanced')

In [None]:
# Train on the full dataset.
svc.fit(X, y)

In [None]:
# NOTE: computed on the data the model was trained on, so the result is an
# optimistic estimate of the real performance.
confusion_matrix(y, svc.predict(X))

In [None]:
# F1 is used as the main evaluation metric.
# NOTE: computed on the training data, so it is an optimistic estimate.
f1_score(y, svc.predict(X))

So this model should be better than the simple cosine model

In [None]:
# load the model from disk (kept disabled)
# NOTE: only unpickle files from a trusted source - loading a pickle can
# execute arbitrary code.
#filename = 'svc.sav'
#svc = pickle.load(open(filename, 'rb'))

In [None]:
# save the model and the training data to disk
folderPath = '/home/sabrine/social_tracking/models/svcEmbeddingHeight/'
filename = 'svc.sav'
# Context managers make sure each file is flushed and closed right after the
# dump instead of leaving the handle open.
with open(folderPath + filename, 'wb') as modelFile:
    pickle.dump(svc, modelFile)
# NOTE(review): X is stored flattened to a single column, so whoever loads it
# has to restore the original row width - confirm this is intended.
with open(folderPath + 'X', 'wb') as featuresFile:
    pickle.dump(X.reshape(-1, 1), featuresFile)
with open(folderPath + 'y', 'wb') as labelsFile:
    pickle.dump(y, labelsFile)