In [1]:
import json
import os
import pickle
import random
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn import preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Seed Python's and NumPy's global generators with one shared value so the
# random permutations further down come out the same on every run.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

2021-09-19 11:28:29.726284: I tensorflow/stream_executor/platform/default/dso_loader.cc:54] Successfully opened dynamic library libcudart.so.11.0


# The input embeddings

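The input pickle file holds a nested dictionary: each `FAMILY_ID/PERSON_ID` key maps to a dictionary whose values are that person's embeddings (one 512-dimensional vector each):
```
{
    'FAMILY_ID/PERSON_ID': {KEY_1: EMB_1, KEY_2: EMB_2, ..., KEY_N: EMB_N},
    .
    .
    .
}
```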

In [2]:
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open embedding files that come from a trusted source.
with open('../data/train_img_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)

person_keys = list(train_embeddings.keys())
print(f'The keys examples: {person_keys[:5]}')

# Each person maps to a dict of embeddings; look at the first embedding of
# the first person to report the embedding shape.
first_person_embs = list(train_embeddings.values())[0]
embedding_shape = list(first_person_embs.values())[0].shape
print(f'Embeddings shape: {embedding_shape}')

The keys examples: ['F0475/MID3', 'F0475/MID7', 'F0475/MID6', 'F0475/MID4', 'F0475/MID2']
Embeddings shape: (512,)


# Generating training pairs

The positive pairs (people with a blood relation) listed in the csv files are split into train and validation sets. For each set (train/valid) we additionally generate negative pairs.

Positive pairs are taken from the input csv file. For each person in a positive pair we create one negative pair.
In total we'll have twice as many negative pairs as positive pairs.

In [3]:
def make_embs_pair(pair):
    '''
    Yield concatenated embedding pairs for two persons.

    Every embedding of the first person is combined with every embedding of
    the second person, and each combination is emitted in both orders
    (first+second, then second+first).

    Arguments:
    pair -- two keys of `train_embeddings` (the notebook uses 'familyID/personID' strings)

    Yields:
    the result of np.concatenate over the two embeddings along axis 0

    Raises:
    KeyError -- on the first iteration step, if either key is missing from `train_embeddings`
    '''
    first_key, second_key = pair

    first_person = train_embeddings[first_key].values()
    second_person = train_embeddings[second_key].values()

    for left in first_person:
        for right in second_person:
            # Emit both orderings of the same combination.
            for ordered in ((left, right), (right, left)):
                yield np.concatenate(ordered, axis=0)

In [4]:
def pairs_set(input_pairs):
    '''
    Yield (embedding pair, label) examples for every labelled pair.

    Arguments:
    input_pairs -- iterable of (pair, label) items; `pair` is passed to make_embs_pair

    Yields:
    (emb, label) -- one item per array produced by make_embs_pair(pair)

    Pairs for which make_embs_pair raises KeyError are skipped.
    '''
    for pair, label in input_pairs:
        try:
            yield from ((emb, label) for emb in make_embs_pair(pair))
        except KeyError:
            # No embeddings stored for at least one person of this pair.
            pass

def batched_pairs(input_pairs, batch_size, dataset_period, drop_remainder=True):
    '''
    Group the examples produced by pairs_set into fixed-size batches.

    Arguments:
    input_pairs -- iterable of (pair, label) items, forwarded to pairs_set
    batch_size -- number of examples per yielded batch
    dataset_period -- keep only every `dataset_period`-th example (1 keeps all)
    drop_remainder -- when True (default, the historical behaviour) examples left
                      over after the last full batch are discarded; when False
                      they are yielded as one final, smaller batch

    Yields:
    (embs, labels) -- two NumPy arrays built from the collected embeddings and
                      from the labels converted to int
    '''
    embs, labels = [], []
    for position, (emb, label) in enumerate(pairs_set(input_pairs), start=1):
        # Get every nth sample
        if position % dataset_period:
            continue

        embs.append(emb)
        labels.append(np.array(label, dtype=int))
        if len(labels) == batch_size:
            yield np.array(embs), np.array(labels)
            embs, labels = [], []

    # Without this the tail of the dataset (up to batch_size - 1 examples)
    # is silently lost; callers opt in so existing results do not change.
    if labels and not drop_remainder:
        yield np.array(embs), np.array(labels)

In [5]:
# Read the stored train/validation relationship lists.
with open('../train_val_set.json', 'r') as f:
    train_val_set = json.load(f)

# NOTE(review): the unpacking depends on the order of the values in the JSON
# object (train positives, train negatives, valid positives, valid negatives)
# -- confirm against the file that was written.
train_rlt_list, neg_train_rltshps, valid_rlt_list, neg_valid_rltshps = list(train_val_set.values())

# Attach the label to every pair: True for the listed relationships, False for the negative ones.
train_rlts = [(pair, True) for pair in train_rlt_list] + [(pair, False) for pair in neg_train_rltshps]
val_rlts = [(pair, True) for pair in valid_rlt_list] + [(pair, False) for pair in neg_valid_rltshps]

# Run training

In [6]:
def _collect_batches(rlts):
    '''Run batched_pairs over `rlts` (batch size 32, period 1) and stack all batches.'''
    xs, ys = [], []
    for batch_x, batch_y in batched_pairs(rlts, 32, 1):
        xs.append(batch_x)
        ys.append(batch_y)
    return np.concatenate(xs, axis=0), np.concatenate(ys)


def _shuffled(x, y):
    '''Apply one random permutation (NumPy global generator) to both arrays.'''
    order = np.random.permutation(len(x))
    return x[order], y[order]


# Generate and permute the train dataset first, then the val dataset, so the
# global NumPy generator is consumed in a fixed order.
train_x, train_y = _shuffled(*_collect_batches(train_rlts))
val_x, val_y = _shuffled(*_collect_batches(val_rlts))

print(f'Train dataset length: {train_y.shape}')
print(f'Valid dataset length: {val_y.shape}')

Train dataset length: (665728,)
Valid dataset length: (312672,)


In [None]:
# Fitted classifiers, keyed by the name used in the evaluation report.
models = {}

# SVM: standardise the features, then fit a class-balanced SVC with
# probability estimates enabled.
svc = SVC(
    gamma='auto',
    class_weight='balanced',
    probability=True,
    C=0.2,
    verbose=True,
)
clf = make_pipeline(StandardScaler(), svc)
clf.fit(train_x, train_y)
models['SVM'] = clf

[LibSVM].................................................................................................................................................................................................................................................................................................*..............................*..*
optimization finished, #iter = 321094
obj = -36837.570737, rho = 0.786942
nSV = 263100, nBSV = 214198
Total nSV = 263100


In [None]:
# Logistic regression (saga solver, C=0.1) on the raw pair features.
lr = LogisticRegression(random_state=0, solver='saga', C=0.1)
lr.fit(train_x, train_y)
models['LogisticRegression'] = lr

In [None]:
# Quadratic discriminant analysis with reg_param=0.5.
qd = QuadraticDiscriminantAnalysis(reg_param=0.5, tol=1e-10)
qd.fit(train_x, train_y)
models['QuadraticDiscriminantAnalysis'] = qd

In [None]:
# Random forest: 100 trees of depth 2.
rfc = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0).fit(train_x, train_y)
# Register the forest itself; the cell used to store `qd` here, so the report
# labelled 'RandomForestClassifier' was evaluating the QDA model.
models['RandomForestClassifier'] = rfc

In [None]:
# Gaussian naive Bayes with default settings.
gclf = GaussianNB().fit(train_x, train_y)
# Register the naive Bayes model itself; the cell used to store `qd` here, so
# the report labelled 'GaussianNB' was evaluating the QDA model.
models['GaussianNB'] = gclf

In [None]:
# Display the class predictions of `clf` on the validation features.
clf.predict(val_x)

In [None]:
def _precision_recall_accuracy(y_true, y_prob):
    '''
    Score class probabilities against the true labels.

    The predicted class is the column index with the highest probability.
    Returns the precision and recall reported for labels 0 and 1, and the accuracy.
    '''
    y_hat = np.argmax(y_prob, axis=1)
    precision, recall, _, _ = precision_recall_fscore_support(y_true, y_hat, labels=[0, 1])
    return precision, recall, accuracy_score(y_true, y_hat)


# Report train and validation metrics for every registered model.
for name, model in models.items():
    train_precision, train_recall, train_accuracy = _precision_recall_accuracy(
        train_y, model.predict_proba(train_x))
    valid_precision, valid_recall, valid_accuracy = _precision_recall_accuracy(
        val_y, model.predict_proba(val_x))

    print(f'Model: {name}')
    print(f'Train - precision: {train_precision}, recall: {train_recall}, accuracy: {train_accuracy}')
    print(f'Valid - precision: {valid_precision}, recall: {valid_recall}, accuracy: {valid_accuracy}')

# Submission

In [None]:
# Load submission pairs
# `pd` is pandas; it has to be imported in the notebook's imports cell.
submission_path = 'data/sample_submission.csv'
submission_df = pd.read_csv(submission_path)

# Fail fast if the file lacks the columns the submission cells read ('img_pair')
# and overwrite ('is_related').
missing_cols = {'img_pair', 'is_related'} - set(submission_df.columns)
assert not missing_cols, f'{submission_path} is missing columns: {sorted(missing_cols)}'

In [None]:
# Load models
# NOTE(review): the only earlier binding of `model` visible in this notebook is
# the loop variable of the evaluation cell -- confirm which object is meant to
# receive these weights.
ckpt_path = 'checkpoints/model_6/weights.70-0.11.hdf5'
model.load_weights(ckpt_path)
# NOTE(review): `FaceNet` is not imported in the visible imports cell -- confirm
# where it comes from.
embedder = FaceNet()

In [None]:
# Get the threshold according to validation ds
# NOTE(review): `val_pairs`, `val_labels` and `val_distance_stats` are not
# defined in the visible notebook -- confirm they exist before running this cell.
val_pred = model.predict([val_pairs[:, 0], val_pairs[:, 1]])
# The `np.int` alias has been removed from NumPy; the builtin `int` is the dtype it stood for.
val_pos_m, val_pos_s, val_neg_m, val_neg_s = val_distance_stats(val_pred, val_labels.astype(int))
# NOTE(review): `threshold` is computed here but not used anywhere else in this cell.
threshold = ((val_pos_m + val_pos_s) + (val_neg_m - val_neg_s)) / 2

# Iterate over submission pairs
is_related = submission_df['is_related']
predictions = []
for idx, row in submission_df.iterrows():
    # Load images
    # NOTE(review): `image` is not imported in the visible imports cell -- confirm its origin.
    img_pair = row['img_pair']
    img1_name, img2_name = img_pair.split('-')
    img1_path = os.path.join('data/test', img1_name)
    img2_path = os.path.join('data/test', img2_name)
    img1 = image.load_img(img1_path)
    img2 = image.load_img(img2_path)
    img1 = np.array(img1).astype('float32')
    img2 = np.array(img2).astype('float32')

    # Get FaceNet embeddings
    embedding1 = embedder.embeddings([img1])
    embedding2 = embedder.embeddings([img2])

    # Do an inference, if distance is smaller than threshold
    # then there is the relation
    # NOTE(review): the classifiers fitted in this notebook were trained on one
    # matrix of concatenated embeddings -- confirm that passing a list of two
    # embeddings matches the object bound to `model`.
    y_pred = model.predict_proba([embedding1, embedding2])
    predictions.append(y_pred[0])
    # Write through the frame itself; assigning into the extracted column
    # Series is not guaranteed to reach submission_df.
    submission_df.loc[idx, 'is_related'] = y_pred[0]

    # Print step
    if idx % 100 == 0:
        print(f'Processed rows: {idx}')

# Re-extract the column so the name `is_related` reflects the values just written.
is_related = submission_df['is_related']

submission_df.to_csv('submission_classic.csv', index=False)

In [None]:
# Histogram of the collected submission predictions, 20 bins.
fig, ax = plt.subplots()
ax.hist(predictions, 20)
plt.show()

In [None]:
# Turn the raw predictions into 0/1 labels and save a second submission file.
# NOTE(review): values *below* `thr` are labelled as related, which fits a
# distance-like score; confirm this direction matches what `predictions` holds.
thr = 0.85
for i, p in enumerate(predictions):
    # Write through the frame itself instead of a Series extracted in another
    # cell, so the saved file always reflects the labels computed here.
    submission_df.loc[i, 'is_related'] = 1 if p < thr else 0
submission_df.to_csv('submission_test.csv', index=False)