In [None]:
import json
import os
import pickle
import random
from datetime import datetime

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Seed Python's and NumPy's global random generators with the same value so that
# shuffling and any NumPy sampling are repeatable across runs.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)

# The input embeddings

The input pickle file stores a dictionary that maps each person's key to the embeddings of that person's images:
```
{
    'FAMILY_ID/PERSON_ID': [EMB_1, EMB_2, ..., EMB_N],
    ...
}
```

In [None]:
# Load the pre-computed image embeddings (a dict-like object keyed by person).
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so only
# open embedding files that come from a trusted source.
with open('data/train_img_embeddings.pkl', 'rb') as f:
    train_embeddings = pickle.load(f)
print(f'The keys examples: {list(train_embeddings.keys())[:5]}')

# Look at a single entry instead of copying every value into a list just to read
# one shape.
embedding_shape = next(iter(train_embeddings.values())).shape
print(f'Embeddings shape: {embedding_shape}')

# Generating training pairs

The training pairs available in the csv files are split into training and validation sets. Those pairs are positive (there is a blood relation). For each set (train/valid) we additionally generate negative pairs.

Positive pairs are generated according to the input csv file. For each person in a positive pair we create one negative pair.
In total we'll have twice as many negative pairs as positive pairs.

In [None]:
def make_image_pair(pair, embeddings=None):
    '''
    Lazily build every concatenated embedding pair for two persons.

    Arguments:
    pair -- 2-item sequence (p1, p2) of person identifiers in the form
            'familyID/personID'
    embeddings -- optional mapping from person key to a sequence of embeddings;
                  defaults to the notebook-level `train_embeddings`

    Yields:
    np.concatenate([e1, e2], axis=0) for every combination of an embedding e1 of
    p1 with an embedding e2 of p2, i.e. len(p1 embeddings) * len(p2 embeddings)
    items in total (p1 varies slowest).

    Raises:
    KeyError -- on the first iteration, if either person has no stored embeddings
    '''
    if embeddings is None:
        embeddings = train_embeddings

    def lookup(person):
        # The backslash-separated key is tried first, as before; fall back to the
        # identifier exactly as given so that a store keyed with '/' also works.
        backslash_key = person.replace('/', '\\')
        if backslash_key in embeddings:
            return embeddings[backslash_key]
        return embeddings[person]

    p1, p2 = pair
    dir1 = lookup(p1)
    dir2 = lookup(p2)

    for i in range(len(dir1)):
        for j in range(len(dir2)):
            yield np.concatenate([dir1[i], dir2[j]], axis=0)

In [None]:
def pairs_set(input_pairs):
    '''
    Flatten labelled person pairs into labelled embedding pairs.

    Arguments:
    input_pairs -- iterable of (pair, label) items; `pair` is handed to
                   make_image_pair unchanged

    Yields:
    (embedding_pair, label) for every item produced by make_image_pair(pair).
    A pair whose expansion raises KeyError is skipped without any message.
    '''
    for person_pair, relation in input_pairs:
        try:
            yield from ((joined, relation) for joined in make_image_pair(person_pair))
        except KeyError:
            # Skip this pair and carry on with the next one.
            continue

def batched_pairs(input_pairs, batch_size, dataset_period, drop_remainder=True):
    '''
    Sub-sample the embedding pairs from pairs_set and group them into batches.

    Arguments:
    input_pairs -- iterable of (pair, label) items, forwarded to pairs_set
    batch_size -- number of samples per yielded batch (must be >= 1)
    dataset_period -- keep only every `dataset_period`-th sample
    drop_remainder -- when True (default, the original behaviour) samples left
                      over after the last full batch are discarded; when False
                      they are yielded as a final, smaller batch

    Yields:
    (embs, labels) -- two np.ndarrays stacked from the kept samples; labels are
                      converted to int

    Raises:
    ValueError -- if batch_size < 1 (previously this buffered every sample and
                  never yielded anything)
    '''
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')

    embs, labels = [], []
    for counter, example in enumerate(pairs_set(input_pairs), start=1):
        # Get every nth sample
        if counter % dataset_period:
            continue

        emb, label = example
        embs.append(emb)
        labels.append(np.array(label, dtype=int))
        if len(labels) == batch_size:
            yield np.array(embs), np.array(labels)
            embs, labels = [], []

    # Optionally hand out the incomplete tail instead of silently losing it.
    if labels and not drop_remainder:
        yield np.array(embs), np.array(labels)
In [None]:
# Read the pre-computed train/validation split. The four values of the JSON object
# are unpacked positionally, so their order in the file matters.
with open('train_val_set.json', 'r') as f:
    train_val_set = json.load(f)

train_rlt_list, neg_train_rltshps, valid_rlt_list, neg_valid_rltshps = list(train_val_set.values())

# Attach the label to every pair: True for the listed relationships, False for the
# generated negative ones (positives first, negatives after, as before shuffling).
train_rlts = (
    [(pair, True) for pair in train_rlt_list]
    + [(pair, False) for pair in neg_train_rltshps]
)
val_rlts = (
    [(pair, True) for pair in valid_rlt_list]
    + [(pair, False) for pair in neg_valid_rltshps]
)

# Shuffle both sets in place, train first, using the seeded `random` module.
for labelled_pairs in (train_rlts, val_rlts):
    random.shuffle(labelled_pairs)

## Training Callbacks

In [None]:
def val_distance_stats(predictions, labels):
    '''
    Mean and standard deviation of the predictions, split by label.

    Arguments:
    predictions -- array-like of predicted values, indexed along axis 0 by sample
    labels -- array-like of labels aligned with `predictions`; treated as binary
              (non-zero means positive)

    Returns:
    val_pos_m, val_pos_s, val_neg_m, val_neg_s -- mean and std of the predictions
    for the positive samples, followed by mean and std for the negative samples
    '''
    predictions = np.asarray(predictions)
    # The np.bool alias no longer exists in current NumPy releases; the builtin
    # bool is the supported spelling and yields the same mask.
    pos_mask = np.asarray(labels).astype(bool)
    neg_mask = np.logical_not(pos_mask)

    val_pos = predictions[pos_mask]
    val_neg = predictions[neg_mask]
    val_pos_m, val_pos_s = np.mean(val_pos), np.std(val_pos)
    val_neg_m, val_neg_s = np.mean(val_neg), np.std(val_neg)

    return val_pos_m, val_pos_s, val_neg_m, val_neg_s
  
class MetricCallback(xgb.callback.TrainingCallback):
    '''
    XGBoost training callback that scores the validation pairs after every
    boosting iteration and writes per-class precision/recall and the overall
    accuracy to TensorBoard.

    The validation data is produced by batched_pairs from the notebook-level names
    `val_rlts`, `batch_size` and `eval_dataset_period`, looked up at call time.
    '''

    def __init__(self, logdir):
        '''
        Arguments:
        logdir -- directory that receives the 'train' and 'valid' event files;
                  created if it does not exist
        '''
        super().__init__()
        os.makedirs(logdir, exist_ok=True)
        self.train_writer = tf.summary.create_file_writer(logdir + '/train')
        self.valid_writer = tf.summary.create_file_writer(logdir + '/valid')
        self.class_encoded = {
            0: 'not_related',
            1: 'related'
        }
        # Step used as the TensorBoard x-axis. It has to be initialised here:
        # tb_writer and after_iteration both read it on the very first iteration.
        self.step_number = 0

    def tb_writer(self, items_to_write, wtype, epoch):
        '''
        Write scalar summaries at the current step.

        Arguments:
        items_to_write -- dict mapping summary name to scalar value
        wtype -- 'train' selects the train writer, anything else the valid writer
        epoch -- accepted for the caller's convenience; the step that is written
                 is self.step_number
        '''
        writer = self.train_writer if wtype == 'train' else self.valid_writer

        with writer.as_default():
            for name, value in items_to_write.items():
                tf.summary.scalar(name, value, self.step_number)
            writer.flush()

    def after_iteration(self, model, epoch, evals_log):
        '''
        Evaluate the model on the validation pairs and log the metrics.

        Returns:
        False -- training is never stopped early by this callback
        '''
        val_true = []
        val_pred = []
        for features, targets in batched_pairs(val_rlts, batch_size, eval_dataset_period):
            # Inside a callback `model` is the low-level Booster, whose predict()
            # takes a DMatrix rather than a raw array.
            val_pred.append(model.predict(xgb.DMatrix(features)))
            val_true.extend(list(targets))

        if not val_pred:
            # No validation batch was produced: nothing to score, keep training.
            self.step_number += 1
            return False

        val_true = np.array(val_true).astype(int)
        val_pred = np.concatenate(val_pred, axis=0)
        # Round the predicted scores to hard 0/1 class decisions.
        val_pred = np.around(val_pred).astype(int)

        # Precision and recall
        valid_precision, valid_recall, _, _ = precision_recall_fscore_support(val_true, val_pred, labels=[0, 1])
        valid_accuracy = accuracy_score(val_true, val_pred)

        logs = {}
        for k, v in self.class_encoded.items():
            logs['valid/precision/' + v] = valid_precision[k]
            logs['valid/recall/' + v] = valid_recall[k]

        logs['valid/accuracy'] = valid_accuracy

        self.tb_writer(logs, wtype='valid', epoch=epoch)
        self.step_number += 1
        # Explicit "do not stop" signal for the training loop.
        return False

## Run the TensorBoard plugin to track training progress

In [None]:
# Load the TensorBoard notebook extension
# (%reload_ext is used, so the extension is reloaded when this cell is run again)
%reload_ext tensorboard

In [None]:
# Launch TensorBoard on port 7008 against the ./logs/xgboost log directory
%tensorboard --logdir=./logs/xgboost --port=7008

# Run training

In [None]:
# Not referenced by any cell visible in this notebook
epochs = 2000
# Number of embedding pairs per batch yielded by batched_pairs
batch_size = 32
# Keep every 4th generated training pair (passed to batched_pairs as dataset_period)
dataset_period = 4
# Keep every 12th generated validation pair (same mechanism, for evaluation)
eval_dataset_period = 12
# Passed to XGBClassifier as learning_rate
lr = 1e-1
# Not referenced by any cell visible in this notebook
eval_period = 1000

In [None]:
# Name of this run; used for the TensorBoard sub-directory and the submission file name
model_name = 'mobile_xgboost_001'
logdir = os.path.join('logs/xgboost', model_name)
# Constructing the callback creates logdir (if missing) and its TensorBoard writers
metric_callback = MetricCallback(logdir)

In [None]:
def _collect_batches(pairs, period):
    '''
    Concatenate every batch produced by batched_pairs into two arrays.

    Arguments:
    pairs -- labelled pairs, forwarded to batched_pairs
    period -- sub-sampling period, forwarded to batched_pairs as dataset_period

    Returns:
    (features, labels) -- all batches joined along axis 0

    Raises:
    ValueError -- if not a single batch was produced
    '''
    features, targets = [], []
    for batch_x, batch_y in batched_pairs(pairs, batch_size, period):
        features.append(batch_x)
        targets.append(batch_y)
    if not features:
        # np.concatenate would fail here with a message that does not name the
        # cause, so fail with the same exception type and a useful explanation.
        raise ValueError(
            'batched_pairs produced no batch: check that the pair keys exist in '
            'the embeddings and that batch_size is not larger than the sample count'
        )
    return np.concatenate(features, axis=0), np.concatenate(targets)


train_x, train_y = _collect_batches(train_rlts, dataset_period)

val_x, val_y = _collect_batches(val_rlts, eval_dataset_period)
# Add a singleton axis at position 1 to both validation arrays and pair them up
# sample by sample.
val_x = np.expand_dims(val_x, axis=1)
val_y = np.expand_dims(val_y, 1)
val_set = list(zip(val_x, val_y))

In [None]:
# Number of boosting rounds (trees) for the scikit-learn wrapper.
n_estimators = 10
# Kept for reference only: `num_boost_round` belongs to the native xgb.train() API.
# In the scikit-learn wrapper the number of rounds is `n_estimators`, so it is no
# longer forwarded to XGBClassifier, where it is not a model parameter.
num_boost_round = 1000
xgb_cls = xgb.XGBClassifier(n_estimators=n_estimators,
                            max_depth=10,
                            learning_rate=lr,
                            use_label_encoder=False,
                            scale_pos_weight=10.)
# NOTE(review): newer XGBoost releases expect `callbacks` in the constructor rather
# than in fit() — confirm against the installed version.
xgb_cls.fit(train_x, train_y, verbose=True, callbacks=[metric_callback])

# Submission

In [None]:
# Load submission pairs
# Later cells read the 'img_pair' column of this frame and fill in 'is_related'.
submission_path = 'data/sample_submission.csv'
submission_df = pd.read_csv(submission_path)

In [None]:
# Load models
# NOTE(review): `model` and `FaceNet` are not defined or imported in any cell
# visible in this notebook (the classifier trained above is `xgb_cls`) — confirm
# where they are supposed to come from before running this section.
ckpt_path = 'checkpoints/model_6/weights.70-0.11.hdf5'
model.load_weights(ckpt_path)
embedder = FaceNet()

In [None]:
# Get the threshold according to validation ds
# NOTE(review): `model`, `val_pairs`, `val_labels` and `image` are not created by
# any cell visible in this notebook — confirm their origin before running.
val_pred = model.predict([val_pairs[:, 0], val_pairs[:, 1]])
# The np.int alias no longer exists in current NumPy releases; the builtin int is
# the equivalent spelling.
val_pos_m, val_pos_s, val_neg_m, val_neg_s = val_distance_stats(val_pred, val_labels.astype(int))
# Midpoint between (positive mean + 1 std) and (negative mean - 1 std).
threshold = ((val_pos_m + val_pos_s) + (val_neg_m - val_neg_s)) / 2

# Iterate over submission pairs
# `is_related` is kept because a later cell still refers to this name.
is_related = submission_df['is_related']
predictions = []
for idx, row in submission_df.iterrows():
    # Load images
    img_pair = row['img_pair']
    img1_name, img2_name = img_pair.split('-')
    img1_path = os.path.join('data/test', img1_name)
    img2_path = os.path.join('data/test', img2_name)
    img1 = image.load_img(img1_path)
    img2 = image.load_img(img2_path)
    img1 = np.array(img1).astype('float32')
    img2 = np.array(img2).astype('float32')

    # Get FaceNet embeddings
    embedding1 = embedder.embeddings([img1])
    embedding2 = embedder.embeddings([img2])

    # Do an inference, if distance is smaller than threshold
    # then there is the relation
    y_pred = model.predict([embedding1, embedding2])
    predictions.append(y_pred[0])
    if y_pred.squeeze() < threshold:
        # Write through .loc on the frame itself: assigning into a Series taken
        # from the frame is not guaranteed to update the frame that gets saved.
        submission_df.loc[idx, 'is_related'] = 1

    # Print step
    if idx % 100 == 0:
        print(f'Processed rows: {idx}')

submission_df.to_csv(f'submission_{model_name}.csv', index=False)

In [None]:
# Histogram of the collected submission predictions, using 20 bins.
fig, ax = plt.subplots()
ax.hist(predictions, 20)
plt.show()

In [None]:
# Manually chosen decision threshold — presumably read off the histogram of the
# predictions; confirm before reuse.
thr = 0.85
# 1 when the prediction is below the threshold, 0 otherwise.
flags = [1 if p < thr else 0 for p in predictions]
# Predictions were collected in row order, so they line up with the first
# len(flags) rows. Assign through .loc on the frame itself: writing into a Series
# taken from the frame is not guaranteed to update the frame that gets saved.
target_rows = submission_df.index[:len(flags)]
submission_df.loc[target_rows, 'is_related'] = flags
submission_df.to_csv('submission_test.csv', index=False)