In [1]:
import os
import numpy as np
import pickle as pkl
from data import read_signatures
from data import read_pose
from utils import enumerate_paths
from utils import split_by
from solution import train_test_split
from evaluate import evaluate

In [2]:
# Locations of the training and test signature files, relative to the
# notebook's working directory.
sigs_path = 'data/signatures.pkl'
test_sigs_path = 'data/signatures_test.pkl'

In [3]:
# Load a (paths, features) pair for each set with the project helper
# `read_signatures`.
train_paths, train_features = read_signatures(sigs_path)
test_paths, test_features = read_signatures(test_sigs_path)

In [4]:
# Derive person, video and sequence ids from the training paths (project
# helper); they are used below to cut `train_features` into sequences.
person_ids, video_ids, seq_ids = enumerate_paths(train_paths)

In [5]:
# The sequence id of a test path is the integer after the last underscore of
# the path's first '/'-separated component.
test_seq_ids = [
    int(path.split('/', 1)[0].rsplit('_', 1)[-1])
    for path in test_paths
]
# Number of distinct test sequences.
len(np.unique(test_seq_ids))

6335

In [6]:
def split_by(data, person_ids, seq_ids, video_ids):
    """Cut row-aligned arrays into one chunk per run of equal sequence ids.

    NOTE: this definition shadows the `split_by` imported from `utils` at the
    top of the notebook.

    Parameters
    ----------
    data : array-like
        Rows to split along axis 0.
    person_ids, seq_ids, video_ids : 1-D array-like
        One id per row of `data`. A new chunk starts wherever two consecutive
        entries of `seq_ids` differ, so rows of a sequence must be contiguous.

    Returns
    -------
    data_train : list of arrays
        One sub-array of `data` per chunk.
    y_train : np.ndarray
        First person id of every chunk.
    videos : np.ndarray
        First video id of every chunk.

    For empty input three empty containers are returned instead of failing
    on the first-element lookup.
    """
    if len(seq_ids) == 0:
        return [], np.array([]), np.array([])

    # Positions where the sequence id changes are the split points.
    boundaries = np.flatnonzero(np.diff(seq_ids)) + 1

    data_train = np.split(data, boundaries)
    # Ids are taken from the first row of each chunk.
    y_train = np.array([chunk[0] for chunk in np.split(np.asarray(person_ids), boundaries)])
    videos = np.array([chunk[0] for chunk in np.split(np.asarray(video_ids), boundaries)])

    return data_train, y_train, videos
        
# Cut the training features into one array per sequence, together with the
# person id and video id of each sequence.
X_train, y_train, videos_train = split_by(train_features, person_ids, seq_ids, video_ids)
# Sanity check: the three outputs should have the same length.
len(X_train), len(y_train), len(videos_train)

(4807, 4807, 4807)

In [7]:
# Only sequence ids were computed for the test set, so they are passed for all
# three id arguments; `y_test` and `videos_test` therefore just hold the
# sequence id of each chunk.
X_test, y_test, videos_test = split_by(test_features, test_seq_ids, test_seq_ids, test_seq_ids)
# Should equal the number of distinct test sequence ids.
len(X_test)

6335

In [8]:
# Only the per-sequence splits are used from here on; drop the names of the
# full feature arrays.
# NOTE(review): np.split returns views, so the underlying buffers probably stay
# alive through X_train / X_test — confirm whether this frees any memory.
del train_features
del test_features

In [9]:
import gc
# Force a garbage-collection pass after dropping the names above; the
# displayed value is the number of unreachable objects found.
gc.collect()

60

In [10]:
from collections import defaultdict

def split_videos(X, y, videos):
    """Group one summary tuple per sequence under the sequence's label.

    Each entry of the returned mapping is a list of
    ``(video id, mean of the sequence over axis 0, sequence.shape[1])``.

    NOTE(review): the third tuple element is the length of the *second* axis,
    which is the same for every sequence cut from one array along axis 0. If
    it is meant to be the number of rows in the sequence it should be
    ``shape[0]`` — confirm (the test-side summary uses the same expression).
    """
    grouped = defaultdict(list)
    for sequence, label, video in zip(X, y, videos):
        summary = (video, np.mean(sequence, axis=0), sequence.shape[1])
        grouped[label].append(summary)
    return grouped

In [11]:
# Mapping: label -> list of (video id, mean feature vector, size) tuples, one
# per training sequence.
train_data = split_videos(X_train, y_train, videos_train)

In [12]:
# Number of distinct labels (keys) in the grouped training data.
len(train_data)

101

In [13]:
# One (mean over axis 0, x.shape[1]) tuple per test sequence, mirroring the
# per-sequence summary built for the training data.
# NOTE(review): x.shape[1] is the length of the second axis, not the number of
# rows in the sequence — confirm this is the intended "size".
test_data = [(np.mean(x, axis=0), x.shape[1]) for x in X_test]

In [14]:
import random
def generate_features(d, left_size, right_size):
    """Summarise a difference vector `d` plus the two sizes as 8 pair features.

    Returns a list: max, min, mean absolute value, root-mean-square and
    standard deviation of `d`, followed by `left_size`, `right_size` and
    their difference.
    """
    largest = np.max(d)
    smallest = np.min(d)
    mean_abs = np.mean(np.abs(d))
    rms = np.mean(d ** 2) ** 0.5
    spread = np.std(d)
    size_gap = left_size - right_size
    return [largest, smallest, mean_abs, rms, spread, left_size, right_size, size_gap]

# def generate_features(d, left_size, right_size):
#     return [np.mean(d ** 2) ** 0.5, left_size, right_size]

# def generate_features(d, left_size, right_size):
#     return d[::4]

def generate_train_pairs(train_data, count_iterations):
    """Sample labelled (left, right) pairs of training sequences.

    In every iteration one random sample is drawn per left label and paired
    with one random sample of every label (including its own), where the right
    sample must come from a different video than the left one.

    Parameters
    ----------
    train_data : mapping
        label -> list of ``(video id, feature vector, size)`` tuples.
    count_iterations : int
        Number of sampling rounds.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Row-stacked pair features from `generate_features` and a boolean
        array that is True where both samples share a label.

    Sampling uses the global `random` state; seed it for reproducible pairs.
    A (left, right-label) combination with no sample from another video is
    skipped instead of crashing `random.choice` on an empty list.
    """
    pairs = []
    same_label = []
    for _ in range(count_iterations):
        for label_left, left_samples in train_data.items():
            left_video, left_x, left_size = random.choice(left_samples)
            for label_right, right_samples in train_data.items():
                # Never pair a sample with one from the same video.
                candidates = [
                    (x, size)
                    for right_video, x, size in right_samples
                    if right_video != left_video
                ]
                if not candidates:
                    continue
                right_x, right_size = random.choice(candidates)
                features = np.array(generate_features(left_x - right_x, left_size, right_size))
                pairs.append(features)
                same_label.append(label_left == label_right)
    # np.row_stack is a deprecated alias of np.vstack.
    return np.vstack(pairs), np.array(same_label)

                                    
def generate_test_pairs(train_data, test_data, count_iterations):
    """Pair one random training sample per label with every test sequence.

    Parameters
    ----------
    train_data : mapping
        label -> list of ``(video id, feature vector, size)`` tuples.
    test_data : sequence
        ``(feature vector, size)`` tuples, one per test sequence.
    count_iterations : int
        Number of sampling rounds; each round redraws the training samples.

    Returns
    -------
    (np.ndarray, np.ndarray, np.ndarray)
        Row-stacked pair features from `generate_features`, the training
        label of each pair and the position of its test sequence in
        `test_data`.

    Sampling uses the global `random` state; seed it for reproducible pairs.
    """
    pairs = []
    label = []
    indices = []
    for _ in range(count_iterations):
        for label_left, left_samples in train_data.items():
            _, left_x, left_size = random.choice(left_samples)
            for index_test, (right_x, right_size) in enumerate(test_data):
                features = np.array(generate_features(left_x - right_x, left_size, right_size))
                pairs.append(features)
                label.append(label_left)
                indices.append(index_test)

    # np.row_stack is a deprecated alias of np.vstack.
    return np.vstack(pairs), np.array(label), np.array(indices)

In [15]:
# Prior probability of each label: its share of all training sequences.
# The dict is keyed by the label itself, because it is later looked up with
# label values; keying by enumerate position is only right when the labels
# happen to be 0..n-1 in insertion order. The total is computed once instead
# of once per label.
_n_train_sequences = sum(len(samples) for samples in train_data.values())
p_prior = {label: len(samples) / _n_train_sequences for label, samples in train_data.items()}

In [16]:
# Sample 50 rounds of training pairs.
X_pairs_train, y_pairs_train = generate_train_pairs(train_data, count_iterations=50)
# Show both shapes and the fraction of same-label (positive) pairs.
X_pairs_train.shape, y_pairs_train.shape, y_pairs_train.mean()

((510050, 8), (510050,), 0.009900990099009901)

In [17]:
from catboost import CatBoostClassifier

# Boosting hyper-parameters, kept apart from the device/runtime settings.
params = dict(iterations=10000, learning_rate=0.2)

model = CatBoostClassifier(
    task_type="GPU",
    devices='0:1',
    eval_metric='Logloss',
    thread_count=8,
    **params,
)
# The pair labels are booleans; cast them to 0/1 for the classifier. The call
# is the cell's last expression, so the fitted model's repr is displayed.
model.fit(X_pairs_train, y_pairs_train.astype(int), verbose=False)

<catboost.core.CatBoostClassifier at 0x7f757950edd0>

In [18]:
import pandas as pd
import tqdm

# `tqdm.tqdm_notebook` is deprecated; `tqdm.auto` picks the notebook widget
# when it is available and falls back to the text bar otherwise. The alias
# avoids shadowing the `tqdm` module imported above.
from tqdm.auto import tqdm as progress_bar

# Score 25 independent rounds of (training sample, test sequence) pairs; each
# round redraws the training sample used for every label.
p_test_full = []
for _round in progress_bar(range(25)):
    X_pairs_test, labels, indices = generate_test_pairs(train_data, test_data, count_iterations=1)

    # Column 1 of predict_proba is the probability of the positive class,
    # i.e. "same label".
    p_test = pd.DataFrame(
        np.column_stack([model.predict_proba(X_pairs_test)[:, 1], labels, indices]),
        columns=['p', 'label', 'index'],
    )
    # Weight each pair probability by the prior of its training label.
    p_test['p_prior'] = p_test['label'].replace(p_prior)
    p_test['p'] *= p_test['p_prior']
    p_test_full.append(p_test)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))

KeyboardInterrupt: 

In [19]:
# Stack the per-round prediction frames into one long frame with a fresh
# 0..n-1 row index.
p_test = pd.concat(p_test_full, axis=0, ignore_index=True)

In [20]:
# Despite its name, `max_p` holds the *mean* weighted probability of every
# (test sequence, candidate label) combination across the pooled rounds.
max_p = p_test.groupby(['index', 'label'])[['p']].mean()

# Rank the negated scores inside each test sequence so that rank 1 is the most
# probable label.
ranked = (-max_p).groupby('index').rank().reset_index()
# Keep the five best-ranked labels of each test sequence.
top_five = ranked.sort_values(['index', 'p']).groupby('index').head(5)
submission = top_five.groupby('index')['label'].apply(
    lambda labels: [int(label) for label in labels]
)
# Plain list of per-sequence label lists, ordered by test sequence index.
submission = list(submission.values)

In [21]:
from evaluate import submit
# Pass the top-5 label lists to the project's `submit` helper under the member
# name 'Solo'; the returned dict is displayed as the cell output.
submit('Solo', submission)

{'member': 'Solo', 'rank': 30, 'score': 8.573200992555831}
