In [1]:
import os
import numpy as np
import pickle as pkl
from data import read_signatures
from data import read_pose
from utils import enumerate_paths
from utils import split_by
from solution import train_test_split
from evaluate import evaluate

In [2]:
# Locations of the train and test signature files passed to read_signatures below.
sigs_path = 'data/signatures.pkl'
test_sigs_path = 'data/signatures_test.pkl'

In [3]:
# Each call returns a (paths, features) pair.
# Presumably one feature row per path, in matching order — TODO confirm in data.read_signatures.
train_paths, train_features = read_signatures(sigs_path)
test_paths, test_features = read_signatures(test_sigs_path)

In [4]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load dumps that come from a trusted source.
# The loaded object is indexed by signature path when the quality masks are built.
mean_scores = joblib.load('mean_scores.dump')



In [5]:
# Minimum mean score a path must reach to survive the quality filter below.
threshold = -0.5
# Score recorded for each threshold value that was tried; the chosen -0.5 has
# the highest of the four and matches the score reported by the final submit.
# -0.7 -> 25.75682382133995
# -0.6 -> 25.725806451612904
# -0.5 -> 26.265508684863523
# -0.4 -> 25.53349875930521

In [6]:
# Boolean masks: True where the path's entry in mean_scores reaches the threshold.
# A path that is missing from mean_scores raises KeyError here.
train_good = np.array([mean_scores[path] >= threshold for path in train_paths])
test_good = np.array([mean_scores[path] >= threshold for path in test_paths])
# Fraction of train / test paths that pass the filter.
np.mean(train_good), np.mean(test_good)

(0.9612984182544662, 0.9397962675900389)

In [7]:
# Keep only the training rows that passed the quality filter.
# NOTE(review): the names are rebound in place, so this cell is not re-runnable:
# a second run would index the already-filtered arrays with the full-length mask.
train_paths = np.array(train_paths)[train_good]
train_features = np.array(train_features)[train_good]

In [8]:
# Keep only the test rows that passed the quality filter.
# NOTE(review): the names are rebound in place, so this cell is not re-runnable:
# a second run would index the already-filtered arrays with the full-length mask.
test_paths = np.array(test_paths)[test_good]
test_features = np.array(test_features)[test_good]

In [9]:
# enumerate_paths returns three values, unpacked here as person, video and sequence ids.
# Presumably each is aligned element-wise with train_paths — TODO confirm in utils.enumerate_paths.
person_ids, video_ids, seq_ids = enumerate_paths(train_paths)

In [10]:
# The video id is the integer after the last underscore of the first path component.
test_video_ids = [
    int(path.partition('/')[0].rpartition('_')[2]) for path in test_paths
]
# Number of distinct test videos that survived the quality filter.
len(np.unique(test_video_ids))

6220

In [11]:
def split_by(data, person_ids, video_ids):
    """Group consecutive rows of ``data`` into one chunk per video.

    A new chunk starts wherever two neighbouring entries of ``video_ids``
    differ, so the rows of a video must be contiguous.

    NOTE: this definition shadows the ``split_by`` imported from ``utils`` in
    the notebook's import cell.
    NOTE(review): the split looks at ``video_ids`` only. If two different
    people can have the same video id in adjacent rows, their rows would be
    merged into one chunk — confirm that video ids are unique across people.

    Args:
        data: array-like whose first axis is aligned with the two id sequences.
        person_ids: per-row label; the first value of each chunk becomes the
            chunk's label.
        video_ids: per-row video identifier used to find chunk boundaries.

    Returns:
        A tuple ``(chunks, labels)``: ``chunks`` is a list of sub-arrays of
        ``data`` (one per video) and ``labels`` is a NumPy array holding one
        label per chunk. Empty input yields ``([], np.array([]))``.
    """
    video_ids = np.asarray(video_ids)
    person_ids = np.asarray(person_ids)

    # Without this guard np.split would return a single empty chunk and taking
    # its first label would raise IndexError.
    if video_ids.size == 0:
        return [], np.array([])

    # Positions where the video id changes; +1 turns them into chunk start indices.
    sections = np.flatnonzero(np.diff(video_ids)) + 1

    chunks = list(np.split(data, sections))
    labels = np.array([ids[0] for ids in np.split(person_ids, sections)])
    return chunks, labels
        
# One feature chunk per training video; y_train holds the first person id of each chunk.
X_train, y_train = split_by(train_features, person_ids, video_ids)
len(X_train), len(y_train)

(432, 432)

In [12]:
# The video ids are passed as both the label and the grouping key, so y_test
# holds the video id of each X_test chunk, in the same order as X_test.
X_test, y_test = split_by(test_features, test_video_ids, test_video_ids)
len(X_test), len(y_test)

(6220, 6220)

In [13]:
# Drop the flat feature arrays; only the per-video chunks are used from here on.
# NOTE(review): the chunks come from np.split, which returns views, so the
# underlying buffers may stay alive despite the del — confirm the memory gain.
del train_features
del test_features

In [14]:
import gc
# Run a full garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

80

In [15]:
# Number of consecutive rows of a video that are averaged into one sample.
videos_objects_size = 25


def split_videos(X, y, videos_objects_size):
    """Average each video's rows in fixed-size groups to build samples.

    Every array in ``X`` is cut along its first axis into consecutive groups of
    ``videos_objects_size`` rows (the last group may be shorter). Each group is
    replaced by its mean over axis 0, and the video's label is repeated for
    every group.

    Args:
        X: iterable of arrays, one per video.
        y: iterable of labels aligned with ``X``.
        videos_objects_size: number of rows averaged into one sample.

    Returns:
        A tuple ``(data, labels)``: ``data`` stacks all group means row-wise
        and ``labels`` is a NumPy array with one label per row of ``data``.

    Raises:
        ValueError: if no sample is produced at all (``np.vstack`` refuses an
            empty sequence).
    """
    data = []
    labels = []
    for rows, label in zip(X, y):
        # Ceiling division: number of groups needed to cover every row.
        n_groups = (rows.shape[0] - 1) // videos_objects_size + 1
        for i in range(n_groups):
            start = i * videos_objects_size
            group = rows[start:start + videos_objects_size]
            data.append(np.mean(group, axis=0))
            labels.append(label)

    # np.vstack replaces np.row_stack, which is a deprecated alias of it.
    return np.vstack(data), np.array(labels)

In [16]:
# Turn every training video into averaged fixed-size samples; the label repeats per sample.
train_data, train_labels = split_videos(X_train, y_train, videos_objects_size)
train_data.shape, train_labels.shape

((6045, 2048), (6045,))

In [17]:
from sklearn.neighbors import KNeighborsClassifier

def weights(distances, eps=1e-12):
    """Inverse-square distance weights for k-NN voting.

    Distances are clamped to ``eps`` from below before inversion. Without the
    clamp, a query that coincides with a training sample (distance 0) gets an
    infinite weight, which turns the normalised class probabilities into NaN.

    Args:
        distances: array-like of non-negative neighbour distances.
        eps: smallest distance used in the inversion; a zero distance yields
            the large but finite weight ``eps ** -2``.

    Returns:
        Float array of the same shape as ``distances``.
    """
    # Cast to float: a negative power of an integer array raises ValueError.
    distances = np.asarray(distances, dtype=float)
    return np.maximum(distances, eps) ** (-2)

# k-NN hyper-parameters: Minkowski power p=2, 20 neighbours, and the custom
# `weights` callable defined above for neighbour weighting.
params = {
    'p': 2,
    'n_neighbors': 20,
    'weights': weights,
}
# NOTE(review): n_jobs=8 is hard-coded; consider a config constant.
model = KNeighborsClassifier(**params, n_jobs=8)
model.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=8, n_neighbors=20, p=2,
                     weights=<function weights at 0x7fd722a76290>)

In [18]:
# Same averaging as for training; the repeated labels are not needed and are discarded.
test_data, _ = split_videos(X_test, y_test, videos_objects_size)
test_data.shape

(11830, 2048)

In [19]:
# Class probabilities for every averaged test sample (one row per row of test_data).
p_test = model.predict_proba(test_data)

In [20]:
def average_predictions_split(X, p_all, videos_objects_size, f=np.mean):
    """Reduce sample-level predictions to one row per video.

    ``p_all`` is expected to hold, in order, one row for every group of
    ``videos_objects_size`` rows of each array in ``X``. The rows that belong
    to the same video are combined with ``f`` along axis 0.

    Args:
        X: sequence of per-video arrays; only their first-axis lengths are used.
        p_all: array of per-sample predictions, concatenated over videos.
        videos_objects_size: group size that was used to create the samples.
        f: reducer called as ``f(rows, axis=0)``; defaults to ``np.mean``.

    Returns:
        Array with one reduced prediction row per element of ``X``.
    """
    # Ceiling division: number of samples each video contributed.
    samples_per_video = [
        (x.shape[0] - 1) // videos_objects_size + 1 for x in X
    ]
    # Cumulative counts (minus the last) are the split points between videos.
    sections = np.cumsum(samples_per_video)[:-1]
    per_video = np.split(p_all, sections, axis=0)
    # np.vstack replaces np.row_stack, which is a deprecated alias of it.
    return np.vstack([f(rows, axis=0) for rows in per_video])

In [21]:
# Collapse sample-level probabilities into one row per test video.
p_test_grouped = average_predictions_split(X_test, p_test, videos_objects_size, f=np.mean)
# Ascending argsort of the probabilities: the last columns are the most likely classes.
ranking = p_test_grouped.argsort(axis=1)
# The reversed slice takes the five most likely columns, best first. These are
# column positions of predict_proba, not labels, so they are mapped through
# model.classes_. This is a no-op when the labels are exactly 0..n_classes-1
# and a fix when some label value is absent from the training labels.
submission = model.classes_[ranking[:, :-6:-1]].tolist()

In [22]:
from collections import Counter

# Most frequent top-5 list among the predictions; used later as the fallback
# answer for test ids that have no prediction of their own.
best_constant = list(Counter(map(tuple, submission)).most_common(1)[0][0])
best_constant

[68, 100, 36, 27, 28]

In [23]:
# The ids must be in the same order as `submission`, which follows X_test
# (videos in order of appearance in test_paths). np.unique would return the ids
# sorted instead and mis-pair predictions whenever test_paths is not ordered by
# numeric video id. y_test holds the video id of each X_test chunk in that order.
predicted_ids = np.asarray(y_test)

In [24]:
# Map video id -> top-5 list. This pairing is positional, so predicted_ids must
# be in the same order as the rows of submission.
submission_predicted = dict(zip(predicted_ids, submission))

In [25]:
# Length of the final submission: ids 0 .. total_test_count - 1 each get an entry.
# NOTE(review): hard-coded; presumably the number of test videos before filtering — confirm.
total_test_count = 6335

In [26]:
# Ids without a prediction of their own fall back to the most frequent top-5 list;
# existing entries are left untouched.
for i in range(total_test_count):
    submission_predicted.setdefault(i, best_constant)

In [27]:
# Flatten the id -> prediction mapping into a list ordered by id 0 .. total_test_count - 1.
submission_final = [submission_predicted[i] for i in range(total_test_count)]

In [28]:
from evaluate import submit
# Send the final list under the member name 'Solo'; the call's return value is displayed.
submit('Solo', submission_final)

{'member': 'Solo', 'rank': 10, 'score': 26.265508684863523}
