In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs): pass
warnings.warn = noop

In [3]:
from collections import ChainMap
from multiprocessing import cpu_count

In [21]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.externals.joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import LabelEncoder
from tsfresh.feature_extraction.feature_calculators import *
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tqdm import tqdm_notebook as tqdm

In [29]:
from basedir import SAMPLE
from info import ID_COLS
from lightgbm_helpers import accuracy
from utils import to_feather, from_feather, split, kfolds, replace_not_numbers

In [24]:
seed = 1
np.random.seed(seed)

# Features Mining

In [12]:
def stat(f, **params):
    def wrapper(x):
        return f(x, **params)
    wrapper.__name__ = f.__name__
    return wrapper


class StatsFeatures:
    def __init__(self, funcs):
        self.funcs = funcs
    
    def __call__(self, data):
        features = {}
        for col in data.columns:
            for func in self.funcs:
                result = func(data[col].values)
                if type(result) == zip:
                    result = dict(result)
                    for key, value in result.items():
                        features[f'{col}__{key}'] = value
                elif hasattr(result, '__len__'):
                    for key, value in result:
                        features[f'{col}__{func.__name__}__{key}'] = value
                else:
                    features[f'{col}__{func.__name__}'] = result
        features = {
            k: int(v) if v in (True, False) else v 
            for k, v in features.items()}
        return features
    
class SliceFeatures:
    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n
    
    def __call__(self, data):
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            start, end = mid-div, mid+div+mod
        cols = data.columns
        vec = data.iloc[start:end].values.T.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

    
def add_euler_angles(X):
    X = X.copy()
    x, y, z, w = [X[f'orientation_{s}'] for s in list('XYZW')]
    nx, ny, nz = quaternion_to_euler(x, y, z, w)
    X['euler_X'] = nx
    X['euler_Y'] = ny
    X['euler_Z'] = nz
    return X


def quaternion_to_euler(x, y, z, w):
    t0 = 2.0*(w*x + y*z)
    t1 = 1.0 - 2.0*(x*x + y*y)
    X = np.arctan2(t0, t1)
    
    t2 = np.clip(2.0*(w*y - z*x), -1, 1)
    Y = np.arcsin(t2)
    
    t3 = 2.0*(w*z + x*y)
    t4 = 1.0 - 2.0*(y*y + z*z)
    Z = np.arctan2(t3, t4)
    
    return X, Y, Z
    
    
def generate_features(data, features, ignore=None):
    with Parallel(n_jobs=cpu_count()) as parallel:
        extracted = parallel(delayed(generate_features_for_group)(
            group=group.drop(columns=ignore or []),
            features=features
        ) for _, group in tqdm(data.groupby('series_id')))
    return pd.DataFrame(extracted)


def generate_features_for_group(group, features):
    return dict(ChainMap(*[feat(group) for feat in features]))

## Feature Calculation Functions Definition

In [13]:
chosen_features = [
    StatsFeatures(funcs=(
        mean, median, standard_deviation, variance, skewness, kurtosis, maximum, minimum,
        mean_change, mean_abs_change, count_above_mean, count_below_mean, 
        mean_second_derivative_central, sum_of_reoccurring_data_points,
        abs_energy, sum_values, sample_entropy, longest_strike_above_mean,
        longest_strike_below_mean, first_location_of_minimum, first_location_of_maximum,
        stat(partial_autocorrelation, param=[{'lag': lag} for lag in range(10)]),
        stat(agg_autocorrelation, 
             param=[{'f_agg': s, 'maxlag': 40} for s in ('mean', 'median', 'var')]),
        stat(linear_trend,
             param=[{'attr': a} for a in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')]),
        stat(index_mass_quantile, 
             param=[{'q': q} for q in (.1, .2, .3, .4, .6, .7, .8, .9)]),
        stat(fft_aggregated, 
             param=[{'aggtype': t} for t in ('centroid', 'variance', 'skew', 'kurtosis')]),
        stat(symmetry_looking, param=[{'r': r*0.05} for r in range(1, 20)]),
        *[stat(large_standard_deviation, r=r*0.05) for r in range(1, 20)],
        *[stat(autocorrelation, lag=lag) for lag in range(1, 10)], 
        *[stat(number_peaks, n=n) for n in (1, 2, 3, 5, 7, 10, 25, 50)],
        *[stat(c3, lag=lag) for lag in range(1, 5)],
        *[stat(quantile, q=q) for q in (.1, .2, .3, .4, .5, .6, .7, .8, .9)],
        *[stat(number_crossing_m, m=m) for m in (-1, 0, 1)],
        *[stat(ratio_beyond_r_sigma, r=r) for r in (0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10)],
        *[stat(value_count, value=v) for v in (-1, 0, 1)],
        *[stat(range_count, min=lo, max=hi) for (lo, hi) in [(-1, 1), (1e12, 0), (0, 1e12)]]
    )), 
    SliceFeatures('first'),
    SliceFeatures('middle'),
    SliceFeatures('last')
]

## Extending The Original Dataset

In [14]:
x_trn, x_tst = from_feather('x_trn', 'x_tst')

In [15]:
# _, g = next(iter(x_trn.groupby('series_id')))
# generate_features_for_group(g, chosen_features)

In [16]:
print('Feature extraction on train dataset')
x_trn_rich = (
    replace_not_numbers(
        generate_features(
            data=add_euler_angles(x_trn), 
            features=chosen_features, 
            ignore=ID_COLS)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))

In [17]:
print('Feature extraction on train dataset')
x_tst_rich = (
    replace_not_numbers(
        generate_features(
            data=add_euler_angles(x_tst), 
            features=chosen_features, 
            ignore=ID_COLS)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))

In [18]:
to_feather(x_trn_rich, 'trn_rich')

PosixPath('/home/ck/data/careercon2019/tmp/trn_rich.feather')

In [19]:
to_feather(x_tst_rich, 'tst_rich')

PosixPath('/home/ck/data/careercon2019/tmp/tst_rich.feather')


# Fitting The Model

In [22]:
x_trn, x_tst, y_trn = from_feather('trn_rich', 'tst_rich', 'y_trn')
enc = LabelEncoder()
y = enc.fit_transform(y_trn['surface'])

In [25]:
X_train, X_valid, y_train, y_valid = train_test_split(x_trn, y, test_size=0.1, random_state=seed)

In [26]:
model = lgb.LGBMClassifier(
    n_estimators=10000, learning_rate=0.1,
    colsample_bytree=0.4, objective='multiclass',
    num_leaves=500, num_class=9)

In [27]:
model.fit(
    X_train, y_train, 
    eval_set=[(X_valid, y_valid)], 
    eval_metric=accuracy,
    early_stopping_rounds=250,
    verbose=100)

Training until validation scores don't improve for 250 rounds.
[100]	valid_0's multi_logloss: 0.255882	valid_0's accuracy: 0.91601
[200]	valid_0's multi_logloss: 0.298802	valid_0's accuracy: 0.926509
[300]	valid_0's multi_logloss: 0.311145	valid_0's accuracy: 0.929134
Early stopping, best iteration is:
[83]	valid_0's multi_logloss: 0.252839	valid_0's accuracy: 0.918635


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=10000, n_jobs=-1, num_class=9, num_leaves=500,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [31]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(model.predict(x_tst))
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM + fft features"

100%|██████████████████████████████████████| 52.9k/52.9k [00:03<00:00, 14.5kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 