In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs): pass
warnings.warn = noop

In [3]:
from collections import ChainMap
from itertools import product
from multiprocessing import cpu_count

In [4]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.externals.joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, ParameterSampler
from sklearn.preprocessing import LabelEncoder
from tsfresh.feature_extraction.feature_calculators import *
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tqdm import tqdm_notebook as tqdm

In [5]:
from basedir import SAMPLE
from info import ID_COLS
from lightgbm_helpers import accuracy
from utils import to_feather, from_feather, split, kfolds, replace_not_numbers

In [6]:
seed = 1
np.random.seed(seed)

# Features Mining

In [7]:
def stat(f, **params):
    def wrapper(x):
        return f(x, **params)
    wrapper.__name__ = f.__name__
    return wrapper


class StatsFeatures:
    def __init__(self, funcs):
        self.funcs = funcs
    
    def __call__(self, data):
        features = {}
        for col in data.columns:
            for func in self.funcs:
                result = func(data[col].values)
                if type(result) == zip:
                    result = dict(result)
                    for key, value in result.items():
                        features[f'{col}__{key}'] = value
                elif hasattr(result, '__len__'):
                    for key, value in result:
                        features[f'{col}__{func.__name__}__{key}'] = value
                else:
                    features[f'{col}__{func.__name__}'] = result
        features = {
            k: int(v) if v in (True, False) else v 
            for k, v in features.items()}
        return features
    
class SliceFeatures:
    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n
    
    def __call__(self, data):
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            start, end = mid-div, mid+div+mod
        cols = data.columns
        vec = data.iloc[start:end].values.T.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

    
def add_euler_angles(X):
    X = X.copy()
    x, y, z, w = [X[f'orientation_{s}'] for s in list('XYZW')]
    nx, ny, nz = quaternion_to_euler(x, y, z, w)
    X['euler_X'] = nx
    X['euler_Y'] = ny
    X['euler_Z'] = nz
    return X


def quaternion_to_euler(x, y, z, w):
    t0 = 2.0*(w*x + y*z)
    t1 = 1.0 - 2.0*(x*x + y*y)
    X = np.arctan2(t0, t1)
    
    t2 = np.clip(2.0*(w*y - z*x), -1, 1)
    Y = np.arcsin(t2)
    
    t3 = 2.0*(w*z + x*y)
    t4 = 1.0 - 2.0*(y*y + z*z)
    Z = np.arctan2(t3, t4)
    
    return X, Y, Z
    
    
def generate_features(data, features, ignore=None):
    with Parallel(n_jobs=cpu_count()) as parallel:
        extracted = parallel(delayed(generate_features_for_group)(
            group=group.drop(columns=ignore or []),
            features=features
        ) for _, group in tqdm(data.groupby('series_id')))
    return pd.DataFrame(extracted)


def generate_features_for_group(group, features):
    return dict(ChainMap(*[feat(group) for feat in features]))

## Feature Calculation Functions Definition

In [8]:
chosen_features = [
    StatsFeatures(funcs=(
        stat(fft_aggregated, param=[
            {'aggtype': s} 
            for s in ('centroid', 'variance', 'skew', 'kurtosis')
        ]),
        stat(fft_coefficient, param=[
            {'coeff': k, 'attr': a}
            for k, a in product(range(100), ('real', 'imag', 'abs', 'angle'))
        ]),
        # more features
        stat(cwt_coefficients, param=[
            {'widths': width, 'coeff': coef, 'w': w}
            for width in [(2, 5, 10, 20)]
            for coeff in range(15)
            for w in (2, 5, 10, 20)
        ]),
        stat(spkt_welch_density, param=[{'coeff': k} for k in (2, 5, 8)])
    )), 
]

In [58]:
tsfresh_default = [
    *[stat(time_reversal_asymmetry_statistic, lag=lag) for lag in range(1, 4)],
    *[stat(c3, lag=lag) for lag in range(1, 4)],
    stat(cid_ce, normalize=True), stat(cid_ce, normalize=False),
    stat(symmetry_looking, param=[{'r': r*0.05} for r in range(1, 20)]),
    *[stat(quantile, q=q) for q in (.1, .2, .3, .4, .6, .7, .8, .9)],
    *[stat(autocorrelation, lag=lag) for lag in range(10)],
    stat(agg_autocorrelation, param=[
        {'f_agg': s, 'maxlag': 40} for s in ('mean', 'median', 'var')
    ]),
    stat(partial_autocorrelation, param=[
        {'lag': lag} for lag in range(10)
    ]),
    *[stat(number_cwt_peaks, n=n) for n in (1, 5)],
    *[stat(number_peaks, n=n) for n in (1, 3, 5, 10, 50)],
    *[stat(binned_entropy, max_bins=b) for b in [10]],
    stat(index_mass_quantile, param=[{'q': q} for q in (.1, .2, .3, .4, .6, .7, .8, .9)]),
    stat(cwt_coefficients, param=[
        {'widths': width, 'coeff': coeff, 'w': w}
        for width in [(2, 5, 10, 20)]
        for coeff in range(15)
        for w in (2, 5, 10, 20)
    ]),
    stat(spkt_welch_density, param=[{'coeff': k} for k in (2, 5, 8)]),
    stat(ar_coefficient, param=[
        {'coeff': coeff, 'k': k} for coeff in range(5) for k in [10]
    ]),
    *[stat(change_quantiles, ql=ql, qh=qh, isabs=b, f_agg=f)
        for ql in (0., .2, .4, .6, .8)
        for qh in (.2, .4, .6, .8, 1.)
        for b in (False, True)
        for f in ('mean', 'var')
    ],
    stat(fft_aggregated, param=[
        {'aggtype': s} 
        for s in ('centroid', 'variance', 'skew', 'kurtosis')
    ]),
    stat(fft_coefficient, param=[
        {'coeff': k, 'attr': a}
        for k, a in product(range(100), ('real', 'imag', 'abs', 'angle'))
    ]),
    *[stat(value_count, value=v) for v in (-1, 0, 1)],
    *[stat(range_count, min=lo, max=hi) for lo, hi in [(-1, 1), (1e12, 0), (0, 1e12)]],
    *[stat(approximate_entropy, m=2, r=r) for r in (.1, .3, .5, .7, .9)],
    stat(friedrich_coefficients, param=[
        {'coeff': coeff, 'm': 3, 'r': 30}
        for coeff in range(4)
    ]),
    stat(max_langevin_fixed_point, m=3, r=30),
    stat(linear_trend, param=[
        {'attr': a} for a in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')
    ]),
    stat(agg_linear_trend, param=[
        {'attr': attr, 'chunk_len': i, 'f_agg': f}
        for attr in ('rvalue', 'intercept', 'slope', 'stderr')
        for i in (5, 10, 50)
        for f in ('max', 'min', 'mean', 'var')
    ]),
    stat(augmented_dickey_fuller, param=[
        {'attr': a} 
        for a in ('teststat', 'pvalue', 'usedlag')
    ]),
    *[stat(number_crossing_m, m=m) for m in (-1, 0, 1)],
    stat(energy_ratio_by_chunks, param=[
        {'num_segments': 10, 'segment_focus': i}
        for i in range(10)
    ]),
    *[stat(ratio_beyond_r_sigma, r=r) for r in (0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10)],
    # stat(linear_trend_timewise, param=[
    #     {'attr': a}
    #     for a in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')
    # ])
]

In [59]:
funcs = tsfresh_default

In [60]:
features = [StatsFeatures(funcs=funcs)]

## Extending The Original Dataset

In [61]:
x_trn, x_tst = from_feather('x_trn', 'x_tst')

In [65]:
_, g = next(iter(x_trn.groupby('series_id')))
len(generate_features_for_group(g, features))

  phi[1, 1] = sxx_m[1] / sxx_m[0]
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return y.dot(np.arange(len(y))**moment) / y.sum()
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  return self.params / self.bse
  res_data.append(np.sum(np.array_split(x, num_segments)[segment_focus] ** 2.0)/full_series_energy)
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


7164

In [66]:
print('Feature extraction on train dataset')
x_trn_rich = (
    replace_not_numbers(
        generate_features(
            data=x_trn,
            features=features, 
            ignore=ID_COLS)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))




In [67]:
print('Feature extraction on train dataset')
x_tst_rich = (
    replace_not_numbers(
        generate_features(
            data=x_tst,
            features=features, 
            ignore=ID_COLS)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))




In [68]:
# to_feather(x_trn_rich, 'trn_rich')
to_feather(x_trn_rich, 'trn_all')

PosixPath('/home/ck/data/careercon2019/tmp/trn_all.feather')

In [69]:
# to_feather(x_tst_rich, 'tst_rich')
to_feather(x_tst_rich, 'tst_all')

PosixPath('/home/ck/data/careercon2019/tmp/tst_all.feather')


# Fitting The Model

In [70]:
# x_trn, x_tst, y_trn = from_feather('trn_rich', 'tst_rich', 'y_trn')
x_trn, x_tst, y_trn = from_feather('trn_all', 'tst_all', 'y_trn')
enc = LabelEncoder()
y_trn = pd.Series(enc.fit_transform(y_trn['surface']))

In [71]:
relevance = calculate_relevance_table(x_trn, y_trn, ml_task='classification')

In [72]:
rel_cols = relevance[relevance['relevant']].index.tolist()

In [73]:
x_trn_rel = x_trn[rel_cols]
x_tst_rel = x_tst[rel_cols]

In [74]:
X_train, X_valid, y_train, y_valid = train_test_split(
    x_trn_rel, y_trn, test_size=0.1, random_state=seed)

In [42]:
n_iter = 100

base = dict(num_iterations=100, seed=seed, objective='mutliclass')

sampler = ParameterSampler({
    'boosting': ['gbdt', 'dart'],
    'colsamples_bytree': stats.uniform(0.3, 0.7),
    'learning_rate': stats.uniform(0.005, 0.3),
    'num_leaves': stats.randint(31, 500),
    'min_data_in_leaf': stats.randint(20, 50),
    'lambda_l1': stats.uniform(0.0, 0.001),
    'lambda_l2': stats.uniform(0.0, 0.001),
    'drop_rate': stats.uniform(0.05, 0.3)
}, n_iter=n_iter)

best_acc = 0
best_params = None
expo = 1

for i, params in enumerate(sampler):
    if i % expo == 0 or i == (n_iter - 1):
        print(f'Sample {i+1:d}/{n_iter:d}')
        expo *= 2
    model = lgb.LGBMClassifier(**base, **params)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_valid)
    acc = (y_hat == y_valid).mean()
    if acc > best_acc:
        print(f'\taccuracy improved: {acc:2.2%}')
        best_acc = acc
        best_params = params

Sample 1/100
	accuracy improved: 86.61%
Sample 3/100
Sample 5/100
	accuracy improved: 90.29%
Sample 9/100
Sample 17/100
Sample 33/100
Sample 65/100
Sample 100/100


In [44]:
best_params

{'boosting': 'gbdt',
 'colsamples_bytree': 0.8724378755450763,
 'drop_rate': 0.24689475810489686,
 'lambda_l1': 1.1280314202778308e-05,
 'lambda_l2': 0.0007353536939500786,
 'learning_rate': 0.18604839329048833,
 'min_data_in_leaf': 46,
 'num_leaves': 447}

In [75]:
# model = lgb.LGBMClassifier(
#     boosting='rf', bagging_freq=1, bagging_fraction=0.66,
#     n_estimators=1000, learning_rate=0.005,
#     colsample_bytree=0.3, objective='multiclass',
#     metric='None', num_leaves=200, num_class=9)

In [45]:
# model = lgb.LGBMClassifier(num_iterations=10000, seed=seed, metric='None', **best_params)

In [76]:
# model.fit(
#     X_train, y_train, 
#     eval_set=[(X_valid, y_valid)], 
#     eval_metric=accuracy,
#     early_stopping_rounds=1000,
#     verbose=100)

Training until validation scores don't improve for 1000 rounds.
[100]	valid_0's accuracy: 0.863517
[200]	valid_0's accuracy: 0.87664
[300]	valid_0's accuracy: 0.884514
[400]	valid_0's accuracy: 0.889764
[500]	valid_0's accuracy: 0.892388
[600]	valid_0's accuracy: 0.892388
[700]	valid_0's accuracy: 0.892388
[800]	valid_0's accuracy: 0.892388
[900]	valid_0's accuracy: 0.895013
[1000]	valid_0's accuracy: 0.895013
[1100]	valid_0's accuracy: 0.889764
[1200]	valid_0's accuracy: 0.889764
[1300]	valid_0's accuracy: 0.889764
[1400]	valid_0's accuracy: 0.887139
Early stopping, best iteration is:
[423]	valid_0's accuracy: 0.895013


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.05, max_depth=-1,
        metric='None', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=10000, n_jobs=-1, num_class=9,
        num_leaves=100, objective='multiclass', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [None]:
# n = None
# assert n is not None, 'Need to set number of iterations'

In [47]:
# model = lgb.LGBMClassifier(num_iterations=n, seed=seed, metric='None', **best_params)

In [51]:
# model.fit(x_trn_rel, y_trn)

LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, colsamples_bytree=0.8724378755450763,
        drop_rate=0.24689475810489686, importance_type='split',
        lambda_l1=1.1280314202778308e-05, lambda_l2=0.0007353536939500786,
        learning_rate=0.18604839329048833, max_depth=-1, metric='None',
        min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=46,
        min_split_gain=0.0, n_estimators=100, n_jobs=-1,
        num_iterations=400, num_leaves=447, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [83]:
model = lgb.LGBMClassifier(
    boosting='dart', bagging_freq=1, bagging_fraction=0.66,
    n_estimators=10000, learning_rate=0.005,
    colsample_bytree=0.3, objective='multiclass',
    metric='None', num_leaves=200, num_class=9)

In [84]:
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          eval_metric=accuracy,
          verbose=100)

[100]	valid_0's accuracy: 0.800525
[200]	valid_0's accuracy: 0.813648
[300]	valid_0's accuracy: 0.816273
[400]	valid_0's accuracy: 0.818898
[500]	valid_0's accuracy: 0.834646
[600]	valid_0's accuracy: 0.83727
[700]	valid_0's accuracy: 0.83727
[800]	valid_0's accuracy: 0.84252
[900]	valid_0's accuracy: 0.845144
[1000]	valid_0's accuracy: 0.847769
[1100]	valid_0's accuracy: 0.847769
[1200]	valid_0's accuracy: 0.853018
[1300]	valid_0's accuracy: 0.855643
[1400]	valid_0's accuracy: 0.858268
[1500]	valid_0's accuracy: 0.855643
[1600]	valid_0's accuracy: 0.858268
[1700]	valid_0's accuracy: 0.858268
[1800]	valid_0's accuracy: 0.858268
[1900]	valid_0's accuracy: 0.858268
[2000]	valid_0's accuracy: 0.858268
[2100]	valid_0's accuracy: 0.860892
[2200]	valid_0's accuracy: 0.860892
[2300]	valid_0's accuracy: 0.860892
[2400]	valid_0's accuracy: 0.868766
[2500]	valid_0's accuracy: 0.868766
[2600]	valid_0's accuracy: 0.863517
[2700]	valid_0's accuracy: 0.863517
[2800]	valid_0's accuracy: 0.863517
[290

LGBMClassifier(bagging_fraction=0.66, bagging_freq=1, boosting='dart',
        boosting_type='gbdt', class_weight=None, colsample_bytree=0.3,
        importance_type='split', learning_rate=0.005, max_depth=-1,
        metric='None', min_child_samples=20, min_child_weight=0.001,
        min_split_gain=0.0, n_estimators=10000, n_jobs=-1, num_class=9,
        num_leaves=200, objective='multiclass', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [85]:
test = model.predict(x_tst_rel)

In [86]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(test)
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "10000 trees"

100%|██████████████████████████████████████| 52.3k/52.3k [00:00<00:00, 45.9kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 