In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs): pass
warnings.warn = noop

In [3]:
from collections import ChainMap
from multiprocessing import cpu_count

In [4]:
from IPython.display import display
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.externals.joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, StratifiedKFold
from tsfresh import extract_features, extract_relevant_features
from tqdm import tqdm_notebook as tqdm

In [24]:
from basedir import SAMPLE
from utils import from_feather, to_feather, kfolds

In [6]:
seed = 1
np.random.seed(seed)

In [7]:
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [8]:
from tsfresh.feature_extraction.feature_calculators import (
    mean, median, standard_deviation, variance, skewness, kurtosis,
    mean_abs_change, mean_change, mean_second_derivative_central, 
    quantile, autocorrelation, agg_autocorrelation, partial_autocorrelation,
    abs_energy, count_above_mean, count_below_mean, maximum, minimum,
    first_location_of_minimum, first_location_of_maximum, linear_trend,
    sample_entropy, c3, 
    longest_strike_below_mean, longest_strike_above_mean, 
    number_peaks, sum_of_reoccurring_data_points, sum_values,
    large_standard_deviation,
    number_crossing_m, value_count, range_count,
)

In [9]:
def stat(f, **params):
    def wrapper(x):
        return f(x, **params)
    wrapper.__name__ = f.__name__
    return wrapper

In [10]:
def get_series(data, ser_id, *ser_ids):
    ids = [ser_id] + list(ser_ids)
    return data[data.series_id.isin(ids)].copy()

In [11]:
default_stats = (
    mean, median, standard_deviation, variance, skewness, kurtosis, maximum, minimum,
    mean_change, mean_abs_change, count_above_mean, count_below_mean,
    mean_second_derivative_central, sum_of_reoccurring_data_points, 
    abs_energy, sum_values, sample_entropy,
    longest_strike_above_mean, longest_strike_below_mean,
    first_location_of_minimum, first_location_of_maximum,
    *[stat(large_standard_deviation, r=r*0.05) for r in range(1, 20)],
    *[stat(autocorrelation, lag=lag) for lag in range(1, 25)], 
    *[stat(number_peaks, n=n) for n in (1, 2, 3, 5, 7, 10, 25, 50)],
    *[stat(c3, lag=lag) for lag in range(1, 5)],
    *[stat(quantile, q=q) for q in (.1, .2, .3, .4, .5, .6, .7, .8, .9)],
    stat(partial_autocorrelation, param=[{'lag': lag} for lag in range(25)]),
    stat(agg_autocorrelation, param=[{'f_agg': s, 'maxlag': 40} for s in ('mean', 'median', 'var')]),
    stat(linear_trend, param=[
        {'attr': a} for a in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')])
)

In [12]:
class StatsFeatures:
    def __init__(self, funcs=default_stats):
        self.funcs = funcs
    
    def __call__(self, data):
        features = {}
        for col in data.columns:
            for func in self.funcs:
                result = func(data[col].values) 
                if hasattr(result, '__len__'):
                    for key, value in result:
                        features[f'{col}__{func.__name__}__{key}'] = value
                else:
                    features[f'{col}__{func.__name__}'] = result
        return features

In [13]:
class SliceFeatures:
    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n
    
    def __call__(self, data):
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            start, end = mid-div, mid+div+mod
        cols = data.columns
        vec = data.iloc[start:end].values.T.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

In [15]:
def generate_features(data, features, ignore=None):
    with Parallel(n_jobs=cpu_count()) as parallel:
        extracted = parallel(delayed(generate_features_for_group)(
            group=group.drop(columns=ignore or []),
            features=features
        ) for _, group in tqdm(data.groupby('series_id')))
    return pd.DataFrame(extracted)

In [16]:
def generate_features_for_group(group, features):
    return dict(ChainMap(*[feat(group) for feat in features]))

In [17]:
ignore = ['series_id', 'measurement_number']

In [None]:
features = [
    StatsFeatures(),
    SliceFeatures('first'),
    SliceFeatures('middle'),
    SliceFeatures('last')
]

In [18]:
print('Feature extraction on train dataset')
x_trn_rich = generate_features(x_trn, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))




In [19]:
print('Feature extraction on train dataset')
x_tst_rich = generate_features(x_tst, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))




In [20]:
enc = LabelEncoder()
y = enc.fit_transform(y_trn['surface'])

In [21]:
def accuracy(y_true, y_pred):
    n = len(y_true)
    y_hat = y_pred.reshape(9, n).argmax(axis=0)
    value = (y_true == y_hat).mean()
    return 'accuracy', value, True

In [28]:
k = 10
test = np.zeros((len(x_tst_rich), 9), dtype=np.float32)
for i, x_trn, x_val, y_trn, y_val in kfolds(x_trn_rich, pd.Series(y), k):
    print(f'Running K-fold #{i+1:d}')
    model = lgb.LGBMClassifier(
        n_estimators=3000, learning_rate=0.005,
        colsample_bytree=0.4, objective='multiclass',
        num_leaves=500, num_class=9)
    model.fit(x_trn, y_trn,
              eval_set=[(x_val, y_val)],
              eval_metric=accuracy,
              early_stopping_rounds=300,
              verbose=150)
    test += model.predict_proba(x_tst_rich)
test /= k

Running K-fold #1
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's multi_logloss: 1.20245	valid_0's accuracy: 0.849351
[300]	valid_0's multi_logloss: 0.830369	valid_0's accuracy: 0.864935
[450]	valid_0's multi_logloss: 0.636175	valid_0's accuracy: 0.880519
[600]	valid_0's multi_logloss: 0.521158	valid_0's accuracy: 0.883117
[750]	valid_0's multi_logloss: 0.451773	valid_0's accuracy: 0.885714
[900]	valid_0's multi_logloss: 0.406977	valid_0's accuracy: 0.893506
[1050]	valid_0's multi_logloss: 0.376655	valid_0's accuracy: 0.890909
Early stopping, best iteration is:
[897]	valid_0's multi_logloss: 0.407856	valid_0's accuracy: 0.893506
Running K-fold #2
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's multi_logloss: 1.15719	valid_0's accuracy: 0.895833
[300]	valid_0's multi_logloss: 0.775953	valid_0's accuracy: 0.895833
[450]	valid_0's multi_logloss: 0.579844	valid_0's accuracy: 0.898438
[600]	valid_0's multi_logloss: 0.468043	valid

In [41]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(model.predict(x_tst_rich))
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM tsfresh with more features"

100%|██████████████████████████████████████| 52.5k/52.5k [00:00<00:00, 46.6kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 