In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs): pass
warnings.warn = noop

In [3]:
from collections import ChainMap
from multiprocessing import cpu_count

In [4]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.externals.joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from tsfresh import extract_features, extract_relevant_features
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

In [5]:
from basedir import SAMPLE
from utils import from_feather, to_feather

In [6]:
seed = 1
np.random.seed(seed)

In [7]:
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [8]:
from tsfresh.feature_extraction.feature_calculators import (
    mean, median, standard_deviation, variance, skewness, kurtosis, autocorrelation,
    abs_energy, count_above_mean, count_below_mean, maximum, minimum,
    first_location_of_minimum, first_location_of_maximum, linear_trend,
    c3, longest_strike_below_mean, longest_strike_above_mean, number_peaks,
    mean_abs_change, mean_change, mean_second_derivative_central, 
    sum_of_reoccurring_data_points, sum_values
)

In [9]:
def stat(f, **params):
    def wrapper(x):
        return f(x, **params)
    wrapper.__name__ = f.__name__
    return wrapper

In [10]:
default_stats = (
    mean, median, standard_deviation, variance, skewness, kurtosis,
    maximum, minimum, longest_strike_above_mean, longest_strike_below_mean,
    first_location_of_minimum, first_location_of_maximum, abs_energy, 
    count_above_mean, count_below_mean, mean_abs_change, mean_change, 
    mean_second_derivative_central, sum_of_reoccurring_data_points, sum_values,
    *[stat(autocorrelation, lag=lag) for lag in range(1, 25)], 
    *[stat(number_peaks, n=n) for n in (1, 2, 3, 5, 7, 10, 15)],
    *[stat(c3, lag=lag) for lag in (1, 2, 3, 5, 7, 10, 15, 20, 25)]
)

In [11]:
class StatsFeatures:
    def __init__(self, funcs=default_stats):
        self.funcs = funcs
    
    def __call__(self, data):
        features = {}
        for col in data.columns:
            features.update({
                f'{col}_{func.__name__}': func(data[col].values) 
                for func in self.funcs})
        return features

In [12]:
class SliceFeatures:
    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n
    
    def __call__(self, data):
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            start, end = mid-div, mid+div+mod
        cols = data.columns
        vec = data.iloc[start:end].values.T.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

In [13]:
_, group = next(iter(x_trn.groupby('series_id')))
group = group.drop(columns=['series_id', 'measurement_number'])

In [14]:
features = [
    StatsFeatures(),
    SliceFeatures('first'),
    SliceFeatures('middle'),
    SliceFeatures('last')
]

In [15]:
def generate_features(data, features, ignore=None):
    with Parallel(n_jobs=cpu_count()) as parallel:
        extracted = parallel(delayed(generate_features_for_group)(
            group=group.drop(columns=ignore or []),
            features=features
        ) for _, group in tqdm(data.groupby('series_id')))
    return pd.DataFrame(extracted)

In [16]:
def generate_features_for_group(group, features):
    return dict(ChainMap(*[feat(group) for feat in features]))

In [17]:
ignore = ['series_id', 'measurement_number']

In [18]:
print('Feature extraction on train dataset')
x_trn_rich = generate_features(x_trn, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))




In [19]:
print('Feature extraction on train dataset')
x_tst_rich = generate_features(x_tst, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))




In [20]:
to_feather(x_trn_rich, 'trn_rich.feather')

PosixPath('/home/ck/data/careercon2019/tmp/trn_rich.feather.feather')

In [21]:
to_feather(x_tst_rich, 'tst_rich.feather')

PosixPath('/home/ck/data/careercon2019/tmp/tst_rich.feather.feather')

In [22]:
enc = LabelEncoder()
y = enc.fit_transform(y_trn['surface'])

In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(x_trn_rich, y, test_size=0.1)

In [24]:
def accuracy(y_true, y_pred):
    n = len(y_true)
    y_hat = y_pred.reshape(9, n).argmax(axis=0)
    value = (y_true == y_hat).mean()
    return 'accuracy', value, True

In [25]:
model = lgb.LGBMClassifier(
    n_estimators=1000, learning_rate=0.005,
    colsample_bytree=0.4, objective='multiclass',
    num_leaves=500, num_class=9)

In [26]:
model.fit(
    X_train, y_train, 
    eval_set=[(X_valid, y_valid)], 
    eval_metric=accuracy,
    early_stopping_rounds=300,
    verbose=150)

Training until validation scores don't improve for 300 rounds.
[150]	valid_0's multi_logloss: 1.20644	valid_0's accuracy: 0.855643
[300]	valid_0's multi_logloss: 0.840936	valid_0's accuracy: 0.863517
[450]	valid_0's multi_logloss: 0.651832	valid_0's accuracy: 0.866142
[600]	valid_0's multi_logloss: 0.54337	valid_0's accuracy: 0.871391
[750]	valid_0's multi_logloss: 0.477731	valid_0's accuracy: 0.874016
[900]	valid_0's multi_logloss: 0.437003	valid_0's accuracy: 0.879265
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.418861	valid_0's accuracy: 0.879265


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.4,
        importance_type='split', learning_rate=0.005, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=1000, n_jobs=-1, num_class=9, num_leaves=500,
        objective='multiclass', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [28]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(model.predict(x_tst_rich))
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM tsfresh (no early stopping)"

100%|██████████████████████████████████████| 52.7k/52.7k [00:00<00:00, 46.1kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 