In [1]:
# (Re)load the autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported modules are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs):
    """Accept any positional/keyword arguments, do nothing, return None."""
    return None


# Swap the module attribute ``warnings.warn`` for the no-op: any call that
# looks up ``warnings.warn`` from here on is silently discarded.
warnings.warn = noop

In [3]:
from collections import ChainMap
from multiprocessing import cpu_count

In [4]:
import numpy as np
import pandas as pd
from sklearn.externals.joblib import Parallel, delayed
from tsfresh import extract_features, extract_relevant_features
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

In [5]:
from utils import from_feather, to_feather

In [6]:
# Seed NumPy's global random generator so any np.random draws are repeatable.
seed = 1
np.random.seed(seed)

In [7]:
# Load the three input frames by name through the `from_feather` helper.
# NOTE(review): presumably train features, train targets and test features —
# confirm against `utils.from_feather`.
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [8]:
from tsfresh.feature_extraction.feature_calculators import (
    mean, median, standard_deviation, variance, skewness, kurtosis, autocorrelation,
    abs_energy, count_above_mean, count_below_mean, maximum, minimum,
    first_location_of_minimum, first_location_of_maximum, linear_trend,
    c3, longest_strike_below_mean, longest_strike_above_mean, number_peaks
)

In [9]:
def peaks(n):
    """Build a one-argument wrapper around ``number_peaks`` with ``n`` bound.

    The wrapper's ``__name__`` is set to ``peaks_<n>`` so that code which
    derives labels from ``__name__`` gets a distinct name per ``n``.
    """
    def count_peaks(x):
        return number_peaks(x, n)

    count_peaks.__name__ = f'peaks_{n}'
    return count_peaks

In [10]:
def autocorr(lag):
    """Build a one-argument wrapper around ``autocorrelation`` with ``lag`` bound.

    The wrapper's ``__name__`` is set to ``autocorr_<lag>`` so that code which
    derives labels from ``__name__`` gets a distinct name per lag.
    """
    def lagged_autocorrelation(x):
        return autocorrelation(x, lag)

    lagged_autocorrelation.__name__ = f'autocorr_{lag}'
    return lagged_autocorrelation

In [11]:
# Default calculators used by StatsFeatures: the fixed tsfresh calculators
# first, then autocorrelation wrappers for lags 1..5, then peak-count wrappers
# for each listed `n`.
default_stats = (
    mean,
    median,
    standard_deviation,
    variance,
    skewness,
    kurtosis,
    maximum,
    minimum,
    longest_strike_above_mean,
    longest_strike_below_mean,
    first_location_of_minimum,
    first_location_of_maximum,
    abs_energy,
) + tuple(map(autocorr, range(1, 6))) + tuple(map(peaks, (1, 2, 3, 5, 7, 10, 15)))

In [12]:
class StatsFeatures:
    """Callable that reduces every column of a frame to scalar statistics.

    Each function in ``funcs`` is called on the raw ``.values`` of each column;
    the result is stored under the key ``<column>_<function __name__>``.
    """

    def __init__(self, funcs=default_stats):
        # Iterable of one-argument callables; their __name__ becomes the key suffix.
        self.funcs = funcs

    def __call__(self, data):
        """Return a flat ``{feature_name: value}`` dict for the frame ``data``."""
        stats = {}
        for col in data.columns:
            for func in self.funcs:
                stats[f'{col}_{func.__name__}'] = func(data[col].values)
        return stats

In [13]:
class SliceFeatures:
    """Callable that exposes ``n`` consecutive rows of a frame as flat features.

    ``mode`` selects where the rows are taken from:

    * ``'first'``  - the first ``n`` rows,
    * ``'middle'`` - ``n`` rows around the midpoint ``len(data) // 2``,
    * ``'last'``   - the last ``n`` rows.

    The value of column ``col`` in the ``i``-th selected row is returned under
    the key ``<col>_<mode><i>``.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """

    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n

    def __call__(self, data):
        """Return ``{'<col>_<mode><i>': value}`` for the selected rows of ``data``.

        If the frame holds fewer rows than requested, only the keys for the
        rows actually available are produced.
        """
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            # Clamp at 0: a negative start would wrap around and select rows
            # from the tail of a short frame instead of its middle.
            start, end = max(mid - div, 0), mid + div + mod
        cols = data.columns
        # Flatten row by row (no transpose) so the value order matches the
        # name order generated below: row index outermost, column innermost.
        vec = data.iloc[start:end].values.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

In [14]:
# Pull the first `series_id` group out of the training frame and drop the
# id / measurement-number columns from it.
# NOTE(review): `group` is not referenced again in the visible cells (the loop
# variable inside `generate_features` is a separate local) — this looks like a
# scratch sample for trying extractors by hand.
_, group = next(iter(x_trn.groupby('series_id')))
group = group.drop(columns=['series_id', 'measurement_number'])

In [15]:
# Extractors applied to every series: per-column summary statistics, plus the
# raw values of the first, middle and last rows (default n=5 rows each).
features = [
    StatsFeatures(),
    SliceFeatures('first'),
    SliceFeatures('middle'),
    SliceFeatures('last')
]

In [16]:
def generate_features(data, features, ignore=None):
    """Run every extractor on each ``series_id`` group and tabulate the results.

    Groups are handed to a joblib pool sized by ``cpu_count()``; a tqdm bar
    wraps the group iterator.

    :param data: frame containing a ``series_id`` column to group by.
    :param features: callables, each mapping a group frame to a dict of features.
    :param ignore: optional column names dropped from each group before extraction.
    :return: ``pd.DataFrame`` built from the list of per-group feature dicts.
    """
    dropped = ignore or []
    with Parallel(n_jobs=cpu_count()) as pool:
        jobs = (
            delayed(generate_features_for_group)(
                group=frame.drop(columns=dropped), features=features)
            for _, frame in tqdm(data.groupby('series_id'))
        )
        rows = pool(jobs)
    return pd.DataFrame(rows)

In [17]:
def generate_features_for_group(group, features):
    """Apply each extractor to ``group`` and merge the resulting dicts into one.

    The dicts are combined through a ``ChainMap``, so if two extractors emit
    the same key, the value from the extractor listed first is kept.
    """
    per_extractor = [extract(group) for extract in features]
    combined = ChainMap(*per_extractor)
    return dict(combined)

In [18]:
# Columns removed from each group before it is passed to the extractors.
ignore = ['series_id', 'measurement_number']

In [19]:
# Build the feature table for the training frame (one joblib task per series_id group).
print('Feature extraction on train dataset')
x_trn_rich = generate_features(x_trn, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))




In [20]:
# Build the feature table for the test frame with the same extractors.
# The progress message previously said "train" here although `x_tst` is processed.
print('Feature extraction on test dataset')
x_tst_rich = generate_features(x_tst, features, ignore=ignore)

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))




In [21]:
# Persist the training feature table via the `to_feather` helper.
# NOTE(review): the recorded output path ends in `.feather.feather`, so the
# helper appears to append the extension itself; passing 'trn_rich' would
# presumably avoid the doubled suffix — confirm against `utils` and any code
# that reads this file before renaming.
to_feather(x_trn_rich, 'trn_rich.feather')

PosixPath('/home/ck/data/careercon2019/tmp/trn_rich.feather.feather')

In [22]:
# Persist the test feature table via the `to_feather` helper.
# NOTE(review): the recorded output path ends in `.feather.feather`, so the
# helper appears to append the extension itself; passing 'tst_rich' would
# presumably avoid the doubled suffix — confirm against `utils` and any code
# that reads this file before renaming.
to_feather(x_tst_rich, 'tst_rich.feather')

PosixPath('/home/ck/data/careercon2019/tmp/tst_rich.feather.feather')