In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
def noop(*args, **kwargs): pass
warnings.warn = noop

In [3]:
from collections import ChainMap
from multiprocessing import cpu_count
import pickle

In [4]:
from IPython.display import display
import lightgbm as lgb
from lightgbm.engine import LightGBMError
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.externals.joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, ParameterSampler
from tsfresh import extract_features, extract_relevant_features
from tqdm import tqdm_notebook as tqdm

In [5]:
from basedir import SAMPLE
from utils import from_feather, to_feather, kfolds 

In [6]:
seed = 1
np.random.seed(seed)

In [7]:
x_trn, y_trn, x_tst = from_feather('x_trn', 'y_trn', 'x_tst')

In [8]:
from tsfresh.feature_extraction.feature_calculators import (
    mean, median, standard_deviation, variance, skewness, kurtosis,
    mean_abs_change, mean_change, mean_second_derivative_central, 
    quantile, autocorrelation, agg_autocorrelation, partial_autocorrelation,
    abs_energy, count_above_mean, count_below_mean, maximum, minimum,
    first_location_of_minimum, first_location_of_maximum, linear_trend,
    sample_entropy, c3, 
    longest_strike_below_mean, longest_strike_above_mean, 
    number_peaks, sum_of_reoccurring_data_points, sum_values,
    large_standard_deviation,
    number_crossing_m, value_count, range_count,
    # ratio_beyond_r_sigma, index_mass_quantile,
    # symmetry_looking
)
from tsfresh.feature_selection.relevance import calculate_relevance_table

In [9]:
def stat(f, **params):
    def wrapper(x):
        return f(x, **params)
    wrapper.__name__ = f.__name__
    return wrapper

In [10]:
def get_series(data, ser_id, *ser_ids):
    ids = [ser_id] + list(ser_ids)
    return data[data.series_id.isin(ids)].copy()

In [11]:
default_stats = (
    mean, median, standard_deviation, variance, skewness, kurtosis, maximum, minimum,
    mean_change, mean_abs_change, count_above_mean, count_below_mean,
    mean_second_derivative_central, sum_of_reoccurring_data_points, 
    abs_energy, sum_values, sample_entropy,
    longest_strike_above_mean, longest_strike_below_mean,
    first_location_of_minimum, first_location_of_maximum,
    *[stat(large_standard_deviation, r=r*0.05) for r in range(1, 20)],
    *[stat(autocorrelation, lag=lag) for lag in range(1, 25)], 
    *[stat(number_peaks, n=n) for n in (1, 2, 3, 5, 7, 10, 25, 50)],
    *[stat(c3, lag=lag) for lag in range(1, 5)],
    *[stat(quantile, q=q) for q in (.1, .2, .3, .4, .5, .6, .7, .8, .9)],
    stat(partial_autocorrelation, param=[{'lag': lag} for lag in range(25)]),
    stat(agg_autocorrelation, param=[{'f_agg': s, 'maxlag': 40} for s in ('mean', 'median', 'var')]),
    stat(linear_trend, param=[
        {'attr': a} for a in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')]),
    # *[stat(number_crossing_m, m=m) for m in (-1, 0, 1)],
    # *[stat(value_count, value=v) for v in (-1, 0, 1)],
    # *[stat(range_count, min=lo, max=hi) for lo, hi in ((-1, 1), (1e12, 0), (0, 1e12))],
    # *[stat(ratio_beyond_r_sigma, r=r) for r in (0.5, 1, 1.5, 2, 2.5, 3, 5, 6, 7, 10)],
    # stat(index_mass_quantile, param=[{'q': q} for q in (.1, .2, .3, .4, .5, .6, .7, .8, .9)]),
    # stat(symmetry_looking, param=[{'r': r*0.05} for r in range(20)])
)

In [12]:
# def add_quaternion_norm(X):
#     X = X.copy()
#     cols = ['orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W']
#     X['quat_norm'] = np.sum([X[col]**2 for col in cols], axis=0)
#     X['quat_mod'] = np.sqrt(X['quat_norm'])
#     for col in cols:
#         axis = col.split('_')[-1]
#         X[f'norm_{axis}'] = X[col] / X['quat_mod']
#     return X

In [13]:
def add_euler_angles(X):
    X = X.copy()
    # x, y, z, w = [X[f'norm_{s}'] for s in list('XYZW')]
    x, y, z, w = [X[f'orientation_{s}'] for s in list('XYZW')]
    nx, ny, nz = quaternion_to_euler(x, y, z, w)
    X['euler_X'] = nx
    X['euler_Y'] = ny
    X['euler_Z'] = nz
    return X


def quaternion_to_euler(x, y, z, w):
    t0 = 2.0*(w*x + y*z)
    t1 = 1.0 - 2.0*(x*x + y*y)
    X = np.arctan2(t0, t1)
    
    t2 = np.clip(2.0*(w*y - z*x), -1, 1)
    Y = np.arcsin(t2)
    
    t3 = 2.0*(w*z + x*y)
    t4 = 1.0 - 2.0*(y*y + z*z)
    Z = np.arctan2(t3, t4)
    
    return X, Y, Z

In [14]:
# x_trn = add_euler_angles(add_quaternion_norm(x_trn)).drop(columns=[
#     'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W'])

In [15]:
# x_tst = add_euler_angles(add_quaternion_norm(x_tst)).drop(columns=[
#     'orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W'])

In [16]:
class StatsFeatures:
    def __init__(self, funcs=default_stats):
        self.funcs = funcs
    
    def __call__(self, data):
        features = {}
        for col in data.columns:
            for func in self.funcs:
                result = func(data[col].values) 
                if hasattr(result, '__len__'):
                    for key, value in result:
                        features[f'{col}__{func.__name__}__{key}'] = value
                else:
                    features[f'{col}__{func.__name__}'] = result
        features = {
            k: int(v) if v in (True, False) else v 
            for k, v in features.items()}
        return features

In [17]:
class SliceFeatures:
    def __init__(self, mode='first', n=5):
        if mode not in {'first', 'middle', 'last'}:
            raise ValueError('unexpected mode')
        self.mode = mode
        self.n = n
    
    def __call__(self, data):
        if self.mode == 'first':
            start, end = 0, self.n
        elif self.mode == 'last':
            start, end = -self.n, len(data)
        elif self.mode == 'middle':
            mid = len(data) // 2
            div, mod = divmod(self.n, 2)
            start, end = mid-div, mid+div+mod
        cols = data.columns
        vec = data.iloc[start:end].values.T.ravel()
        new_cols = [f'{col}_{self.mode}{i}' for i in range(self.n) for col in cols]
        return dict(zip(new_cols, vec))

In [18]:
_, group = next(iter(x_trn.groupby('series_id')))
group = group.drop(columns=['series_id', 'measurement_number'])

In [19]:
features = [
    StatsFeatures(),
    SliceFeatures('first'),
    SliceFeatures('middle'),
    SliceFeatures('last')
]

In [20]:
def generate_features(data, features, ignore=None):
    with Parallel(n_jobs=cpu_count()) as parallel:
        extracted = parallel(delayed(generate_features_for_group)(
            group=group.drop(columns=ignore or []),
            features=features
        ) for _, group in tqdm(data.groupby('series_id')))
    return pd.DataFrame(extracted)

In [21]:
def generate_features_for_group(group, features):
    return dict(ChainMap(*[feat(group) for feat in features]))

In [22]:
def replace_not_numbers(data, const=0):
    return data.fillna(const).replace(-np.inf, const).replace(+np.inf, const)

In [23]:
# generate_features_for_group(group, features)

In [24]:
ignore = ['series_id', 'measurement_number']

In [25]:
print('Feature extraction on train dataset')
x_trn_rich = (
    replace_not_numbers(
        generate_features(
            data=add_euler_angles(x_trn), 
            features=features, 
            ignore=ignore)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3810), HTML(value='')))

KeyboardInterrupt: 

In [38]:
print('Feature extraction on train dataset')
x_tst_rich = (
    replace_not_numbers(
        generate_features(
            data=add_euler_angles(x_tst), 
            features=features,
            ignore=ignore)))

Feature extraction on train dataset


HBox(children=(IntProgress(value=0, max=3816), HTML(value='')))




In [39]:
to_feather(x_trn_rich, 'trn_rich')

PosixPath('/home/ck/data/careercon2019/tmp/trn_rich.feather')

In [40]:
to_feather(x_tst_rich, 'tst_rich')

PosixPath('/home/ck/data/careercon2019/tmp/tst_rich.feather')

In [41]:
x_trn_rich, x_tst_rich, y_trn = from_feather('trn_rich', 'tst_rich', 'y_trn')
enc = LabelEncoder()
y = enc.fit_transform(y_trn['surface'])

In [13]:
# relevance = calculate_relevance_table(x_trn_rich, pd.Series(y), ml_task='classification')

In [14]:
# rel_cols = relevance[relevance['relevant']].index.tolist()

In [15]:
# x_trn_rich = x_trn_rich[rel_cols]

In [16]:
# x_tst_rich = x_tst_rich[rel_cols]

In [23]:
# X_train, X_valid, y_train, y_valid = train_test_split(
#     x_trn_rich, y, test_size=0.1, random_state=seed)

In [42]:
def accuracy(y_true, y_pred):
    n = len(y_true)
    y_hat = y_pred.reshape(9, n).argmax(axis=0)
    value = (y_true == y_hat).mean()
    return 'accuracy', value, True

In [43]:
n_splits = 5

test_results = np.zeros((len(x_tst_rich), 9), dtype=np.float32)

for i, x_trn, x_val, y_trn, y_val in kfolds(x_trn_rich, pd.Series(y), n_splits):
    print(f'Running fold {i+1:d}/{n_splits:d}')
    model = lgb.LGBMClassifier(
        num_class=9, metric='None', n_estimators=3000, 
        learning_rate=0.005, colsample_bytree=0.4, 
        objective='multiclass', num_leaves=500, 
        random_state=seed)
    model.fit(x_trn, y_trn,
              eval_set=[(x_val, y_val)],
              eval_metric=accuracy,
              early_stopping_rounds=300,
              verbose=150)
    test_results += model.predict_proba(x_tst_rich)
    
test_results /= n_splits

Running fold 1/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.842037
[300]	valid_0's accuracy: 0.861619
[450]	valid_0's accuracy: 0.874674
[600]	valid_0's accuracy: 0.882507
[750]	valid_0's accuracy: 0.890339
[900]	valid_0's accuracy: 0.89295
[1050]	valid_0's accuracy: 0.896867
[1200]	valid_0's accuracy: 0.900783
[1350]	valid_0's accuracy: 0.902089
[1500]	valid_0's accuracy: 0.9047
[1650]	valid_0's accuracy: 0.9047
Early stopping, best iteration is:
[1371]	valid_0's accuracy: 0.9047
Running fold 2/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.881046
[300]	valid_0's accuracy: 0.89281
[450]	valid_0's accuracy: 0.894118
[600]	valid_0's accuracy: 0.899346
[750]	valid_0's accuracy: 0.908497
[900]	valid_0's accuracy: 0.909804
[1050]	valid_0's accuracy: 0.909804
[1200]	valid_0's accuracy: 0.911111
[1350]	valid_0's accuracy: 0.915033
[1500]	valid_0's accuracy: 0.918954
[1650]	valid_0's accuracy: 0.9189

In [48]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(np.argmax(test_results, axis=1))
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM features with k-fold"

100%|██████████████████████████████████████| 53.0k/53.0k [00:00<00:00, 48.7kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 

In [66]:
n_iter = 50

base = dict(num_iterations=100, seed=seed)

sampler = ParameterSampler({
    'objective': ['multiclass', 'ova'],
    'method': [
        ('gbdt', [0.5, 0.9]),
        ('dart', [0.5, 0.9]),
        ('rf', ([0.5, 0.7, 0.9], [0.5, 0.9]))],
    'learning_rate': stats.truncnorm(0.01, 0.3),
    'num_leaves': stats.randint(31, 500),
    'min_data_in_leaf': stats.randint(20, 50),
    'lambda_l1': stats.truncnorm(0.0, 0.001),
    'lambda_l2': stats.truncnorm(0.0, 0.001),
    'drop_rate': stats.uniform(0.05, 0.3)
}, n_iter=n_iter)

best_acc = 0
best_params = None
expo = 1

for i, params in enumerate(sampler):
    if i % expo == 0 or i == (n_iter - 1):
        print(f'Sample {i+1:d}/{n_iter:d}')
        expo *= 2
        
    method, special_params = params.pop('method')
    if method == 'rf':
        params['bagging_fraction'] = np.random.choice(special_params[0])
        params['feature_fraction'] = np.random.uniform(*special_params[1])
        params['bagging_freq'] = 1
    else:
        params['feature_fraction'] = np.random.uniform(*special_params)
    params['boosting'] = method

    model = lgb.LGBMClassifier(**base, **params)
    model.fit(X_train, y_train)
    y_hat = model.predict(X_valid)
    acc = (y_hat == y_valid).mean()
    if acc > best_acc:
        print(f'\taccuracy improved: {acc:2.2%}')
        best_acc = acc
        best_params = params

Sample 1/50
	accuracy improved: 83.20%
	accuracy improved: 83.73%
Sample 3/50
	accuracy improved: 85.04%
Sample 5/50
	accuracy improved: 91.34%
Sample 9/50
	accuracy improved: 91.86%
Sample 17/50
	accuracy improved: 92.39%
Sample 33/50
Sample 50/50


In [67]:
with open('best_params.pickle', 'wb') as file:
    pickle.dump(best_params, file, protocol=pickle.HIGHEST_PROTOCOL)

In [80]:
with open('best_params.pickle', 'rb') as file:
    best_params = pickle.load(file)

In [81]:
best_params

{'drop_rate': 0.0654425698035373,
 'lambda_l1': 0.0009196951765450926,
 'lambda_l2': 1.1288862652355627e-05,
 'learning_rate': 0.26082614216073113,
 'min_data_in_leaf': 23,
 'num_leaves': 202,
 'objective': 'ova',
 'feature_fraction': 0.6699632785552281,
 'boosting': 'dart'}

In [86]:
n_splits = 5
base['num_iterations'] = 30000
test_results = np.zeros((len(x_tst_rich), 9), dtype=np.float32)

for i, x_trn, x_val, y_trn, y_val in kfolds(x_trn_rich, pd.Series(y), n_splits):
    print(f'Running fold {i+1:d}/{n_splits:d}')
    model = lgb.LGBMClassifier(num_class=9, metric='None', **base, **best_params)
    model.fit(x_trn, y_trn,
              eval_set=[(x_val, y_val)],
              eval_metric=accuracy,
              early_stopping_rounds=300,
              verbose=150)
    test_results += model.predict_proba(x_tst_rich)
    
test_results /= n_splits

Running fold 1/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.912533
[300]	valid_0's accuracy: 0.920366
[450]	valid_0's accuracy: 0.922977
[600]	valid_0's accuracy: 0.925587
[750]	valid_0's accuracy: 0.924282
Early stopping, best iteration is:
[576]	valid_0's accuracy: 0.926893
Running fold 2/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.926797
[300]	valid_0's accuracy: 0.928105
Early stopping, best iteration is:
[74]	valid_0's accuracy: 0.932026
Running fold 3/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.913386
[300]	valid_0's accuracy: 0.922572
[450]	valid_0's accuracy: 0.923885
[600]	valid_0's accuracy: 0.922572
[750]	valid_0's accuracy: 0.925197
Early stopping, best iteration is:
[478]	valid_0's accuracy: 0.927822
Running fold 4/5
Training until validation scores don't improve for 300 rounds.
[150]	valid_0's accuracy: 0.921053
[300]	valid_0's 

In [None]:
# model = lgb.LGBMClassifier(
#     n_estimators=10000, learning_rate=0.1,
#     colsample_bytree=0.4, objective='multiclass',
#     num_leaves=500, num_class=9)

In [None]:
# model.fit(
#     X_train, y_train, 
#     eval_set=[(X_valid, y_valid)], 
#     eval_metric=accuracy,
#     early_stopping_rounds=250,
#     verbose=100)

In [None]:
# imp = model.feature_importances_
# idx = np.argsort(imp)[:100]

In [None]:
# f, ax = plt.subplots(1, 1, figsize=(8, 20))
# ax.barh(X_train.columns[idx], imp[idx])
# ax.set_title('Feature Importance');

In [93]:
submit = pd.read_csv(SAMPLE)
submit['surface'] = enc.inverse_transform(np.argmax(test_results, axis=1))
submit.to_csv('submit.csv', index=None)
!kaggle c submit career-con-2019 -f 'submit.csv' -m "LightGBM + relevant features + kfold"

100%|██████████████████████████████████████| 52.7k/52.7k [00:00<00:00, 48.9kB/s]
Successfully submitted to CareerCon 2019 - Help Navigate Robots 