In [1]:
import numpy as np
import pandas as pd
import os
import gc

from custom_fn import *

%matplotlib inline
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import tsfresh

  from pandas.core import datetools


In [2]:
submission, X_train, X_test, y_train = load_files()

In [3]:
features_cols = [c for c in X_train.columns if X_train[c].dtype == np.float64]

In [4]:
group_col = 'series_id'

X_train_3d, features, X_train_feats = preprocess_data(X_train, group_col, cum=True)
X_test_3d, _, X_test_feats = preprocess_data(X_test, group_col, cum=True)

## Feature calculation

### TSfresh

In [5]:
fc_parameters = {
    'abs_energy': None,
    'absolute_sum_of_changes': None,
    'agg_autocorrelation': [{'f_agg': 'mean', 'maxlag': 10}, 
                            {'f_agg': 'std', 'maxlag': 10}],
    'augmented_dickey_fuller': [{'attr': 'teststat'}],
    'c3': [{'lag': 1}, {'lag': 2}],
    'cid_ce': [{'normalize': True}],
    'count_above_mean': None,
    'count_below_mean': None,
    'fft_aggregated': [{'aggtype': 'centroid'},
                       {'aggtype': 'variance'},
                       {'aggtype': 'skew'},
                       {'aggtype': 'kurtosis'}],
    'mean': None,
    'median': None,
    'skewness': None,
    'kurtosis': None,
    'standard_deviation': None,
    'variance': None,
    'sample_entropy': None,
    'number_peaks': [{'n': 3}],
    'partial_autocorrelation': [{'lag': 10}],
    'mean_change': None,
    'mean_abs_change': None,
    'mean_second_derivative_central': None
}

In [6]:
import copy

In [7]:
X_train_feats.columns

Index(['row_id', 'series_id', 'measurement_number', 'orientation_X',
       'orientation_Y', 'orientation_Z', 'orientation_W', 'angular_velocity_X',
       'angular_velocity_Y', 'angular_velocity_Z', 'linear_acceleration_X',
       'linear_acceleration_Y', 'linear_acceleration_Z', 'euler_X', 'euler_Y',
       'euler_Z', 'angular_velocity_pow2_X', 'angular_velocity_pow2_Y',
       'angular_velocity_pow2_Z', 'total_angular_velocity',
       'linear_acceleration_pow2_X', 'linear_acceleration_pow2_Y',
       'linear_acceleration_pow2_Z', 'total_linear_acc'],
      dtype='object')

In [8]:
feats_ids = copy.deepcopy(features)
feats_ids.append('series_id')
feats_ids.append('measurement_number')

X_tsfresh = tsfresh.feature_extraction.extract_features(
    X_train_feats[feats_ids],
    column_id='series_id',
    column_sort='measurement_number',
    column_kind=None,
    column_value=None,
    n_jobs=6,
    default_fc_parameters=fc_parameters
)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [09:07<00:00,  6.81s/it]


## Cross validation and feature selection

In [10]:
from sklearn.preprocessing import Imputer

In [50]:
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate, RandomizedSearchCV

In [12]:
le = LabelEncoder()
y_enc = le.fit_transform(y_train['surface'])

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
import warnings
from scipy.stats import randint as sp_randint

warnings.filterwarnings('ignore')

In [58]:
estimator = Pipeline([
    ('imputer', Imputer()),
    ('selector', SelectFromModel(Lasso(alpha=0.7), threshold='median')),
    ('clf', RandomForestClassifier(random_state=11, n_estimators=300, criterion='entropy'))
])

In [56]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)
cv = folds.split(X_tsfresh, y_train['group_id'])

res = cross_validate(
    estimator, 
    X_tsfresh, 
    y_enc, 
    scoring='accuracy', 
    cv=cv, 
    verbose=3,
    n_jobs=5)

print('Mean CV: {:.2f}'.format(res['test_score'].mean()))
print('Train scores {}'.format(res['train_score']))

[Parallel(n_jobs=5)]: Done   5 out of  10 | elapsed:   23.1s remaining:   23.1s
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:   40.6s finished


Mean CV: 0.80
Train scores [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [64]:
param_dist = {"clf__max_depth": sp_randint(2, 4),
              "clf__max_features": sp_randint(1, 11),
              "clf__min_samples_split": sp_randint(2, 11),
              "clf__bootstrap": [True, False],
              "clf__criterion": ["gini", "entropy"],
              "selector__threshold": ['median', 'mean']}

n_iter_search = 20

In [65]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)
cv = folds.split(X_tsfresh, y_train['group_id'])

random_search = RandomizedSearchCV(
    estimator, 
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring='accuracy',
    cv=cv,
    verbose=3,
    n_jobs=5)

random_search.fit(X_tsfresh, y_enc)

print('Best mean CV: {:.2f}'.format(random_search.best_score_))

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=5)]: Done  22 tasks      | elapsed:   36.9s
[Parallel(n_jobs=5)]: Done 118 tasks      | elapsed:  4.7min
[Parallel(n_jobs=5)]: Done 200 out of 200 | elapsed:  7.7min finished


Best mean CV: 0.81


In [66]:
random_search.best_estimator_.predict(X_te)

{'clf__bootstrap': False,
 'clf__criterion': 'entropy',
 'clf__max_depth': 8,
 'clf__max_features': 9,
 'clf__min_samples_split': 6,
 'selector__threshold': 'median'}

In [67]:
X_test_tsfresh = tsfresh.feature_extraction.extract_features(
    X_test_feats[feats_ids],
    column_id='series_id',
    column_sort='measurement_number',
    column_kind=None,
    column_value=None,
    n_jobs=6,
    default_fc_parameters=fc_parameters
)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 30/30 [08:26<00:00,  5.78s/it]


In [68]:
y_preds = random_search.best_estimator_.predict(X_test_tsfresh)

In [71]:
submission.drop(['target'], axis=1, inplace=True)

In [72]:
submission['surface'] = le.inverse_transform(y_preds)
submission.to_csv('submission_rf_selector_CV081.csv', index=False)

## Light GBM

In [74]:
import lightgbm

In [81]:
lgbm = lightgbm.LGBMClassifier(
    learning_rate=0.1, 
    n_estimators=300,
    reg_lambda=0.2,
    random_state=11,
    n_jobs=1)

pipegbm = Pipeline([
    ('imputer', Imputer()),
    ('clf', lgbm)
])

In [83]:
param_lgbm = {"clf__learning_rate": [0.1],
              "clf__n_estimators": [300],
              "clf__reg_lambda": [0.0, 0.2]}

In [84]:
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)
cv = folds.split(X_tsfresh, y_train['group_id'])

lgbm_gs = GridSearchCV(
    estimator=pipegbm,
    param_grid=param_lgbm,
    cv=cv,
    scoring='accuracy',
    verbose=3,
    n_jobs=5)

lgbm_gs.fit(X_tsfresh, y_enc)

print('Best mean CV: {:.2f}'.format(lgbm_gs.best_score_))

Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=5)]: Done  18 out of  20 | elapsed: 18.0min remaining:  2.0min
[Parallel(n_jobs=5)]: Done  20 out of  20 | elapsed: 18.0min finished


Best mean CV: 0.92


In [85]:
y_preds_lgbm = lgbm_gs.best_estimator_.predict(X_test_tsfresh)
submission['surface'] = le.inverse_transform(y_preds_lgbm)
submission.to_csv('submission_lgbm_selector_CV092.csv', index=False)

In [87]:
lgbm_gs.grid_scores_

[mean: 0.92467, std: 0.01234, params: {'clf__learning_rate': 0.1, 'clf__n_estimators': 300, 'clf__reg_lambda': 0.0},
 mean: 0.92021, std: 0.01364, params: {'clf__learning_rate': 0.1, 'clf__n_estimators': 300, 'clf__reg_lambda': 0.2}]

In [88]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = Pipeline([('imp', Imputer()), ('dt', DecisionTreeClassifier())])

## Lightgbm 3-CV

In [100]:
folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)
cv = folds.split(X_tsfresh, y_train['group_id'])

lgbm_gs_3cv = GridSearchCV(
    estimator=pipegbm,
    param_grid=param_lgbm,
    cv=cv,
    scoring='accuracy',
    verbose=3,
    n_jobs=5)

lgbm_gs_3cv.fit(X_tsfresh, y_enc)

print('Best mean CV: {:.2f}'.format(lgbm_gs_3cv.best_score_))

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=5)]: Done   3 out of   6 | elapsed:  3.5min remaining:  3.5min
[Parallel(n_jobs=5)]: Done   6 out of   6 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   6 out of   6 | elapsed:  4.8min finished


Best mean CV: 0.90


In [101]:
y_preds_lgbm_3cv = lgbm_gs_3cv.best_estimator_.predict(X_test_tsfresh)
submission['surface'] = le.inverse_transform(y_preds_lgbm_3cv)
submission.to_csv('submission_lgbm_selector_3-CV090.csv', index=False)