In [1]:
import gc
import math
import os
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [182]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes.sort()
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()

    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        feature_values_list = list(range(sample_size))
        
        half_window_size = window_size // 2
        
        sample_begin_indexes = sample_indexes[:half_window_size]

        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        
        min_begin_index = min(full_data_begin_indexes)
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        
        sample_end_indexes = sample_indexes[-half_window_size:]

        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        
        max_end_index = max(full_data_end_indexes)
        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index - half_window_size, max_end_index))
        )
        
        if in_window_begin_indexes:
            begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
'acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                feature_values_list[i] = value
        if in_window_end_indexes:
            end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                value = df.iloc[e_idx]['acoustic_data']
                value = value - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                feature_values_list[-i] = value
        slice_begin = len(begin_indexes_set)
        slice_end = sample_df_len - len(end_indexes_set)
        for i in range(sample_df_len)[slice_begin:slice_end]:
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()

        sample_df[feature_name] = feature_values_list
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [None]:
def train_models(
        earthquake_margin_indexes,
        complete_earthquakes_length,
        not_seen_data_df,
        params,
        sample_size=None,
        holdout_size=None,
    ):
    #start_time = time.time()
    #not_seen_data_df, _ = add_features(
    #    not_seen_data_df,
    #    sample_size=not_seen_data_df.shape[0],
    #    holdout_size=0
    #)
    #print("add features to not seen data: {} min".format(time.time() - start_time) / 60)
    
    models_training_start_time = time.time()
    for i in range(len(complete_earthquakes_length)):
        model_training_start_time = time.time()
        earthquake_df = pd.read_csv(
                '../input/train/train.csv',
                #nrows=100000000,
                names=['acoustic_data', 'time_to_failure'],
                dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
                skiprows=earthquake_margin_indexes[i],
                nrows=complete_earthquakes_length[i]
            )
        if not sample_size:
            sample_size = complete_earthquakes_length[i] // 100
        if not holdout_size:
            holdout_size = complete_earthquakes_length[i] // 500
        earthquake_add_features_df, holdout_add_features_df = add_features(
                earthquake_df,
                sample_size=sample_size,
                holdout_size=holdout_size
            )
        X_all = earthquake_add_features_df[earthquake_add_features_df.columns.drop('time_to_failure')]
        y_all = earthquake_add_features_df['time_to_failure']

        X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.2, random_state=0)

        model = lgb.LGBMRegressor(**params, n_estimators = 20000, n_jobs = 10, num_iterations=40000)
        model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                eval_metric='mae',
                verbose=1000,
                early_stopping_rounds=4000
            )
        X_holdout = holdout_df[holdout_df.columns.drop('time_to_failure')]
        y_holdout = holdout_df['time_to_failure']
        
        y_holdout_predict = model.predict(X_holdout)
        print("earthquake {} mae {}".format(i, mean_absolute_error(y_holdout, y_holdout_predict)))

        not_seen_data_predict = model.predict(not_seen_data_df)
        not_seen_data_predict_df = pd.DataFrame({'time_to_failure': not_seen_data_predict})
        not_seen_data_predict_df.to_csv('not_seend_data_earthquake_{}_model_predict.csv', index=False)

        model.save_model('earthquake_{}_model.txt'.format(i))
        print("model {} training and prediction time: {} min".format(i, (time.time() - model_training_start_time) / 60))
    print("all training and prediction time: {} min".format(time.time() - models_training_start_time))

    return

In [4]:
earthquake_margin_indexes =[
    5656573,
    50085877,
    104677355,
    138772452,
    187641819,
    218652629,
    245829584,
    307838916,
    338276286,
    375377847,
    419368879,
    461811622,
    495800224,
    528777114,
    585568143,
    621985672
]

In [5]:
earthquakes_length = [earthquake_margin_indexes[i + 1] - earthquake_margin_indexes[i] for i in range(len(earthquake_margin_indexes) - 1)]

In [6]:
earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029,
 36417529]

In [7]:
complete_earthquakes_length = earthquakes_length[:-1]

In [8]:
#complete_earthquaces_length = complete_earthquaces_length[:-1]

In [9]:
complete_earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029]

In [136]:
params = {
    #'num_leaves': 51,
    'num_leaves': 27,
    #'min_data_in_leaf': 10,
    'min_data_in_leaf': 8,
    'objective':'regression',
    #'max_depth': -1,
    'max_depth': 5,
    'learning_rate': 0.001,
    'boosting': 'gbdt',
    #'feature_fraction': 0.91,
    #'bagging_freq': 1,
    #'bagging_fraction': 0.91,
    #'bagging_seed': 42,
    'metric': 'mae',
    #'lambda_l1': 0.1,
    'verbosity': -1,
    'nthread': 10,
    'random_state': 42
}

In [None]:
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

In [None]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df,
    sample_size=not_seen_data_df.shape[0],
    holdout_size=0
)

In [151]:
%%time
train_models(earthquake_margin_indexes, complete_earthquakes_length, params, featured_not_seen_data_df)

KeyError: 5656569

In [155]:
%%time
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

CPU times: user 1.34 s, sys: 56 ms, total: 1.39 s
Wall time: 1.39 s


In [156]:
not_seen_data_df.shape

(5656572, 2)

In [181]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df,
    sample_size=not_seen_data_df.shape[0],
    holdout_size=0
)




window_size: 3

sample_df.index.tolist()[:window_size]:
 [0, 1, 2]
df.index.tolist()[:window_size]:
 [0, 1, 2]
sample_df.index.tolist()[-window_size:]:
 [5656569, 5656570, 5656571]
df.index.tolist()[-window_size:]:
 [5656569, 5656570, 5656571]
sample_begin_indexes:
 [0]
full_data_begin_indexes:
 {0}
in_window_begin_indexes:
 {0}
sample_end_indexes:
 [5656570]
full_data_end_indexes: {5656570}
in_window_end_indexes:
 set()
i: 0, b_idx 0:
type(b_idx): <class 'int'>
df[:window_size]:
    acoustic_data  time_to_failure
0           12.0           1.4691
1            6.0           1.4691
2            8.0           1.4691



window_size: 5

sample_df.index.tolist()[:window_size]:
 [0, 1, 2, 3, 4]
df.index.tolist()[:window_size]:
 [0, 1, 2, 3, 4]
sample_df.index.tolist()[-window_size:]:
 [5656567, 5656568, 5656569, 5656570, 5656571]
df.index.tolist()[-window_size:]:
 [5656567, 5656568, 5656569, 5656570, 5656571]
sample_begin_indexes:
 [0 0]
full_data_begin_indexes:
 {0}
in_window_begin_index

NameError: name 'sample_df_indexes_set' is not defined

In [None]:
begin_indexes:
 [1]
sample_begin_indexes:
 [1]
full_data_begin_indexes:
 [0]
in_window_begin_indexes:
 []
sample_end_indexes:
 [5656570]
full_data_end_indexes: [5656571]
in_window_end_indexes:
 []
2 begin_indexes? :
 [1]
2 end_indexes? :
 [5656570]
begin_indexes:
 [1 2]
sample_begin_indexes:
 [1 2]
full_data_begin_indexes:
 [0, 1]
in_window_begin_indexes:
 [1]
sample_end_indexes:
 [5656569 5656570]
full_data_end_indexes: [5656570, 5656571]
in_window_end_indexes:
 [5656570]
2 begin_indexes? :
 [1 2]
2 end_indexes? :
 [5656569 5656570]
i: 0, b_idx 1:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<timed exec> in <module>()

<ipython-input-159-69232aa59e5d> in add_features(df, first_index, last_index, sample_size, holdout_size, smootch_windows_size)
     71                 print("i: {}, b_idx {}:".format(i, b_idx))
     72                 #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
---> 73                 value = df[b_idx]['acoustic_data']
     74                 value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
     75                 feature_values_list[i] = value

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

In [None]:



window_size: 3

begin_indexes:
 [0]
sample_begin_indexes:
 [0]
full_data_begin_indexes:
 [0]
in_window_begin_indexes:
 [0]
sample_end_indexes:
 [5656570]
full_data_end_indexes: [5656571]
in_window_end_indexes:
 []
2 begin_indexes? :
 [0]
2 end_indexes? :
 [5656570]
i: 0, b_idx 0:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<timed exec> in <module>()

<ipython-input-161-35773f4b493b> in add_features(df, first_index, last_index, sample_size, holdout_size, smootch_windows_size)
     74                 print("i: {}, b_idx {}:".format(i, b_idx))
     75                 #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
---> 76                 value = df[b_idx]['acoustic_data']
     77                 value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
     78                 feature_values_list[i] = value

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0
