In [2]:
import gc
import multiprocessing
from multiprocessing.pool import ThreadPool
import math
import os
import pathlib
import pickle
import random
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import dask
import dask.multiprocessing


from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

In [3]:
%matplotlib inline

In [4]:
dask.config.set(scheduler='processes')
dask.set_options( pool=ThreadPool(10) )



<dask.config.set at 0x7f55afc984e0>

In [3]:
def add_features(
        df,
        input_first_index=None,
        input_last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    if input_first_index == None or input_last_index == None:
        input_first_index = df.index.min()
        input_last_index = df.index.max() + 1
        
    
    sample_indexes = random.sample(range(input_first_index, input_last_index), sample_size)
    sample_indexes.sort()
    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    acoustic_data_series = df['acoustic_data']
    full_data_indexes = tuple(acoustic_data_series.index.tolist())

    sample_df = df.iloc[sample_indexes]

    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)

    output_first_index = 0
    output_last_index = len(sample_df) - 1
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
   
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):

        feature_values_list = list(range(sample_size))

        half_window_size = window_size // 2

        sample_begin_indexes = sample_indexes[:half_window_size]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        min_full_data_index = min(full_data_indexes)
        
        in_window_full_data_begin_indexes = set(range(input_first_index, input_first_index + half_window_size))              
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            in_window_full_data_begin_indexes
        )
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        max_full_data_index = max(full_data_end_indexes) + 1
        
        in_window_full_data_end_indexes = set(range(input_last_index - half_window_size, input_last_index))        
        in_window_end_indexes = full_data_end_indexes.intersection(
            in_window_full_data_end_indexes
        )
        if in_window_begin_indexes:
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                value = sample_df.iloc[i]['acoustic_data']
                temp = acoustic_data_series.iloc[input_first_index:input_first_index + window_size].mean()
                value = value - temp
                feature_values_list[output_first_index + i] = value
                
        if in_window_end_indexes:
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                value = sample_df.iloc[output_last_index - i]['acoustic_data']
                temp = acoustic_data_series.iloc[input_last_index - window_size:].mean()
                value = value - temp
                feature_values_list[output_last_index - i] = value
                
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[
                sample_idx - half_window_size:sample_idx + half_window_size
            ].mean()
        sample_df[feature_name] = feature_values_list
        
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [5]:
def add_features(
#def add_features_parallel(
        df,
        input_first_index=None,
        input_last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    if input_first_index == None or input_last_index == None:
        input_first_index = df.index.min()
        input_last_index = df.index.max() + 1
        
    
    sample_indexes = random.sample(range(input_first_index, input_last_index), sample_size)
    sample_indexes.sort()
    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    acoustic_data_series = df['acoustic_data']
    full_data_indexes = tuple(acoustic_data_series.index.tolist())

    sample_df = df.iloc[sample_indexes]

    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)

    output_first_index = 0
    output_last_index = len(sample_df) - 1
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
   
    sample_df_len = sample_df.shape[0]
    @dask.delayed
    def create_features():
        for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):

            feature_values_list = list(range(sample_size))

            half_window_size = window_size // 2

            sample_begin_indexes = sample_indexes[:half_window_size]
            full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
            min_full_data_index = min(full_data_indexes)
        
            in_window_full_data_begin_indexes = set(range(input_first_index, input_first_index + half_window_size))              
            in_window_begin_indexes = full_data_begin_indexes.intersection(
                in_window_full_data_begin_indexes
            )
        
            sample_end_indexes = sample_indexes[-half_window_size:]
            full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
            max_full_data_index = max(full_data_end_indexes) + 1
        
            in_window_full_data_end_indexes = set(range(input_last_index - half_window_size, input_last_index))        
            in_window_end_indexes = full_data_end_indexes.intersection(
                in_window_full_data_end_indexes
            )
            if in_window_begin_indexes:
                begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
                for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                    value = sample_df.iloc[i]['acoustic_data']
                    temp = acoustic_data_series.iloc[input_first_index:input_first_index + window_size].mean()
                    value = value - temp
                    feature_values_list[output_first_index + i] = value
                
            if in_window_end_indexes:
                end_indexes_set = end_indexes_set.union(in_window_end_indexes)
                for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                    value = sample_df.iloc[output_last_index - i]['acoustic_data']
                    temp = acoustic_data_series.iloc[input_last_index - window_size:].mean()
                    value = value - temp
                    feature_values_list[output_last_index - i] = value
                
            first_regular_idx = len(begin_indexes_set)
            last_regular_idx = sample_df_len - len(end_indexes_set)
            for i in range(first_regular_idx, last_regular_idx):
                sample_idx = sample_indexes[i]
                feature_values_list[i] = acoustic_data_series.iloc[
                    sample_idx - half_window_size:sample_idx + half_window_size
                ].mean()
            sample_df[feature_name] = feature_values_list
        return sample_df
        
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [6]:
earthquake_margin_indexes =[
    5656573,
    50085877,
    104677355,
    138772452,
    187641819,
    218652629,
    245829584,
    307838916,
    338276286,
    375377847,
    419368879,
    461811622,
    495800224,
    528777114,
    585568143,
    621985672
]

In [7]:
earthquakes_length = [earthquake_margin_indexes[i + 1] - earthquake_margin_indexes[i] for i in range(len(earthquake_margin_indexes) - 1)]

In [6]:
earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029,
 36417529]

In [8]:
complete_earthquakes_length = earthquakes_length[:-1]

In [8]:
#complete_earthquaces_length = complete_earthquaces_length[:-1]

In [9]:
complete_earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029]

In [11]:
params = {
    #'num_leaves': 51,
    'num_leaves': 27,
    #'min_data_in_leaf': 10,
    'min_data_in_leaf': 8,
    'objective':'regression',
    #'max_depth': -1,
    'max_depth': 5,
    'learning_rate': 0.001,
    'boosting': 'gbdt',
    #'feature_fraction': 0.91,
    #'bagging_freq': 1,
    #'bagging_fraction': 0.91,
    #'bagging_seed': 42,
    'metric': 'mae',
    #'lambda_l1': 0.1,
    'verbosity': -1,
    'nthread': 10,
    'random_state': 42
}

In [12]:
def train_models(
        earthquake_margin_indexes,
        complete_earthquakes_length,
        params,
        featured_not_seen_data_df,
        sample_size=None,
        holdout_size=None,
        #not_seen_data_begin=1,
        #not_seen_data_end=5656572 #5656569
    ):
    
    '''
    not_seen_data_df = pd.read_csv(
        '../input/train/train.csv',
        #nrows=100000000,
        names=['acoustic_data', 'time_to_failure'],
        dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
        skiprows=not_seen_data_begin,
        nrows=not_seen_data_end
    )
    
    not_seen_data_df, _ = add_features(
        not_seen_data_df,
        sample_size=not_seen_data_df.shape[0],
        holdout_size=0
    )
    '''
    y_not_seen_data = featured_not_seen_data_df['time_to_failure']
    featured_not_seen_data_df = featured_not_seen_data_df[featured_not_seen_data_df.columns.drop('time_to_failure')]
    
    for i in range(len(complete_earthquakes_length)):
        print("i:", i)
        print("complete_earthquakes_length:", complete_earthquakes_length[i])
        print("earthquake_margin_indexes:", earthquake_margin_indexes[i])
        
        earthquake_df = pd.read_csv(
                '../input/train/train.csv',
                #nrows=100000000,
                names=['acoustic_data', 'time_to_failure'],
                dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
                skiprows=earthquake_margin_indexes[i],
                nrows=complete_earthquakes_length[i]
            )
        if not sample_size:
            sample_size = complete_earthquakes_length[i] // 10
        if not holdout_size:
            holdout_size = complete_earthquakes_length[i] // 50
        earthquake_add_features_df, holdout_add_features_df = add_features(
                earthquake_df,
                sample_size=sample_size,
                holdout_size=holdout_size
            )
        X_all = earthquake_add_features_df[earthquake_add_features_df.columns.drop('time_to_failure')]
        y_all = earthquake_add_features_df['time_to_failure']

        X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.2, random_state=0)

        model = lgb.LGBMRegressor(**params, n_estimators = 20000, n_jobs = 10, num_iterations=40000)
        model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                eval_metric='mae',
                verbose=1000,
                early_stopping_rounds=4000
            )
        X_holdout = holdout_add_features_df[holdout_add_features_df.columns.drop('time_to_failure')]
        y_holdout = holdout_add_features_df['time_to_failure']
        
        y_holdout_predict = model.predict(X_holdout)
        print("earthquake {} mae {}".format(i, mean_absolute_error(y_holdout, y_holdout_predict)))
        
        not_seen_data_predict = model.predict(featured_not_seen_data_df)
        print("not seen data mae: {}".format(mean_absolute_error(y_not_seen_data, not_seen_data_predict)))
        not_seen_data_predict_df = pd.DataFrame({'time_to_failure': not_seen_data_predict})
        not_seen_data_predict_df.to_csv('not_seend_data_earthquake_{}_model_predict.csv'.format(i), index=False)

        #model.save_model('earthquake_{}_model.txt'.format(i))
        #model.Booster(model_file='earthquake_{}_model.txt'.format(i))
        model_file = open('earthquake_{}_model.txt'.format(i), 'wb')
        pickle.dump(model, model_file)
        model_file.flush()
        model_file.close()

    return

In [13]:
%%time
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

CPU times: user 1.38 s, sys: 80 ms, total: 1.46 s
Wall time: 1.46 s


In [14]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df,
    sample_size=not_seen_data_df.shape[0] // 20,
    holdout_size=0
)

Full calculation feature value time (with slicing) 5.046526590983073e-06 min:
CPU times: user 984 ms, sys: 212 ms, total: 1.2 s
Wall time: 1.19 s


In [15]:
%%time
train_models(earthquake_margin_indexes, complete_earthquakes_length, params, featured_not_seen_data_df)

i: 0
complete_earthquakes_length: 44429304
earthquake_margin_indexes: 5656573
Full calculation feature value time (with slicing) 0.04236885706583659 min:




Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.86507	valid_1's l1: 2.86328
[2000]	training's l1: 2.86135	valid_1's l1: 2.85949
[3000]	training's l1: 2.8605	valid_1's l1: 2.8586
[4000]	training's l1: 2.86026	valid_1's l1: 2.85834
[5000]	training's l1: 2.86017	valid_1's l1: 2.85825
[6000]	training's l1: 2.86014	valid_1's l1: 2.85822
[7000]	training's l1: 2.86012	valid_1's l1: 2.85821
[8000]	training's l1: 2.86012	valid_1's l1: 2.85821
[9000]	training's l1: 2.86012	valid_1's l1: 2.85821
[10000]	training's l1: 2.86012	valid_1's l1: 2.85821
[11000]	training's l1: 2.86011	valid_1's l1: 2.85821
[12000]	training's l1: 2.86011	valid_1's l1: 2.85821
[13000]	training's l1: 2.86011	valid_1's l1: 2.85821
[14000]	training's l1: 2.86011	valid_1's l1: 2.85821
[15000]	training's l1: 2.86011	valid_1's l1: 2.85821
[16000]	training's l1: 2.86011	valid_1's l1: 2.85821
Early stopping, best iteration is:
[12350]	training's l1: 2.86011	valid_1's l1: 2.85821
earthquake 



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 3.51704	valid_1's l1: 3.51685
[2000]	training's l1: 3.51158	valid_1's l1: 3.51121
[3000]	training's l1: 3.51015	valid_1's l1: 3.50971
[4000]	training's l1: 3.5097	valid_1's l1: 3.50923
[5000]	training's l1: 3.50954	valid_1's l1: 3.50906
[6000]	training's l1: 3.50948	valid_1's l1: 3.50901
[7000]	training's l1: 3.50946	valid_1's l1: 3.50899
[8000]	training's l1: 3.50945	valid_1's l1: 3.50899
[9000]	training's l1: 3.50945	valid_1's l1: 3.50899
[10000]	training's l1: 3.50945	valid_1's l1: 3.50899
[11000]	training's l1: 3.50944	valid_1's l1: 3.50899
[12000]	training's l1: 3.50944	valid_1's l1: 3.50899
Early stopping, best iteration is:
[8255]	training's l1: 3.50945	valid_1's l1: 3.50899
earthquake 1 mae 3.5114871244390997
not seen data mae: 6.334338894587381
i: 2
complete_earthquakes_length: 34095097
earthquake_margin_indexes: 104677355
Full calculation feature value time (with slicing) 0.04176505009333293



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.18953	valid_1's l1: 2.18646
[2000]	training's l1: 2.18368	valid_1's l1: 2.18059
[3000]	training's l1: 2.182	valid_1's l1: 2.17888
[4000]	training's l1: 2.18145	valid_1's l1: 2.17833
[5000]	training's l1: 2.18125	valid_1's l1: 2.17814
[6000]	training's l1: 2.18118	valid_1's l1: 2.17807
[7000]	training's l1: 2.18115	valid_1's l1: 2.17805
[8000]	training's l1: 2.18114	valid_1's l1: 2.17804
[9000]	training's l1: 2.18114	valid_1's l1: 2.17804
[10000]	training's l1: 2.18113	valid_1's l1: 2.17804
[11000]	training's l1: 2.18113	valid_1's l1: 2.17804
[12000]	training's l1: 2.18113	valid_1's l1: 2.17804
[13000]	training's l1: 2.18113	valid_1's l1: 2.17804
[14000]	training's l1: 2.18113	valid_1's l1: 2.17804
Early stopping, best iteration is:
[10764]	training's l1: 2.18113	valid_1's l1: 2.17804
earthquake 2 mae 2.1822420931093793
not seen data mae: 3.6585048869825108
i: 3
complete_earthquakes_length: 48869367




Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 3.14204	valid_1's l1: 3.13913
[2000]	training's l1: 3.13488	valid_1's l1: 3.13186
[3000]	training's l1: 3.1329	valid_1's l1: 3.12985
[4000]	training's l1: 3.13226	valid_1's l1: 3.1292
[5000]	training's l1: 3.13203	valid_1's l1: 3.12897
[6000]	training's l1: 3.13195	valid_1's l1: 3.12889
[7000]	training's l1: 3.13192	valid_1's l1: 3.12886
[8000]	training's l1: 3.13191	valid_1's l1: 3.12886
[9000]	training's l1: 3.1319	valid_1's l1: 3.12886
[10000]	training's l1: 3.1319	valid_1's l1: 3.12886
[11000]	training's l1: 3.13189	valid_1's l1: 3.12886
Early stopping, best iteration is:
[7941]	training's l1: 3.13191	valid_1's l1: 3.12886
earthquake 3 mae 3.129220831194849
not seen data mae: 5.583504634929253
i: 4
complete_earthquakes_length: 31010810
earthquake_margin_indexes: 187641819
Full calculation feature value time (with slicing) 0.042831977208455406 min:




Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 1.99592	valid_1's l1: 1.99622
[2000]	training's l1: 1.99165	valid_1's l1: 1.99196
[3000]	training's l1: 1.99043	valid_1's l1: 1.99075
[4000]	training's l1: 1.99003	valid_1's l1: 1.99036
[5000]	training's l1: 1.98988	valid_1's l1: 1.99022
[6000]	training's l1: 1.98983	valid_1's l1: 1.99017
[7000]	training's l1: 1.98981	valid_1's l1: 1.99015
[8000]	training's l1: 1.9898	valid_1's l1: 1.99015
[9000]	training's l1: 1.98979	valid_1's l1: 1.99014
[10000]	training's l1: 1.98979	valid_1's l1: 1.99014
[11000]	training's l1: 1.98979	valid_1's l1: 1.99014
[12000]	training's l1: 1.98978	valid_1's l1: 1.99014
[13000]	training's l1: 1.98978	valid_1's l1: 1.99014
[14000]	training's l1: 1.98978	valid_1's l1: 1.99014
[15000]	training's l1: 1.98978	valid_1's l1: 1.99014
[16000]	training's l1: 1.98978	valid_1's l1: 1.99014
[17000]	training's l1: 1.98978	valid_1's l1: 1.99014
[18000]	training's l1: 1.98978	valid_1's l1: 



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 1.75285	valid_1's l1: 1.75206
[2000]	training's l1: 1.74966	valid_1's l1: 1.74895
[3000]	training's l1: 1.74871	valid_1's l1: 1.74801
[4000]	training's l1: 1.74839	valid_1's l1: 1.7477
[5000]	training's l1: 1.74827	valid_1's l1: 1.74759
[6000]	training's l1: 1.74823	valid_1's l1: 1.74755
[7000]	training's l1: 1.74822	valid_1's l1: 1.74754
[8000]	training's l1: 1.74821	valid_1's l1: 1.74754
[9000]	training's l1: 1.74821	valid_1's l1: 1.74754
[10000]	training's l1: 1.7482	valid_1's l1: 1.74753
[11000]	training's l1: 1.7482	valid_1's l1: 1.74753
[12000]	training's l1: 1.7482	valid_1's l1: 1.74753
[13000]	training's l1: 1.7482	valid_1's l1: 1.74753
[14000]	training's l1: 1.7482	valid_1's l1: 1.74753
[15000]	training's l1: 1.7482	valid_1's l1: 1.74753
[16000]	training's l1: 1.7482	valid_1's l1: 1.74753
[17000]	training's l1: 1.7482	valid_1's l1: 1.74753
[18000]	training's l1: 1.7482	valid_1's l1: 1.74753
[



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 4.00153	valid_1's l1: 3.99877
[2000]	training's l1: 3.99531	valid_1's l1: 3.99254
[3000]	training's l1: 3.99358	valid_1's l1: 3.99083
[4000]	training's l1: 3.99302	valid_1's l1: 3.99028
[5000]	training's l1: 3.99282	valid_1's l1: 3.99009
[6000]	training's l1: 3.99275	valid_1's l1: 3.99002
[7000]	training's l1: 3.99272	valid_1's l1: 3.99
[8000]	training's l1: 3.99271	valid_1's l1: 3.98999
[9000]	training's l1: 3.9927	valid_1's l1: 3.98999
[10000]	training's l1: 3.9927	valid_1's l1: 3.98999
[11000]	training's l1: 3.99269	valid_1's l1: 3.98999
[12000]	training's l1: 3.99269	valid_1's l1: 3.98999
Early stopping, best iteration is:
[8441]	training's l1: 3.9927	valid_1's l1: 3.98999
earthquake 6 mae 3.9908463494983475
not seen data mae: 7.2921466053095925
i: 7
complete_earthquakes_length: 30437370
earthquake_margin_indexes: 307838916
Full calculation feature value time (with slicing) 0.042690805594126385 mi



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 1.95717	valid_1's l1: 1.95655
[2000]	training's l1: 1.95251	valid_1's l1: 1.95184
[3000]	training's l1: 1.95115	valid_1's l1: 1.95047
[4000]	training's l1: 1.9507	valid_1's l1: 1.95002
[5000]	training's l1: 1.95054	valid_1's l1: 1.94986
[6000]	training's l1: 1.95048	valid_1's l1: 1.9498
[7000]	training's l1: 1.95046	valid_1's l1: 1.94979
[8000]	training's l1: 1.95045	valid_1's l1: 1.94978
[9000]	training's l1: 1.95045	valid_1's l1: 1.94978
[10000]	training's l1: 1.95044	valid_1's l1: 1.94978
[11000]	training's l1: 1.95044	valid_1's l1: 1.94978
[12000]	training's l1: 1.95044	valid_1's l1: 1.94978
[13000]	training's l1: 1.95044	valid_1's l1: 1.94977
[14000]	training's l1: 1.95044	valid_1's l1: 1.94977
[15000]	training's l1: 1.95043	valid_1's l1: 1.94977
[16000]	training's l1: 1.95043	valid_1's l1: 1.94977
[17000]	training's l1: 1.95043	valid_1's l1: 1.94977
[18000]	training's l1: 1.95043	valid_1's l1: 1



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.38657	valid_1's l1: 2.38638
[2000]	training's l1: 2.38128	valid_1's l1: 2.38132
[3000]	training's l1: 2.37977	valid_1's l1: 2.37989
[4000]	training's l1: 2.37926	valid_1's l1: 2.37941
[5000]	training's l1: 2.37908	valid_1's l1: 2.37925
[6000]	training's l1: 2.37901	valid_1's l1: 2.37919
[7000]	training's l1: 2.37899	valid_1's l1: 2.37918
[8000]	training's l1: 2.37898	valid_1's l1: 2.37917
[9000]	training's l1: 2.37898	valid_1's l1: 2.37917
[10000]	training's l1: 2.37897	valid_1's l1: 2.37917
[11000]	training's l1: 2.37897	valid_1's l1: 2.37917
[12000]	training's l1: 2.37897	valid_1's l1: 2.37917
[13000]	training's l1: 2.37897	valid_1's l1: 2.37917
[14000]	training's l1: 2.37896	valid_1's l1: 2.37917
[15000]	training's l1: 2.37896	valid_1's l1: 2.37917
[16000]	training's l1: 2.37896	valid_1's l1: 2.37917
Early stopping, best iteration is:
[12844]	training's l1: 2.37897	valid_1's l1: 2.37917
earthquak



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.83503	valid_1's l1: 2.83279
[2000]	training's l1: 2.82954	valid_1's l1: 2.82728
[3000]	training's l1: 2.82798	valid_1's l1: 2.8257
[4000]	training's l1: 2.82746	valid_1's l1: 2.82518
[5000]	training's l1: 2.82728	valid_1's l1: 2.825
[6000]	training's l1: 2.82721	valid_1's l1: 2.82493
[7000]	training's l1: 2.82718	valid_1's l1: 2.82492
[8000]	training's l1: 2.82717	valid_1's l1: 2.82492
[9000]	training's l1: 2.82716	valid_1's l1: 2.82492
[10000]	training's l1: 2.82716	valid_1's l1: 2.82492
[11000]	training's l1: 2.82715	valid_1's l1: 2.82492
[12000]	training's l1: 2.82715	valid_1's l1: 2.82492
[13000]	training's l1: 2.82715	valid_1's l1: 2.82492
[14000]	training's l1: 2.82714	valid_1's l1: 2.82492
[15000]	training's l1: 2.82714	valid_1's l1: 2.82492
Early stopping, best iteration is:
[11797]	training's l1: 2.82715	valid_1's l1: 2.82492
earthquake 9 mae 2.82520527008766
not seen data mae: 4.9463273808



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.73172	valid_1's l1: 2.72878
[2000]	training's l1: 2.7259	valid_1's l1: 2.72288
[3000]	training's l1: 2.72423	valid_1's l1: 2.7212
[4000]	training's l1: 2.72368	valid_1's l1: 2.72065
[5000]	training's l1: 2.72349	valid_1's l1: 2.72046
[6000]	training's l1: 2.72342	valid_1's l1: 2.72039
[7000]	training's l1: 2.72339	valid_1's l1: 2.72037
[8000]	training's l1: 2.72338	valid_1's l1: 2.72037
[9000]	training's l1: 2.72337	valid_1's l1: 2.72036
[10000]	training's l1: 2.72336	valid_1's l1: 2.72036
[11000]	training's l1: 2.72336	valid_1's l1: 2.72036
[12000]	training's l1: 2.72336	valid_1's l1: 2.72036
[13000]	training's l1: 2.72335	valid_1's l1: 2.72036
[14000]	training's l1: 2.72335	valid_1's l1: 2.72036
[15000]	training's l1: 2.72335	valid_1's l1: 2.72036
[16000]	training's l1: 2.72335	valid_1's l1: 2.72036
[17000]	training's l1: 2.72335	valid_1's l1: 2.72036
[18000]	training's l1: 2.72335	valid_1's l1: 2



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.18322	valid_1's l1: 2.18001
[2000]	training's l1: 2.17746	valid_1's l1: 2.17415
[3000]	training's l1: 2.1758	valid_1's l1: 2.17246
[4000]	training's l1: 2.17525	valid_1's l1: 2.17191
[5000]	training's l1: 2.17506	valid_1's l1: 2.17171
[6000]	training's l1: 2.17499	valid_1's l1: 2.17164
[7000]	training's l1: 2.17496	valid_1's l1: 2.17162
[8000]	training's l1: 2.17495	valid_1's l1: 2.17161
[9000]	training's l1: 2.17495	valid_1's l1: 2.17161
[10000]	training's l1: 2.17494	valid_1's l1: 2.17161
[11000]	training's l1: 2.17494	valid_1's l1: 2.17161
[12000]	training's l1: 2.17494	valid_1's l1: 2.17161
[13000]	training's l1: 2.17494	valid_1's l1: 2.17161
[14000]	training's l1: 2.17494	valid_1's l1: 2.17161
[15000]	training's l1: 2.17494	valid_1's l1: 2.17161
[16000]	training's l1: 2.17494	valid_1's l1: 2.17161
[17000]	training's l1: 2.17494	valid_1's l1: 2.17161
Early stopping, best iteration is:
[13403]	tr



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.12266	valid_1's l1: 2.12067
[2000]	training's l1: 2.1182	valid_1's l1: 2.1162
[3000]	training's l1: 2.11691	valid_1's l1: 2.1149
[4000]	training's l1: 2.11648	valid_1's l1: 2.11446
[5000]	training's l1: 2.11633	valid_1's l1: 2.11431
[6000]	training's l1: 2.11627	valid_1's l1: 2.11425
[7000]	training's l1: 2.11625	valid_1's l1: 2.11423
[8000]	training's l1: 2.11624	valid_1's l1: 2.11423
[9000]	training's l1: 2.11624	valid_1's l1: 2.11423
[10000]	training's l1: 2.11624	valid_1's l1: 2.11423
[11000]	training's l1: 2.11623	valid_1's l1: 2.11422
[12000]	training's l1: 2.11623	valid_1's l1: 2.11422
[13000]	training's l1: 2.11623	valid_1's l1: 2.11422
[14000]	training's l1: 2.11623	valid_1's l1: 2.11422
[15000]	training's l1: 2.11623	valid_1's l1: 2.11422
[16000]	training's l1: 2.11623	valid_1's l1: 2.11422
[17000]	training's l1: 2.11623	valid_1's l1: 2.11422
[18000]	training's l1: 2.11623	valid_1's l1: 2.



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 3.66196	valid_1's l1: 3.65794
[2000]	training's l1: 3.65669	valid_1's l1: 3.6526
[3000]	training's l1: 3.65527	valid_1's l1: 3.65117
[4000]	training's l1: 3.65482	valid_1's l1: 3.65072
[5000]	training's l1: 3.65466	valid_1's l1: 3.65056
[6000]	training's l1: 3.6546	valid_1's l1: 3.65051
[7000]	training's l1: 3.65458	valid_1's l1: 3.6505
[8000]	training's l1: 3.65457	valid_1's l1: 3.6505
[9000]	training's l1: 3.65456	valid_1's l1: 3.65049
[10000]	training's l1: 3.65456	valid_1's l1: 3.65049
[11000]	training's l1: 3.65455	valid_1's l1: 3.65049
[12000]	training's l1: 3.65455	valid_1's l1: 3.6505
[13000]	training's l1: 3.65454	valid_1's l1: 3.6505
Early stopping, best iteration is:
[9253]	training's l1: 3.65456	valid_1's l1: 3.65049
earthquake 13 mae 3.6547771288815145
not seen data mae: 6.627150714360123
CPU times: user 1d 7h 30min 35s, sys: 3min 25s, total: 1d 7h 34min 1s
Wall time: 3h 50min 33s


In [25]:
%%time
train_models(earthquake_margin_indexes[:5], complete_earthquakes_length, params, featured_not_seen_data_df)

i: 0
complete_earthquakes_length: 44429304
earthquake_margin_indexes: 5656573
Full calculation feature value time (with slicing) 49.81775324344635 min:




Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.85065	valid_1's l1: 2.84681
[2000]	training's l1: 2.84416	valid_1's l1: 2.84038
[3000]	training's l1: 2.84234	valid_1's l1: 2.83866
[4000]	training's l1: 2.84161	valid_1's l1: 2.83803
[5000]	training's l1: 2.84124	valid_1's l1: 2.83775
[6000]	training's l1: 2.84104	valid_1's l1: 2.83763
[7000]	training's l1: 2.8409	valid_1's l1: 2.83756
[8000]	training's l1: 2.84078	valid_1's l1: 2.83749
[9000]	training's l1: 2.84067	valid_1's l1: 2.83744
[10000]	training's l1: 2.84057	valid_1's l1: 2.83741
[11000]	training's l1: 2.84048	valid_1's l1: 2.83738
[12000]	training's l1: 2.84039	valid_1's l1: 2.83735
[13000]	training's l1: 2.84031	valid_1's l1: 2.83733
[14000]	training's l1: 2.84023	valid_1's l1: 2.83731
[15000]	training's l1: 2.84015	valid_1's l1: 2.83729
[16000]	training's l1: 2.84008	valid_1's l1: 2.83728
[17000]	training's l1: 2.84002	valid_1's l1: 2.83726
[18000]	training's l1: 2.83995	valid_1's l1: 



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 3.49634	valid_1's l1: 3.4937
[2000]	training's l1: 3.48612	valid_1's l1: 3.48335
[3000]	training's l1: 3.48298	valid_1's l1: 3.48026
[4000]	training's l1: 3.48165	valid_1's l1: 3.47903
[5000]	training's l1: 3.48099	valid_1's l1: 3.47847
[6000]	training's l1: 3.48067	valid_1's l1: 3.47824
[7000]	training's l1: 3.48048	valid_1's l1: 3.47812
[8000]	training's l1: 3.48032	valid_1's l1: 3.47804
[9000]	training's l1: 3.48017	valid_1's l1: 3.47797
[10000]	training's l1: 3.48003	valid_1's l1: 3.47791
[11000]	training's l1: 3.47989	valid_1's l1: 3.47786
[12000]	training's l1: 3.47978	valid_1's l1: 3.47783
[13000]	training's l1: 3.47967	valid_1's l1: 3.4778
[14000]	training's l1: 3.47956	valid_1's l1: 3.47777
[15000]	training's l1: 3.47946	valid_1's l1: 3.47774
[16000]	training's l1: 3.47936	valid_1's l1: 3.47771
[17000]	training's l1: 3.47926	valid_1's l1: 3.4777
[18000]	training's l1: 3.47917	valid_1's l1: 3.



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.17361	valid_1's l1: 2.17271
[2000]	training's l1: 2.16396	valid_1's l1: 2.1632
[3000]	training's l1: 2.16088	valid_1's l1: 2.16024
[4000]	training's l1: 2.15964	valid_1's l1: 2.15911
[5000]	training's l1: 2.15905	valid_1's l1: 2.15862
[6000]	training's l1: 2.15875	valid_1's l1: 2.15838
[7000]	training's l1: 2.15859	valid_1's l1: 2.15827
[8000]	training's l1: 2.15846	valid_1's l1: 2.1582
[9000]	training's l1: 2.15836	valid_1's l1: 2.15814
[10000]	training's l1: 2.15827	valid_1's l1: 2.1581
[11000]	training's l1: 2.15819	valid_1's l1: 2.15806
[12000]	training's l1: 2.15811	valid_1's l1: 2.15804
[13000]	training's l1: 2.15805	valid_1's l1: 2.15801
[14000]	training's l1: 2.15798	valid_1's l1: 2.15799
[15000]	training's l1: 2.15792	valid_1's l1: 2.15798
[16000]	training's l1: 2.15786	valid_1's l1: 2.15797
[17000]	training's l1: 2.15781	valid_1's l1: 2.15796
[18000]	training's l1: 2.15775	valid_1's l1: 2.



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 3.11877	valid_1's l1: 3.11816
[2000]	training's l1: 3.10619	valid_1's l1: 3.10578
[3000]	training's l1: 3.10225	valid_1's l1: 3.10198
[4000]	training's l1: 3.10064	valid_1's l1: 3.10049
[5000]	training's l1: 3.0999	valid_1's l1: 3.09983
[6000]	training's l1: 3.09955	valid_1's l1: 3.09955
[7000]	training's l1: 3.0993	valid_1's l1: 3.09937
[8000]	training's l1: 3.09911	valid_1's l1: 3.09923
[9000]	training's l1: 3.09894	valid_1's l1: 3.09911
[10000]	training's l1: 3.09879	valid_1's l1: 3.09902
[11000]	training's l1: 3.09865	valid_1's l1: 3.09894
[12000]	training's l1: 3.09853	valid_1's l1: 3.09888
[13000]	training's l1: 3.09842	valid_1's l1: 3.09882
[14000]	training's l1: 3.09832	valid_1's l1: 3.09878
[15000]	training's l1: 3.09822	valid_1's l1: 3.09874
[16000]	training's l1: 3.09813	valid_1's l1: 3.0987
[17000]	training's l1: 3.09804	valid_1's l1: 3.09867
[18000]	training's l1: 3.09795	valid_1's l1: 3.



Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 1.9837	valid_1's l1: 1.98141
[2000]	training's l1: 1.97651	valid_1's l1: 1.9744
[3000]	training's l1: 1.97429	valid_1's l1: 1.97228
[4000]	training's l1: 1.97342	valid_1's l1: 1.97148
[5000]	training's l1: 1.97302	valid_1's l1: 1.97114
[6000]	training's l1: 1.97284	valid_1's l1: 1.971
[7000]	training's l1: 1.97272	valid_1's l1: 1.97092
[8000]	training's l1: 1.97263	valid_1's l1: 1.97088
[9000]	training's l1: 1.97255	valid_1's l1: 1.97085
[10000]	training's l1: 1.97248	valid_1's l1: 1.97082
[11000]	training's l1: 1.97241	valid_1's l1: 1.9708
[12000]	training's l1: 1.97234	valid_1's l1: 1.97077
[13000]	training's l1: 1.97227	valid_1's l1: 1.97075
[14000]	training's l1: 1.97221	valid_1's l1: 1.97074
[15000]	training's l1: 1.97215	valid_1's l1: 1.97072
[16000]	training's l1: 1.9721	valid_1's l1: 1.97072
[17000]	training's l1: 1.97205	valid_1's l1: 1.97071
[18000]	training's l1: 1.97201	valid_1's l1: 1.970

IndexError: list index out of range

In [26]:
y_not_seen_data = featured_not_seen_data_df['time_to_failure']
featured_not_seen_data_no_time_df = featured_not_seen_data_df[featured_not_seen_data_df.columns.drop('time_to_failure')]

In [28]:
for i in range(5, 15):
    model_file = open('earthquake_{}_model.txt'.format(i), 'rb')
    model = pickle.load(model_file)
    model_file.close()
    not_seen_data_predict = model.predict(featured_not_seen_data_no_time_df)
    print("not seen data mae: {}".format(mean_absolute_error(y_not_seen_data, not_seen_data_predict)))
    not_seen_data_predict_df = pd.DataFrame({'time_to_failure': not_seen_data_predict})
    not_seen_data_predict_df.to_csv('not_seend_data_earthquake_{}_model_predict.csv'.format(i), index=False)

not seen data mae: 2.762850373105831
not seen data mae: 7.260232979477146
not seen data mae: 3.1379145540556395
not seen data mae: 4.024417336485413
not seen data mae: 4.9262122734709335
not seen data mae: 4.927543366133553
not seen data mae: 4.725996147452754
not seen data mae: 3.610067084837362
not seen data mae: 3.4795987507724666
not seen data mae: 6.62556275849406


In [None]:
#featured_not_seen_data_df.drop?

In [30]:
predictions_df = pd.DataFrame()
for i in (2, 3, 5, 7, 12, 13):
    earthquake_pred_df  = pd.read_csv('not_seend_data_earthquake_{}_model_predict.csv'.format(i))
    predictions_df['pred {}'.format(i)] = earthquake_pred_df['time_to_failure']

In [33]:
compound_prediction = pd.DataFrame()
compound_prediction['time_to_failure'] = predictions_df.mean(axis=1)

In [35]:
mean_absolute_error(y_not_seen_data, compound_prediction.values)

3.696729091456896

In [36]:
all_predictions_df = pd.DataFrame()
for i in range(15):
    earthquake_pred_df  = pd.read_csv('not_seend_data_earthquake_{}_model_predict.csv'.format(i))
    all_predictions_df['pred {}'.format(i)] = earthquake_pred_df['time_to_failure']

In [37]:
compound_all_prediction = pd.DataFrame()
compound_all_prediction['time_to_failure'] = all_predictions_df.mean(axis=1)

In [40]:
mean_absolute_error(y_not_seen_data, compound_all_prediction.values)

4.612704339503629

In [41]:
gc.collect()

528

In [42]:
!ls ../input/test/

seg_00030f.csv	seg_335170.csv	seg_6c309f.csv	seg_9d7490.csv	seg_cecd29.csv
seg_0012b5.csv	seg_337b8c.csv	seg_6c34c2.csv	seg_9d79d4.csv	seg_ced992.csv
seg_00184e.csv	seg_339f80.csv	seg_6c8a45.csv	seg_9dcae1.csv	seg_cedf3e.csv
seg_003339.csv	seg_33c30d.csv	seg_6c909f.csv	seg_9ddc99.csv	seg_cee7d0.csv
seg_0042cc.csv	seg_342ce6.csv	seg_6cfb76.csv	seg_9dea8d.csv	seg_cf0e43.csv
seg_004314.csv	seg_343571.csv	seg_6d01a3.csv	seg_9df32d.csv	seg_cf1371.csv
seg_004cd2.csv	seg_3452b2.csv	seg_6d35cd.csv	seg_9e25e9.csv	seg_cf25d2.csv
seg_004ee5.csv	seg_34a2b8.csv	seg_6d36a1.csv	seg_9e3837.csv	seg_cf3825.csv
seg_004f1f.csv	seg_34a8f7.csv	seg_6d4109.csv	seg_9e61da.csv	seg_cf5764.csv
seg_00648a.csv	seg_34abd2.csv	seg_6d4fa6.csv	seg_9e7dff.csv	seg_cf646e.csv
seg_006e4a.csv	seg_34ef79.csv	seg_6d6fad.csv	seg_9e8323.csv	seg_cf74e8.csv
seg_007a37.csv	seg_3506d6.csv	seg_6da1ff.csv	seg_9e8ca4.csv	seg_cf9a49.csv
seg_00a37e.csv	seg_35269b.csv	seg_6dac5d.csv	seg_9e962b.csv	seg_cfed24.csv
seg_00be11.c

In [46]:
seg_df = pd.read_csv('../input/test/seg_00030f.csv', dtype={'acoustic_data': np.float32})

In [47]:
seg_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 1 columns):
acoustic_data    150000 non-null float32
dtypes: float32(1)
memory usage: 586.0 KB


In [48]:
seg_df.head()

Unnamed: 0,acoustic_data
0,4.0
1,0.0
2,-2.0
3,0.0
4,2.0


In [50]:
featured_seg_df = add_features(seg_df, holdout_size=0)

Full calculation feature value time (with slicing) 1.6346824367841084 min:


In [51]:
segment_names_ls = os.listdir('../input/test/')

In [52]:
total_add_features_minutes = len(segment_names_ls) * 1.64

In [53]:
total_add_features_minutes

4303.36

In [54]:
min_in_day = 24 * 60

In [55]:
min_in_day

1440

In [56]:
days = total_add_features_minutes / min_in_day

In [57]:
days

2.9884444444444442