In [1]:
import gc
import math
import os
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [76]:
#np.random.randint?

In [54]:
def features_maker(df, first_index=None, last_index=None, smootch_windows_size = (3, 5, 7)):
    if first_index == None or last_index == None:
        first_index = 0
        last_index = df.shape[0] - 1

    smooth_feature_names = ['smootch_feature_{}_ws_{}'.format(i, window_size) for i, window_size in enumerate(smootch_windows_size)]
    for feature_name in smooth_feature_names:
        df[feature_name] = 0
    for i in df.index:
        for smooth_feature_name, window_size in zip(smooth_feature_names, smootch_windows_size):
            half_window_size = window_size // 2
            data_series = df['acoustic_data']
            if i < first_index + half_window_size:
                smooth_feature_value = data_series.iloc[first_index:first_index + window_size].mean()
            elif i < last_index - half_window_size:
                smooth_feature_value = data_series.iloc[last_index - window_size:last_index].mean()
            else:
                smooth_feature_value = data_series.iloc[i - half_window_size:i + half_window_size].mean()
            df.iloc[i][feature_name] = data_series[i] - smooth_feature_value
    return df

In [108]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes.sort()
        
    smootch_feature_names = ['smootch_feature_{}_ws_{}'.format(i, window_size) for i, window_size in enumerate(smootch_windows_size)]
    half_windows_size = [ws // 2 for ws in smootch_windows_size]
    print("half_windows_size:\n", half_windows_size)
    
    print("sample_indexes.shape:", sample_indexes.shape)
    print("sample_indexes[:5]", sample_indexes[:5])
    print("sample_indexes[-5:]", sample_indexes[-5:])
    print("df.index.min()", df.index.min())
    print("df.index.max()", df.index.max())
    
    
    print("before data_series")
    sample_df = df.iloc[sample_indexes]
    data_series = sample_df['acoustic_data']
    
    #data_series = df[sample_indexes]['acoustic_data']
    print("data_series.head()\n", data_series.head())
    
    #sample_df = df[sample_indexes]
    for feature_name in smootch_feature_names:
        sample_df[feature_name] = 0
    begin_smootch_features_value = []
    end_smootch_features_value = []
    for h_w_size, feature_name in zip(half_windows_size, smootch_feature_names):
        print("\n\n")
        print(feature_name, h_w_size)
        print("before begin and end indexes operations")
        begin_indexes = sample_indexes[:h_w_size]
        sample_begin_indexes = sample_indexes[:h_w_size]
        print("sample_begin_indexes:\n", sample_begin_indexes)
        full_data_begin_indexes = df.index[:h_w_size].tolist()
        print("full_data_begin_indexes:\n", full_data_begin_indexes)
        in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-h_w_size:]
        print("sample_end_indexes:\n", sample_end_indexes)
        full_data_end_indexes = df.index[-h_w_size:].tolist()
        print("full_data_end_indexes:\n", full_data_end_indexes)
        in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        print("in_window_end_indexes:\n", in_window_end_indexes)
        
        print("\n")
        print("begin_indexes <= h_w_size:\n", begin_indexes <= h_w_size)
        #begin_indexes = begin_indexes[begin_indexes <= h_w_size]
        begin_indexes = begin_indexes[begin_indexes <= h_w_size]
        
        end_indexes = sample_indexes[-h_w_size:]
        print("end_indexes >= end_indexes.max() - h_w_size:\n", end_indexes >= end_indexes.max() - h_w_size)
        print("end_indexes:\n", end_indexes)
        end_indexes = end_indexes[end_indexes >= end_indexes.max() - h_w_size]
        print("begin_idexes:\n", begin_indexes)
        print("end_idexes:\n", end_indexes)
        print("end_indexes.max():", end_indexes.max())
        print("h_w_size:", h_w_size)
        '''
        if begin_indexes:
            print("if begin_indexes")
            for i, b_idx in enumerate(begin_indexes):
                begin_smootch_features_value.append(df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean())
                sample_df.iloc[i][feature_name] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
        if end_indexes:
            print("if end_indexes")
            for i, e_idx in enumerate(end_indexes):
                end_smootch_features_value.append(df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:last_index]['acoustic_data'].mean())
                sample_df.iloc[sample_df.shape[0] - i][feature_name] = df[e_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
        '''
    return sample_df

In [101]:
earthquake_margin_indexes =[
    5656573,
    50085877,
    104677355,
    138772452,
    187641819,
    218652629,
    245829584,
    307838916,
    338276286,
    375377847,
    419368879,
    461811622,
    495800224,
    528777114,
    585568143,
    621985672
]

In [8]:
earthquakes_length = [earthquake_margin_indexes[i + 1] - earthquake_margin_indexes[i] for i in range(len(earthquake_margin_indexes) - 1)]

In [9]:
earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029,
 36417529]

In [10]:
621985672 - 585568143

36417529

In [11]:
50085877 - 5656573

44429304

In [32]:
complete_earthquakes_length = earthquakes_length[:-1]

In [15]:
#complete_earthquaces_length = complete_earthquaces_length[:-1]

In [33]:
complete_earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029]

In [34]:
len(complete_earthquakes_length)

14

In [36]:
earthquakes_length_more_2_less_3 = [length for length in complete_earthquaces_length if length > 20000000 and length < 30000000]

In [37]:
earthquakes_length_more_3_less_4 = [length for length in complete_earthquaces_length if length > 30000000 and length < 40000000]

In [38]:
earthquakes_length_more_4_less_5 = [length for length in complete_earthquaces_length if length > 40000000 and length < 50000000]

In [39]:
earthquakes_length_more_5_less_6 = [length for length in complete_earthquaces_length if length > 50000000 and length < 60000000]

In [40]:
earthquakes_length_more_6 = [length for length in complete_earthquaces_length if length > 60000000]

In [41]:
earthquakes_length_more_3_less_4

[34095097, 31010810, 30437370, 37101561, 33988602, 32976890]

In [42]:
earthquakes_length_more_2_less_3

[27176955]

In [43]:
earthquakes_length_more_3_less_4

[34095097, 31010810, 30437370, 37101561, 33988602, 32976890]

In [44]:
earthquakes_length_more_4_less_5

[44429304, 48869367, 43991032, 42442743]

In [45]:
earthquakes_length_more_5_less_6

[54591478, 56791029]

In [46]:
earthquakes_length_more_6

[62009332]

In [92]:
%time
earthquake_1_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=earthquake_margin_indexes[0],
    nrows=complete_earthquakes_length[0]
)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.68 µs


In [49]:
earthquake_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44429304 entries, 0 to 44429303
Data columns (total 2 columns):
acoustic_data      float32
time_to_failure    float32
dtypes: float32(2)
memory usage: 339.0 MB


%%time
earthquake_1_with_additional_features_df = features_maker(earthquake_1_df)

In [56]:
#np.random.randint?

In [109]:
%%time
earthquake_1_with_additional_features_df = add_features(earthquake_1_df)

half_windows_size:
 [1, 2, 3]
sample_indexes.shape: (150000,)
sample_indexes[:5] [ 863 1454 1558 2425 2557]
sample_indexes[-5:] [44427446 44427809 44427997 44428423 44428728]
df.index.min() 0
df.index.max() 44429303
before data_series
data_series.head()
 863     3.0
1454    5.0
1558    7.0
2425    1.0
2557    5.0
Name: acoustic_data, dtype: float32



smootch_feature_0_ws_3 1
before begin and end indexes operations
sample_begin_indexes:
 [863]
full_data_begin_indexes:
 [0]
in_window_begin_indexes:
 []
sample_end_indexes:
 [44428728]
full_data_end_indexes:
 [44429303]
in_window_end_indexes:
 []


begin_indexes <= h_w_size:
 [False]
end_indexes >= end_indexes.max() - h_w_size:
 [ True]
end_indexes:
 [44428728]
begin_idexes:
 []
end_idexes:
 [44428728]
end_indexes.max(): 44428728
h_w_size: 1



smootch_feature_1_ws_5 2
before begin and end indexes operations
sample_begin_indexes:
 [ 863 1454]
full_data_begin_indexes:
 [0, 1]
in_window_begin_indexes:
 []
sample_end_indexes:
 [44428423 4442

In [69]:
earthquake_1_df.index.min()

0

In [70]:
earthquake_1_df.index.max()

44429303

In [83]:
earthquake_1_df.iloc[522]

acoustic_data             2.000000
time_to_failure          11.540799
smooth_feature_0_ws_3     0.000000
smooth_feature_1_ws_5     0.000000
smooth_feature_2_ws_7     0.000000
Name: 522, dtype: float64