In [1]:
import gc
import math
import os
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [18]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes.sort()
    print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)
    print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    begin_smootch_features_value = []
    end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        feature_values_list = list(range(sample_size))
        
        half_window_size = window_size // 2
        begin_indexes = sample_indexes[:half_window_size]        
        sample_begin_indexes = sample_indexes[:half_window_size]
        full_data_begin_indexes = df.index[:half_window_size].tolist()

        in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        
        sample_end_indexes = sample_indexes[-half_window_size:]

        full_data_end_indexes = df.index[-half_window_size:].tolist()
        
        in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        begin_indexes = begin_indexes[begin_indexes <= half_window_size]
        
        end_indexes = sample_indexes[-half_window_size:]
        
        if in_window_begin_indexes:
            begin_indexes_set.union(set(in_window_begin_indexes))
            for i, b_idx in enumerate(in_window_begin_indexes):
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                feature_values_list[i] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
        if in_window_end_indexes:
            end_indexes_set.union(set(in_window_end_indexes))
            for i, e_idx in enumerat(in_window_end_indexes):
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                feature_values_list[-i] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
        print("in main loop, sample_df.shape:", sample_df.shape)
        slice_begin = len(begin_indexes_set)
        slice_end = sample_df_len - len(end_indexes_set)
        for i in range(sample_df_len)[slice_begin:slice_end]:
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()
        print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
    holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
    print("sample_df.index:\n", sample_df.index.tolist())
    print("holdout_indexes:\n", holdout_indexes)
    #train_indexes = np.array(sample_df.index)
    holdout_df = sample_df.iloc[holdout_indexes]
    holdout_df.reset_index(inplace=True)
    holdout_df.drop(columns=['index'], inplace=True)
    sample_df_indexes_set = set(sample_df.index)
    train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
    print("train_indexes:\n", train_indexes)
    sample_df = sample_df.iloc[train_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [4]:
earthquake_margin_indexes =[
    5656573,
    50085877,
    104677355,
    138772452,
    187641819,
    218652629,
    245829584,
    307838916,
    338276286,
    375377847,
    419368879,
    461811622,
    495800224,
    528777114,
    585568143,
    621985672
]

In [5]:
earthquakes_length = [earthquake_margin_indexes[i + 1] - earthquake_margin_indexes[i] for i in range(len(earthquake_margin_indexes) - 1)]

In [6]:
earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029,
 36417529]

In [7]:
complete_earthquakes_length = earthquakes_length[:-1]

In [8]:
#complete_earthquaces_length = complete_earthquaces_length[:-1]

In [9]:
complete_earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029]

In [10]:
%time
earthquake_1_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=earthquake_margin_indexes[0],
    nrows=complete_earthquakes_length[0]
)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.15 µs


In [11]:
earthquake_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44429304 entries, 0 to 44429303
Data columns (total 2 columns):
acoustic_data      float32
time_to_failure    float32
dtypes: float32(2)
memory usage: 339.0 MB


%%time
earthquake_1_with_additional_features_df = features_maker(earthquake_1_df)

In [12]:
#np.random.randint?

In [19]:
%%time
earthquake_1_with_additional_features_df, holdout_df = add_features(earthquake_1_df, sample_size=20, holdout_size=4)

sample_indexes.shape: (20,)
just after create sample_df, sample_df.shape[0]: 20
sample_df.shape[0] just before main loop: 20
in main loop, sample_df.shape: (20, 2)
sample_df.shape[0] before assign feature_values_list: 20
len(feature_values_list): 20
in main loop, sample_df.shape: (20, 3)
sample_df.shape[0] before assign feature_values_list: 20
len(feature_values_list): 20
in main loop, sample_df.shape: (20, 4)
sample_df.shape[0] before assign feature_values_list: 20
len(feature_values_list): 20
sample_df.index:
 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
holdout_indexes:
 [6 3 0 9]
train_indexes:
 [1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
Full calculation feature value time (with slicing) 0.008262900511423747 min:
CPU times: user 612 ms, sys: 4 ms, total: 616 ms
Wall time: 613 ms


In [14]:
print(earthquake_1_with_additional_features_df[:10])
print()
print(earthquake_1_with_additional_features_df[-10:])

NameError: name 'earthquake_1_with_additional_features_df' is not defined