In [1]:
import gc
import math
import os
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
%matplotlib inline

In [178]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes.sort()
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True) # There is need map sample_df.index -> sample_indexes
                                                    # Должно быть установелнно соотвествие индексов в sample_df
                                                    # (Где после .reset_index индексы - это
                                                    # все целые числа от 0 до sample_df.shape[0]) и sample_indexes -
                                                    # соответствующие индексы в df которые являються случайной
                                                    # выборкой из чисел о first_index до last_index и длинной 
                                                    # равной sample_df.index.shape[0] которая длинна равно sample_size 
                                                    # Соответсвенно, по видимому, in_window_begin_indexes и
                                                    # in_window_end_indexes надо вычислять как то по другому
    #print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    #begin_smootch_features_value = []
    #end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    #print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        print("\n" * 2)
        print("window_size:", window_size)
        print()
        feature_values_list = list(range(sample_size))
        print("sample_df.index.tolist()[:window_size]:\n", sample_df.index.tolist()[:window_size])
        print("df.index.tolist()[:window_size]:\n", df.index.tolist()[:window_size])

        print("sample_df.index.tolist()[-window_size:]:\n", sample_df.index.tolist()[-window_size:])
        print("df.index.tolist()[-window_size:]:\n", df.index.tolist()[-window_size:])
        
        half_window_size = window_size // 2
        ##>begin_indexes = sample_indexes[:half_window_size]  
        ##>print("begin_indexes:\n", begin_indexes)
        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]
        print("sample_begin_indexes:\n", sample_begin_indexes)
        #full_data_begin_indexes = df.index[:half_window_size].tolist() #df.index[sample_indexes[i]]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        print("full_data_begin_indexes:\n", full_data_begin_indexes)
        
        #in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        min_begin_index = min(full_data_begin_indexes)
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        print("sample_end_indexes:\n", sample_end_indexes)

        #full_data_end_indexes = df.index[-half_window_size:].tolist()
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        print("full_data_end_indexes:", full_data_end_indexes)
        
        #in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        max_end_index = max(full_data_end_indexes)
        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index - half_window_size, max_end_index))
        )
        print("in_window_end_indexes:\n", in_window_end_indexes)
        
        ##>begin_indexes = begin_indexes[begin_indexes <= half_window_size]
        ##>print("2 begin_indexes? :\n", begin_indexes)
        
        ##>end_indexes = sample_indexes[-half_window_size:]
        ##>print("2 end_indexes? :\n", end_indexes)
        
        if in_window_begin_indexes:
            begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                print("i: {}, b_idx {}:".format(i, b_idx))
                print("type(b_idx):", type(b_idx))
                print("df[:window_size]:\n", df[:window_size])
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                feature_values_list[i] = value
        if in_window_end_indexes:
            end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                print("i: {}, e_idx {}:".format(i, e_idx))
                print("type(e_idx):", type(e_idx))
                print("df[-window_size:]:\n", df[-window_size:])
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                value = df.iloc[e_idx]['acoustic_data']
                value = value - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                feature_values_list[-i] = value
        #print("in main loop, sample_df.shape:", sample_df.shape)
        slice_begin = len(begin_indexes_set)
        slice_end = sample_df_len - len(end_indexes_set)
        for i in range(sample_df_len)[slice_begin:slice_end]:
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()
        #print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        #print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
        
    #sample_df_indexes_set = set(sample_df.index)
    train_indexes = sample_df_indexes_set
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        #print("sample_df.index:\n", sample_df.index.tolist())
        #print("holdout_indexes:\n", holdout_indexes)
        #train_indexes = np.array(sample_df.index)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
    
        #train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        #print("train_indexes:\n", train_indexes)
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [None]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes.sort()
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True) # There is need map sample_df.index -> sample_indexes
                                                    # Должно быть установелнно соотвествие индексов в sample_df
                                                    # (Где после .reset_index индексы - это
                                                    # все целые числа от 0 до sample_df.shape[0]) и sample_indexes -
                                                    # соответствующие индексы в df которые являються случайной
                                                    # выборкой из чисел о first_index до last_index и длинной 
                                                    # равной sample_df.index.shape[0] которая длинна равно sample_size 
                                                    # Соответсвенно, по видимому, in_window_begin_indexes и
                                                    # in_window_end_indexes надо вычислять как то по другому
    #print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    #begin_smootch_features_value = []
    #end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    #print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        print("\n" * 2)
        print("window_size:", window_size)
        print()
        feature_values_list = list(range(sample_size))
        print("sample_df.index.tolist()[:window_size]:\n", sample_df.index.tolist()[:window_size])
        print("df.index.tolist()[:window_size]:\n", df.index.tolist()[:window_size])

        print("sample_df.index.tolist()[-window_size:]:\n", sample_df.index.tolist()[-window_size:])
        print("df.index.tolist()[-window_size:]:\n", df.index.tolist()[-window_size:])
        
        half_window_size = window_size // 2
        ##>begin_indexes = sample_indexes[:half_window_size]  
        ##>print("begin_indexes:\n", begin_indexes)
        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]
        print("sample_begin_indexes:\n", sample_begin_indexes)
        #full_data_begin_indexes = df.index[:half_window_size].tolist() #df.index[sample_indexes[i]]
        full_data_begin_indexes = df.index[sample_begin_indexes].tolist()
        print("full_data_begin_indexes:\n", full_data_begin_indexes)
        
        in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        print("sample_end_indexes:\n", sample_end_indexes)

        #full_data_end_indexes = df.index[-half_window_size:].tolist()
        full_data_end_indexes = df.index[sample_end_indexes].tolist()
        print("full_data_end_indexes:", full_data_end_indexes)
        
        in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        print("in_window_end_indexes:\n", in_window_end_indexes)
        
        ##>begin_indexes = begin_indexes[begin_indexes <= half_window_size]
        ##>print("2 begin_indexes? :\n", begin_indexes)
        
        ##>end_indexes = sample_indexes[-half_window_size:]
        ##>print("2 end_indexes? :\n", end_indexes)
        
        if in_window_begin_indexes:
            begin_indexes_set.union(set(in_window_begin_indexes))
            for i, b_idx in enumerate(in_window_begin_indexes):
                print("i: {}, b_idx {}:".format(i, b_idx))
                print("type(b_idx):", type(b_idx))
                print("df[:window_size]:\n", df[:window_size])
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                feature_values_list[i] = value
        if in_window_end_indexes:
            end_indexes_set.union(set(in_window_end_indexes))
            for i, e_idx in enumerate(in_window_end_indexes):
                print("i: {}, e_idx {}:".format(i, e_idx))
                print("type(e_idx):", type(e_idx))
                print("df[-window_size:]:\n", df[-window_size:])
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                value = df.iloc[e_idx]['acoustic_data']
                value = value - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                feature_values_list[-i] = value
        #print("in main loop, sample_df.shape:", sample_df.shape)
        slice_begin = len(begin_indexes_set)
        slice_end = sample_df_len - len(end_indexes_set)
        for i in range(sample_df_len)[slice_begin:slice_end]:
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()
        #print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        #print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
        
    #sample_df_indexes_set = set(sample_df.index)
    #train_indexes = sample_df_indexes_set
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        #print("sample_df.index:\n", sample_df.index.tolist())
        #print("holdout_indexes:\n", holdout_indexes)
        #train_indexes = np.array(sample_df.index)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
    
        #train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        #print("train_indexes:\n", train_indexes)
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [4]:
earthquake_margin_indexes =[
    5656573,
    50085877,
    104677355,
    138772452,
    187641819,
    218652629,
    245829584,
    307838916,
    338276286,
    375377847,
    419368879,
    461811622,
    495800224,
    528777114,
    585568143,
    621985672
]

In [5]:
earthquakes_length = [earthquake_margin_indexes[i + 1] - earthquake_margin_indexes[i] for i in range(len(earthquake_margin_indexes) - 1)]

In [6]:
earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029,
 36417529]

In [7]:
complete_earthquakes_length = earthquakes_length[:-1]

In [8]:
#complete_earthquaces_length = complete_earthquaces_length[:-1]

In [9]:
complete_earthquakes_length

[44429304,
 54591478,
 34095097,
 48869367,
 31010810,
 27176955,
 62009332,
 30437370,
 37101561,
 43991032,
 42442743,
 33988602,
 32976890,
 56791029]

In [10]:
%time
earthquake_1_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=earthquake_margin_indexes[0],
    nrows=complete_earthquakes_length[0]
)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [11]:
earthquake_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44429304 entries, 0 to 44429303
Data columns (total 2 columns):
acoustic_data      float32
time_to_failure    float32
dtypes: float32(2)
memory usage: 339.0 MB


%%time
earthquake_1_with_additional_features_df = features_maker(earthquake_1_df)

In [12]:
#np.random.randint?

In [122]:
%%time
earthquake_1_with_additional_features_df, holdout_df = add_features(earthquake_1_df, sample_size=2000000, holdout_size=400000)

Full calculation feature value time (with slicing) 23.202832746505738 min:
CPU times: user 23min 11s, sys: 1.04 s, total: 23min 12s
Wall time: 23min 12s


In [84]:
print(earthquake_1_with_additional_features_df[:10])
print()
print(earthquake_1_with_additional_features_df[-10:])

   acoustic_data  time_to_failure  smootch_mean_ws_3  smootch_mean_ws_5  \
0            5.0        11.540800                6.0               6.00   
1            3.0        11.540800                2.5               4.50   
2            8.0        11.540800                6.0               4.75   
3            7.0        11.540800                7.5               6.25   
4            4.0        11.540800                4.5               4.75   
5            5.0        11.540800                6.0               5.75   
6            2.0        11.540800                1.5               2.50   
7            2.0        11.540799                2.5               4.25   
8            5.0        11.540799                6.5               5.00   
9            3.0        11.540799                5.0               4.25   

   smootch_mean_ws_7  
0           5.833333  
1           4.500000  
2           4.666667  
3           6.000000  
4           3.833333  
5           6.000000  
6           3

In [123]:
X_all = earthquake_1_with_additional_features_df[earthquake_1_with_additional_features_df.columns.drop('time_to_failure')]

In [124]:
y_all = earthquake_1_with_additional_features_df['time_to_failure']

In [125]:
X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.2, random_state=0)

In [136]:
params = {
    #'num_leaves': 51,
    'num_leaves': 27,
    #'min_data_in_leaf': 10,
    'min_data_in_leaf': 8,
    'objective':'regression',
    #'max_depth': -1,
    'max_depth': 5,
    'learning_rate': 0.001,
    'boosting': 'gbdt',
    #'feature_fraction': 0.91,
    #'bagging_freq': 1,
    #'bagging_fraction': 0.91,
    #'bagging_seed': 42,
    'metric': 'mae',
    #'lambda_l1': 0.1,
    'verbosity': -1,
    'nthread': 10,
    'random_state': 42
}

In [138]:
model = lgb.LGBMRegressor(**params, n_estimators = 20000, n_jobs = 10, num_iterations=100000)

In [139]:
%%time
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='mae',
    verbose=1000,
    early_stopping_rounds=20000
)



Training until validation scores don't improve for 20000 rounds.
[1000]	training's l1: 2.85464	valid_1's l1: 2.85243
[2000]	training's l1: 2.84816	valid_1's l1: 2.84615
[3000]	training's l1: 2.84633	valid_1's l1: 2.84449
[4000]	training's l1: 2.84554	valid_1's l1: 2.84388
[5000]	training's l1: 2.84511	valid_1's l1: 2.84362
[6000]	training's l1: 2.84486	valid_1's l1: 2.8435
[7000]	training's l1: 2.84468	valid_1's l1: 2.84344
[8000]	training's l1: 2.84451	valid_1's l1: 2.84338
[9000]	training's l1: 2.84434	valid_1's l1: 2.84333
[10000]	training's l1: 2.84417	valid_1's l1: 2.84329
[11000]	training's l1: 2.84401	valid_1's l1: 2.84325
[12000]	training's l1: 2.84386	valid_1's l1: 2.84322
[13000]	training's l1: 2.84371	valid_1's l1: 2.8432
[14000]	training's l1: 2.84357	valid_1's l1: 2.84318
[15000]	training's l1: 2.84344	valid_1's l1: 2.84317
[16000]	training's l1: 2.84331	valid_1's l1: 2.84315
[17000]	training's l1: 2.84318	valid_1's l1: 2.84314
[18000]	training's l1: 2.84306	valid_1's l1: 

LGBMRegressor(boosting='gbdt', boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, learning_rate=0.001, max_depth=5,
       metric='mae', min_child_samples=20, min_child_weight=0.001,
       min_data_in_leaf=8, min_split_gain=0.0, n_estimators=20000,
       n_jobs=10, nthread=10, num_iterations=100000, num_leaves=27,
       objective='regression', random_state=42, reg_alpha=0.0,
       reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0, verbosity=-1)

In [127]:
X_test = holdout_df[holdout_df.columns.drop('time_to_failure')]

In [128]:
y_test = holdout_df['time_to_failure']

In [134]:
y_predict = model.predict(X_test)

In [135]:
mean_absolute_error(y_test, y_predict)

2.8447993698125047

In [None]:
2.843063059848607, 2.8431104124307693, 2.8429766961521667, 2.842999310177736, 2.842972294755831, 2.843032369729009

In [150]:
def train_models(
        earthquake_margin_indexes,
        complete_earthquakes_length,
        params,
        sample_size=None,
        holdout_size=None,
        not_seen_data_begin=1,
        not_seen_data_end=5656572 #5656569
    ):
    not_seen_data_df = pd.read_csv(
        '../input/train/train.csv',
        #nrows=100000000,
        names=['acoustic_data', 'time_to_failure'],
        dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
        skiprows=not_seen_data_begin,
        nrows=not_seen_data_end
    )
    
    not_seen_data_df, _ = add_features(
        not_seen_data_df,
        sample_size=not_seen_data_df.shape[0],
        holdout_size=0
    )
    for i in range(len(complete_earthquakes_length)):
        earthquake_df = pd.read_csv(
                '../input/train/train.csv',
                #nrows=100000000,
                names=['acoustic_data', 'time_to_failure'],
                dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
                skiprows=earthquake_margin_indexes[i],
                nrows=complete_earthquakes_length[i]
            )
        if not sample_size:
            sample_size = complete_earthquakes_length[i] // 100
        if not holdout_size:
            holdout_size = complete_earthquakes_length[i] // 500
        earthquake_add_features_df, holdout_add_features_df = add_features(
                earthquake_df,
                sample_size=sample_size,
                holdout_size=holdout_size
            )
        X_all = earthquake_add_features_df[earthquake_add_features_df.columns.drop('time_to_failure')]
        y_all = earthquake_add_features_df['time_to_failure']

        X_train, X_valid, y_train, y_valid = train_test_split(X_all, y_all, test_size=0.2, random_state=0)

        model = lgb.LGBMRegressor(**params, n_estimators = 20000, n_jobs = 10, num_iterations=40000)
        model.fit(
                X_train,
                y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                eval_metric='mae',
                verbose=1000,
                early_stopping_rounds=4000
            )
        X_holdout = holdout_df[holdout_df.columns.drop('time_to_failure')]
        y_holdout = holdout_df['time_to_failure']
        
        y_holdout_predict = model.predict(X_holdout)
        print("earthquake {} mae {}".format(i, mean_absolute_error(y_holdout, y_holdout_predict)))

        not_seen_data_predict = model.predict(not_seen_data_df)
        not_seen_data_predict_df = pd.DataFrame({'time_to_failure': not_seen_data_predict})
        not_seen_data_predict_df.to_csv('not_seend_data_earthquake_{}_model_predict.csv', index=False)

        model.save_model('earthquake_{}_model.txt'.format(i))

    return

In [151]:
%%time
train_models(earthquake_margin_indexes, complete_earthquakes_length, params)

KeyError: 5656569

In [155]:
%%time
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

CPU times: user 1.34 s, sys: 56 ms, total: 1.39 s
Wall time: 1.39 s


In [156]:
not_seen_data_df.shape

(5656572, 2)

In [179]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df,
    sample_size=not_seen_data_df.shape[0],
    holdout_size=0
)




window_size: 3

sample_df.index.tolist()[:window_size]:
 [0, 1, 2]
df.index.tolist()[:window_size]:
 [0, 1, 2]
sample_df.index.tolist()[-window_size:]:
 [5656569, 5656570, 5656571]
df.index.tolist()[-window_size:]:
 [5656569, 5656570, 5656571]
sample_begin_indexes:
 [0]
full_data_begin_indexes:
 {0}


AttributeError: 'set' object has no attribute 'min'

In [None]:
begin_indexes:
 [1]
sample_begin_indexes:
 [1]
full_data_begin_indexes:
 [0]
in_window_begin_indexes:
 []
sample_end_indexes:
 [5656570]
full_data_end_indexes: [5656571]
in_window_end_indexes:
 []
2 begin_indexes? :
 [1]
2 end_indexes? :
 [5656570]
begin_indexes:
 [1 2]
sample_begin_indexes:
 [1 2]
full_data_begin_indexes:
 [0, 1]
in_window_begin_indexes:
 [1]
sample_end_indexes:
 [5656569 5656570]
full_data_end_indexes: [5656570, 5656571]
in_window_end_indexes:
 [5656570]
2 begin_indexes? :
 [1 2]
2 end_indexes? :
 [5656569 5656570]
i: 0, b_idx 1:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<timed exec> in <module>()

<ipython-input-159-69232aa59e5d> in add_features(df, first_index, last_index, sample_size, holdout_size, smootch_windows_size)
     71                 print("i: {}, b_idx {}:".format(i, b_idx))
     72                 #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
---> 73                 value = df[b_idx]['acoustic_data']
     74                 value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
     75                 feature_values_list[i] = value

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 1

In [None]:



window_size: 3

begin_indexes:
 [0]
sample_begin_indexes:
 [0]
full_data_begin_indexes:
 [0]
in_window_begin_indexes:
 [0]
sample_end_indexes:
 [5656570]
full_data_end_indexes: [5656571]
in_window_end_indexes:
 []
2 begin_indexes? :
 [0]
2 end_indexes? :
 [5656570]
i: 0, b_idx 0:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3077             try:
-> 3078                 return self._engine.get_loc(key)
   3079             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<timed exec> in <module>()

<ipython-input-161-35773f4b493b> in add_features(df, first_index, last_index, sample_size, holdout_size, smootch_windows_size)
     74                 print("i: {}, b_idx {}:".format(i, b_idx))
     75                 #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
---> 76                 value = df[b_idx]['acoustic_data']
     77                 value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
     78                 feature_values_list[i] = value

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2487         res = cache.get(item)
   2488         if res is None:
-> 2489             values = self._data.get(item)
   2490             res = self._box_item_values(item, values)
   2491             cache[item] = res

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   4113 
   4114             if not isna(item):
-> 4115                 loc = self.items.get_loc(item)
   4116             else:
   4117                 indexer = np.arange(len(self.items))[isna(self.items)]

~/miniconda3/envs/DS-New/lib/python3.6/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3078                 return self._engine.get_loc(key)
   3079             except KeyError:
-> 3080                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   3081 
   3082         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0
