In [27]:
import gc
import math
import os
import random
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

In [2]:
%matplotlib inline

In [106]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    print("df.shape:", df.shape)
    print("df.index[:3]", df.index[:3])
    print("df.index[-3:]", df.index[-3:])
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    print("first_index: {}, last_index: {}".format(first_index, last_index))
    #sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes = random.sample(range(first_index, last_index + 1), sample_size)
    sample_indexes.sort()
    print("sample_indexes[:3]", sample_indexes[:3])
    print("sample_indexes[-3:]", sample_indexes[-3:])
    print("len(sample_indexes):", len(sample_indexes))
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True) # There is need map sample_df.index -> sample_indexes
                                                    # Должно быть установелнно соотвествие индексов в sample_df
                                                    # (Где после .reset_index индексы - это
                                                    # все целые числа от 0 до sample_df.shape[0]) и sample_indexes -
                                                    # соответствующие индексы в df которые являються случайной
                                                    # выборкой из чисел о first_index до last_index и длинной 
                                                    # равной sample_df.index.shape[0] которая длинна равно sample_size 
                                                    # Соответсвенно, по видимому, in_window_begin_indexes и
                                                    # in_window_end_indexes надо вычислять как то по другому
    #print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    #begin_smootch_features_value = []
    #end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    #print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        print("\n" * 2)
        print("window_size:", window_size)
        print()
        feature_values_list = list(range(sample_size))
        print("sample_df.index.tolist()[:window_size]:\n", sample_df.index.tolist()[:window_size])
        print("df.index.tolist()[:window_size]:\n", df.index.tolist()[:window_size])

        print("sample_df.index.tolist()[-window_size:]:\n", sample_df.index.tolist()[-window_size:])
        print("df.index.tolist()[-window_size:]:\n", df.index.tolist()[-window_size:])
        
        half_window_size = window_size // 2
        ##>begin_indexes = sample_indexes[:half_window_size]  
        ##>print("begin_indexes:\n", begin_indexes)
        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]
        print("sample_begin_indexes:\n", sample_begin_indexes)
        #full_data_begin_indexes = df.index[:half_window_size].tolist() #df.index[sample_indexes[i]]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        print("full_data_begin_indexes:\n", full_data_begin_indexes)
        
        #in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        min_begin_index = min(full_data_begin_indexes)
        print("set(range(min_begin_index, min_begin_index + half_window_size))", set(range(min_begin_index, min_begin_index + half_window_size)))
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        print("sample_end_indexes:\n", sample_end_indexes)

        #full_data_end_indexes = df.index[-half_window_size:].tolist()
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        print("full_data_end_indexes:", full_data_end_indexes)
        
        #in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        max_end_index = max(full_data_end_indexes)
        print("max_end_index:", max_end_index)
        print("max_end_index - half_window_size:", max_end_index - half_window_size)
        print("set(range(max_end_index - half_window_size, max_end_index + 1)):\n", set(range(max_end_index + 1 - half_window_size, max_end_index + 1)))
        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index + 1 - half_window_size, max_end_index + 1))
        )
        print("in_window_end_indexes:\n", in_window_end_indexes)
        
        ##>begin_indexes = begin_indexes[begin_indexes <= half_window_size]
        ##>print("2 begin_indexes? :\n", begin_indexes)
        
        ##>end_indexes = sample_indexes[-half_window_size:]
        ##>print("2 end_indexes? :\n", end_indexes)
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        print("len(in_window_begin_indexes)", len(in_window_begin_indexes))
        print()
        if in_window_begin_indexes:
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                print("i: {}, b_idx {}:".format(i, b_idx))
                print("type(b_idx):", type(b_idx))
                print("df[:window_size]:\n", df[:window_size])
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                print("begin 1 value:", value)
                print("df.iloc[first_index:first_index + window_size]['acoustic_data']:", df.iloc[first_index:first_index + window_size]['acoustic_data'])
                temp = df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                print("begin temp:", temp)
                value = value - temp
                print("begin 2 value:", value)
                feature_values_list[i] = value
        print("in_window_end_indexes:\n", in_window_end_indexes)
        print("len(in_window_end_indexes)", len(in_window_end_indexes))
        print()
        if in_window_end_indexes:
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                print("i: {}, e_idx {}:".format(i, e_idx))
                print("type(e_idx):", type(e_idx))
                print("df[-window_size:]:\n", df[-window_size:])
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                value = df.iloc[e_idx]['acoustic_data']
                print("end 1 value:", value)
                print("df.iloc[last_index + 1 - window_size:]['acoustic_data']:\n", df.iloc[last_index + 1 - window_size:]['acoustic_data'])
                temp = df.iloc[last_index + 1 - window_size:]['acoustic_data'].mean()
                print("end temp:", temp)
                value = value - temp
                print("end 2 value:", value)
                feature_values_list[last_index - i] = value
        #print("in main loop, sample_df.shape:", sample_df.shape)
        print("sample_df_len:", sample_df_len)
        print("begin_indexes_set:\n", begin_indexes_set)
        print("len(begin_indexes_set):", len(begin_indexes_set))
        print("begin_indexes_set:\n", end_indexes_set)
        print("len(begin_indexes_set):", len(end_indexes_set))
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        print("first_regular_idx:", first_regular_idx)
        print("last_regular_idx:", last_regular_idx)
        #for i in range(sample_df_len)[slice_begin:slice_end]:
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[
                sample_idx - half_window_size:sample_idx + half_window_size
            ].mean()
        #print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        #print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
        
    #sample_df_indexes_set = set(sample_df.index)
    #train_indexes = sample_df_indexes_set
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        #print("sample_df.index:\n", sample_df.index.tolist())
        #print("holdout_indexes:\n", holdout_indexes)
        #train_indexes = np.array(sample_df.index)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
    
        #train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        #print("train_indexes:\n", train_indexes)
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [112]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    print("df.shape:", df.shape)
    print("df.index[:3]", df.index[:3])
    print("df.index[-3:]", df.index[-3:])
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    print("first_index: {}, last_index: {}".format(first_index, last_index))
    #sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes = random.sample(range(first_index, last_index + 1), sample_size)
    sample_indexes.sort()
    print("sample_indexes[:3]", sample_indexes[:3])
    print("sample_indexes[-3:]", sample_indexes[-3:])
    print("len(sample_indexes):", len(sample_indexes))
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True) # There is need map sample_df.index -> sample_indexes
                                                    # Должно быть установелнно соотвествие индексов в sample_df
                                                    # (Где после .reset_index индексы - это
                                                    # все целые числа от 0 до sample_df.shape[0]) и sample_indexes -
                                                    # соответствующие индексы в df которые являються случайной
                                                    # выборкой из чисел о first_index до last_index и длинной 
                                                    # равной sample_df.index.shape[0] которая длинна равно sample_size 
                                                    # Соответсвенно, по видимому, in_window_begin_indexes и
                                                    # in_window_end_indexes надо вычислять как то по другому
    #print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    #begin_smootch_features_value = []
    #end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    #print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        print("\n" * 2)
        print("window_size:", window_size)
        print()
        feature_values_list = list(range(sample_size))

        half_window_size = window_size // 2

        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]

        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())

        min_begin_index = min(full_data_begin_indexes)
        
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]

        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        
        max_end_index = max(full_data_end_indexes)
        print("max_end_index:", max_end_index)
        print("max_end_index - half_window_size:", max_end_index - half_window_size)

        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index + 1 - half_window_size, max_end_index + 1))
        )
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        print("len(in_window_begin_indexes)", len(in_window_begin_indexes))
        print("in_window_end_indexes:\n", in_window_end_indexes)
        print("len(in_window_end_indexes):", len(in_window_end_indexes))

        print()
        if in_window_begin_indexes:
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                print("i: {}, b_idx {}:".format(i, b_idx))
                print("type(b_idx):", type(b_idx))
                print("df[:window_size]:\n", df[:window_size])
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                print("begin 1 value:", value)
                print("df.iloc[first_index:first_index + window_size]['acoustic_data']:\n", df.iloc[first_index:first_index + window_size]['acoustic_data'])
                temp = df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                print("begin temp:", temp)
                value = value - temp
                print("begin 2 value:", value)
                feature_values_list[i] = value
        if in_window_end_indexes:
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                print("i: {}, e_idx {}:".format(i, e_idx))
                print("type(e_idx):", type(e_idx))
                print("df[-window_size:]:\n", df[-window_size:])
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                value = df.iloc[e_idx]['acoustic_data']
                print("end 1 value:", value)
                print("df.iloc[last_index + 1 - window_size:]['acoustic_data']:\n", df.iloc[last_index + 1 - window_size:]['acoustic_data'])
                temp = df.iloc[last_index + 1 - window_size:]['acoustic_data'].mean()
                print("end temp:", temp)
                value = value - temp
                print("end 2 value:", value)
                feature_values_list[last_index - i] = value
        #print("in main loop, sample_df.shape:", sample_df.shape)
        print("sample_df_len:", sample_df_len)
        print("begin_indexes_set:\n", begin_indexes_set)
        print("len(begin_indexes_set):", len(begin_indexes_set))
        print("begin_indexes_set:\n", end_indexes_set)
        print("len(begin_indexes_set):", len(end_indexes_set))
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        print("first_regular_idx:", first_regular_idx)
        print("last_regular_idx:", last_regular_idx)
        #for i in range(sample_df_len)[slice_begin:slice_end]:
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[
                sample_idx - half_window_size:sample_idx + half_window_size
            ].mean()
        #print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        #print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
        
    #sample_df_indexes_set = set(sample_df.index)
    #train_indexes = sample_df_indexes_set
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        #print("sample_df.index:\n", sample_df.index.tolist())
        #print("holdout_indexes:\n", holdout_indexes)
        #train_indexes = np.array(sample_df.index)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
    
        #train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        #print("train_indexes:\n", train_indexes)
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [230]:
def add_features(
        df,
        input_first_index=None,
        input_last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    if input_first_index == None or input_last_index == None:
        input_first_index = df.index.min()
        input_last_index = df.index.max() + 1
        
    #sample_indexes = random.sample(range(input_first_index, input_last_index + 1), sample_size)
    sample_indexes = random.sample(range(input_first_index, input_last_index), sample_size)
    sample_indexes.sort()
    print("sample_indexes[:8]:\n", sample_indexes[:8])
    print("sample_indexes[-8:]:\n", sample_indexes[-8:])
    #sample_first_index = min(sample_indexes)
    #sample_last_index = max(sample_indexes)
    #print("sample_first_index:", sample_first_index)
    #print("sample_last_index:", sample_last_index)
    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    acoustic_data_series = df['acoustic_data']
    full_data_indexes = tuple(acoustic_data_series.index.tolist())
    '''
    print("acoustic_data_series.index[:3]:\n", acoustic_data_series.index[:3].tolist())
    print("acoustic_data_series.index[-3:]:\n", acoustic_data_series.index[-3:].tolist())
    '''
    
    print("acoustic_data_series[:8]:\n", acoustic_data_series[:8])
    print("acoustic_data_series[-8:]:\n", acoustic_data_series[-8:])
    
    '''
    print("df[:3]['acousitc_data']:\n", df[:3]['acoustic_data'])
    print("df[-3:]['acoustic_data']:\n", df[-3:]['acoustic_data'])
    '''
    
    sample_df = df.iloc[sample_indexes]
    '''
    print("before reset sample_df.index[:3]", sample_df.index[:3])
    print("before reset sample_df.index[-3:]", sample_df.index[-3:])
    print("before reset sample_df[:3]", sample_df[:3])
    print("before reset sample_df[-3:]", sample_df[-3:])
    '''
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)
    '''
    print("after reset sample_df.index[:3]", sample_df.index[:3])
    print("after reset sample_df.index[-3:]", sample_df.index[-3:])
    print("after reset sample_df[:3]", sample_df[:3])
    print("after reset sample_df[-3:]", sample_df[-3:])
    '''
    output_first_index = 0
    output_last_index = len(sample_df) - 1
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
   
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        print("\n\nwindow_size: {} feature_name: {}".format(window_size, feature_name))

        feature_values_list = list(range(sample_size))

        half_window_size = window_size // 2

        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        min_full_data_index = min(full_data_indexes)
        #in_window_full_data_begin_indexes = set(range(min_full_data_index, min_full_data_index + half_window_size))
        in_window_full_data_begin_indexes = set(range(input_first_index, input_first_index + half_window_size))
        print("+++ in_window_full_data_begin_indexes:\n", in_window_full_data_begin_indexes)
              
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            #set(range(min_begin_index, min_begin_index + half_window_size))
            #set(range(min_full_data_index, min_full_data_index + half_window_size))
            in_window_full_data_begin_indexes
        )
        print("---in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        max_full_data_index = max(full_data_end_indexes) + 1
        
        print("** max_full_data_index:", max_full_data_index)
        print("** input_last_index:", input_last_index)
        #in_window_full_data_end_indexes = set(range(max_full_data_index - half_window_size, max_full_data_index))
        #in_window_full_data_end_indexes = set(range(input_last_index + 1 - half_window_size, input_last_index + 1))
        in_window_full_data_end_indexes = set(range(input_last_index - half_window_size, input_last_index))
        print("+++ in_window_full_data_end_indexes:\n", in_window_full_data_end_indexes)
        
        in_window_end_indexes = full_data_end_indexes.intersection(
            #set(range(max_end_index + 1 - half_window_size, max_end_index + 1))
            #set(range(max_full_data_index + 1 - half_window_size, max_full_data_index))
            in_window_full_data_end_indexes
        )
        print("---in_window_end_indexes:\n", in_window_end_indexes)
        if in_window_begin_indexes:
            print("\nin_window_begin_indexes")
            print("sorted(tuple(in_window_begin_indexes)):\n", sorted(tuple(in_window_begin_indexes)))
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                print("\ni:", i)
                #value = df.iloc[b_idx]['acoustic_data']
                print("sample_df value:", sample_df.iloc[i]['acoustic_data'])
                #value = acoustic_data_series.iloc[b_idx]
                value = sample_df.iloc[i]['acoustic_data']
                print("1 value:", value)
                #temp = df.iloc[input_first_index:input_first_index + window_size]['acoustic_data'].mean()
                print("acoustic_data_series.iloc[input_first_index:input_first_index + window_size]:\n",
                      acoustic_data_series.iloc[input_first_index:input_first_index + window_size]
                )
                temp = acoustic_data_series.iloc[input_first_index:input_first_index + window_size].mean()
                print("temp:", temp)
                value = value - temp
                print("2 value:", value)
                print("output_first_index + i:", output_first_index + i)
                feature_values_list[output_first_index + i] = value
                
        if in_window_end_indexes:
            print("\nin_window_end_indexes")
            print("sorted(tuple(in_window_end_indexes)):\n", sorted(tuple(in_window_end_indexes)))
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                print("\ni:", i)
                #value = df.iloc[e_idx]['acoustic_data']
                print("sample_df value:", sample_df.iloc[output_last_index - i]['acoustic_data'])
                #value = acoustic_data_series.iloc[e_idx - max_full_data_index]
                value = acoustic_data_series.iloc[e_idx - input_last_index]
                #value = sample_df.iloc[output_last_index - i]['acoustic_data']
                print("1 value:", value)
                #temp = df.iloc[input_last_index + 1 - window_size:]['acoustic_data'].mean()
                print("acoustic_data_series.iloc[input_last_index - window_size:]:\n",
                      #acoustic_data_series.iloc[input_last_index + 1 - window_size:]
                      acoustic_data_series.iloc[input_last_index - window_size:]
                )
                #temp = acoustic_data_series.iloc[input_last_index + 1 - window_size:].mean()
                temp = acoustic_data_series.iloc[input_last_index - window_size:].mean()
                print("temp:", temp)
                value = value - temp
                print("2 value:", value)
                print("output_last_index - i:", output_last_index - i)
                feature_values_list[output_last_index - i] = value
                
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[
                sample_idx - half_window_size:sample_idx + half_window_size
            ].mean()
        sample_df[feature_name] = feature_values_list
        
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [244]:
def add_features(
        df,
        input_first_index=None,
        input_last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    if input_first_index == None or input_last_index == None:
        input_first_index = df.index.min()
        input_last_index = df.index.max() + 1
        
    
    sample_indexes = random.sample(range(input_first_index, input_last_index), sample_size)
    sample_indexes.sort()
    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    acoustic_data_series = df['acoustic_data']
    full_data_indexes = tuple(acoustic_data_series.index.tolist())

    sample_df = df.iloc[sample_indexes]

    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)

    output_first_index = 0
    output_last_index = len(sample_df) - 1
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
   
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):

        feature_values_list = list(range(sample_size))

        half_window_size = window_size // 2

        sample_begin_indexes = sample_indexes[:half_window_size]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        min_full_data_index = min(full_data_indexes)
        
        in_window_full_data_begin_indexes = set(range(input_first_index, input_first_index + half_window_size))              
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            in_window_full_data_begin_indexes
        )
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        max_full_data_index = max(full_data_end_indexes) + 1
        
        in_window_full_data_end_indexes = set(range(input_last_index - half_window_size, input_last_index))        
        in_window_end_indexes = full_data_end_indexes.intersection(
            in_window_full_data_end_indexes
        )
        if in_window_begin_indexes:
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                value = sample_df.iloc[i]['acoustic_data']
                temp = acoustic_data_series.iloc[input_first_index:input_first_index + window_size].mean()
                value = value - temp
                feature_values_list[output_first_index + i] = value
                
        if in_window_end_indexes:
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                value = sample_df.iloc[output_last_index - i]['acoustic_data']
                temp = acoustic_data_series.iloc[input_last_index - window_size:].mean()
                value = value - temp
                feature_values_list[output_last_index - i] = value
                
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[
                sample_idx - half_window_size:sample_idx + half_window_size
            ].mean()
        sample_df[feature_name] = feature_values_list
        
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [6]:
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

In [245]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df[:3000],
    sample_size=not_seen_data_df[:1000].shape[0],
    holdout_size=0
)

Full calculation feature value time (with slicing) 0.015858737627665202 min:
CPU times: user 1.07 s, sys: 0 ns, total: 1.07 s
Wall time: 1.06 s


In [177]:
featured_not_seen_data_df.shape

(1000, 5)

In [181]:
featured_not_seen_data_df['acoustic_data'][:7].values

array([ 5.,  8.,  9.,  7., -5.,  3.,  5.], dtype=float32)

In [246]:
featured_not_seen_data_df.head(10)

Unnamed: 0,acoustic_data,time_to_failure,smootch_mean_ws_3,smootch_mean_ws_5,smootch_mean_ws_7
0,12.0,1.4691,3.333333,4.2,4.0
1,8.0,1.4691,7.0,7.75,0.0
2,9.0,1.4691,8.5,8.0,5.333333
3,-5.0,1.4691,1.0,3.5,4.5
4,3.0,1.4691,-1.0,2.5,3.5
5,5.0,1.4691,4.0,1.25,2.333333
6,2.0,1.4691,3.5,3.0,1.666667
7,2.0,1.4691,2.0,3.0,2.333333
8,3.0,1.4691,2.5,1.5,2.666667
9,2.0,1.4691,3.0,4.25,4.5


In [182]:
featured_not_seen_data_df['acoustic_data'][-7:].values

array([10.,  7.,  3.,  3.,  2., -1., -4.], dtype=float32)

In [247]:
featured_not_seen_data_df.tail(10)

Unnamed: 0,acoustic_data,time_to_failure,smootch_mean_ws_3,smootch_mean_ws_5,smootch_mean_ws_7
990,0.0,1.469097,-4.0,-2.75,-1.333333
991,20.0,1.469097,15.0,14.75,15.0
992,29.0,1.469097,24.5,24.5,23.5
993,-9.0,1.469097,-13.0,-14.25,-11.833333
994,29.0,1.469097,27.0,25.5,21.833334
995,33.0,1.469097,31.0,29.0,25.333334
996,-24.0,1.469097,-19.0,-18.0,-16.333334
997,-25.0,1.469097,-24.5,-21.5,-19.166666
998,-23.0,1.469097,-24.0,-23.0,-20.333334
999,17.0,1.469097,11.0,20.6,26.428572


In [59]:
not_seen_data_df[:1000].shape

(1000, 2)

In [60]:
not_seen_data_df[:1000].index.max()

999

In [26]:
np.random.

In [30]:
random.

In [36]:
np.random.rand?

In [37]:
np.random.rand(20)

array([0.94842548, 0.15565242, 0.13225282, 0.44878307, 0.79942495,
       0.46168806, 0.16916164, 0.35274032, 0.16233877, 0.06494294,
       0.4119373 , 0.8498234 , 0.89188705, 0.86023744, 0.99186619,
       0.57907303, 0.92444036, 0.80596842, 0.35860679, 0.09529798])

In [42]:
random.sample?

In [39]:
np.random.randint?

In [49]:
random.sample(range(10), 10)

[3, 8, 0, 5, 4, 2, 7, 6, 9, 1]

In [56]:
np.random.sample?

In [87]:
temp_df = not_seen_data_df[:1000]

In [91]:
temp_df[-7:]

Unnamed: 0,acoustic_data,time_to_failure
993,6.0,1.469099
994,7.0,1.469099
995,8.0,1.469099
996,9.0,1.469099
997,7.0,1.469099
998,8.0,1.469099
999,5.0,1.469099


In [96]:
first_idx = temp_df.index.min()
last_idx = temp_df.index.max()

In [97]:
print(first_idx, last_idx)

0 999


In [100]:
ws = 5

In [104]:
print(first_idx + ws)
print(last_idx - ws)

5
994


In [101]:
temp_df[first_idx:first_idx + ws]['acoustic_data']

0    12.0
1     6.0
2     8.0
3     5.0
4     8.0
Name: acoustic_data, dtype: float32

In [103]:
temp_df[last_idx - ws:last_idx]['acoustic_data']

994    7.0
995    8.0
996    9.0
997    7.0
998    8.0
Name: acoustic_data, dtype: float32

In [105]:
temp_df[last_idx + 1 - ws:]['acoustic_data']

995    8.0
996    9.0
997    7.0
998    8.0
999    5.0
Name: acoustic_data, dtype: float32

In [None]:
Training until validation scores don't improve for 4000 rounds.
[1000]	training's l1: 2.85332	valid_1's l1: 2.85467
[2000]	training's l1: 2.84694	valid_1's l1: 2.84851
[3000]	training's l1: 2.84517	valid_1's l1: 2.84683
[4000]	training's l1: 2.84444	valid_1's l1: 2.84614
[5000]	training's l1: 2.84409	valid_1's l1: 2.84583
[6000]	training's l1: 2.84391	valid_1's l1: 2.84568
[7000]	training's l1: 2.8438	valid_1's l1: 2.84561
[8000]	training's l1: 2.84371	valid_1's l1: 2.84555
[9000]	training's l1: 2.84364	valid_1's l1: 2.84551
[10000]	training's l1: 2.84357	valid_1's l1: 2.84547
[11000]	training's l1: 2.84351	valid_1's l1: 2.84544
[12000]	training's l1: 2.84345	valid_1's l1: 2.84541
[13000]	training's l1: 2.84339	valid_1's l1: 2.84539
[14000]	training's l1: 2.84334	valid_1's l1: 2.84536
[15000]	training's l1: 2.84329	valid_1's l1: 2.84534
[16000]	training's l1: 2.84323	valid_1's l1: 2.84532
[17000]	training's l1: 2.84319	valid_1's l1: 2.84531
[18000]	training's l1: 2.84314	valid_1's l1: 2.84529
[19000]	training's l1: 2.8431	valid_1's l1: 2.84528
[20000]	training's l1: 2.84306	valid_1's l1: 2.84527
[21000]	training's l1: 2.84302	valid_1's l1: 2.84526
[22000]	training's l1: 2.84298	valid_1's l1: 2.84525
[23000]	training's l1: 2.84294	valid_1's l1: 2.84524
[24000]	training's l1: 2.84291	valid_1's l1: 2.84524
[25000]	training's l1: 2.84287	valid_1's l1: 2.84523
[26000]	training's l1: 2.84284	valid_1's l1: 2.84523
[27000]	training's l1: 2.8428	valid_1's l1: 2.84523
[28000]	training's l1: 2.84277	valid_1's l1: 2.84523
[29000]	training's l1: 2.84274	valid_1's l1: 2.84523
[30000]	training's l1: 2.84271	valid_1's l1: 2.84522
[31000]	training's l1: 2.84267	valid_1's l1: 2.84522
[32000]	training's l1: 2.84264	valid_1's l1: 2.84521
[33000]	training's l1: 2.84261	valid_1's l1: 2.84521
[34000]	training's l1: 2.84258	valid_1's l1: 2.84521
[35000]	training's l1: 2.84256	valid_1's l1: 2.8452
[36000]	training's l1: 2.84253	valid_1's l1: 2.8452
[37000]	training's l1: 2.8425	valid_1's l1: 2.8452
[38000]	training's l1: 2.84247	valid_1's l1: 2.84519
[39000]	training's l1: 2.84244	valid_1's l1: 2.84519
[40000]	training's l1: 2.84241	valid_1's l1: 2.84519
Did not meet early stopping. Best iteration is:
[40000]	training's l1: 2.84241	valid_1's l1: 2.84519
earthquake 0 mae 2.8432238740079154
not seen data mae: 4.975486288323955