In [27]:
import gc
import math
import os
import random
import pathlib
import time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, train_test_split

import lightgbm as lgb
import xgboost as xgb

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint

In [2]:
%matplotlib inline

In [20]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    sample_indexes = np.random.randint(first_index, last_index, sample_size + 1)
    sample_indexes.sort()
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True)
    
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()

    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        feature_values_list = list(range(sample_size))
        
        half_window_size = window_size // 2
        
        sample_begin_indexes = sample_indexes[:half_window_size]

        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        
        min_begin_index = min(full_data_begin_indexes)
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        
        sample_end_indexes = sample_indexes[-half_window_size:]

        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        
        max_end_index = max(full_data_end_indexes)
        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index - half_window_size, max_end_index))
        )
        
        if in_window_begin_indexes:
            begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                value = df.iloc[b_idx]['acoustic_data']
                value = value - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                feature_values_list[i] = value
        if in_window_end_indexes:
            end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                value = df.iloc[e_idx]['acoustic_data']
                value = value - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                feature_values_list[-i] = value
        #slice_begin = len(begin_indexes_set)
        #slice_end = sample_df_len - len(end_indexes_set)
        #for i in range(sample_df_len)[slice_begin:slice_end]:
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        #for i in range(sample_df_len)[slice_begin:slice_end]:
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()

        sample_df[feature_name] = feature_values_list
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [94]:
def add_features(
        df,
        first_index=None,
        last_index=None,
        sample_size=150000,
        holdout_size=50000,
        smootch_windows_size = (3, 5, 7)
    ):
    
    print("df.shape:", df.shape)
    print("df.index[:3]", df.index[:3])
    print("df.index[-3:]", df.index[-3:])
    if first_index == None or last_index == None:
        first_index = df.index.min()
        last_index = df.index.max()
    print("first_index: {}, last_index: {}".format(first_index, last_index))
    #sample_indexes = np.random.randint(first_index, last_index, sample_size)
    sample_indexes = random.sample(range(first_index, last_index + 1), sample_size)
    sample_indexes.sort()
    print("sample_indexes[:3]", sample_indexes[:3])
    print("sample_indexes[-3:]", sample_indexes[-3:])
    print("len(sample_indexes):", len(sample_indexes))
    #print("sample_indexes.shape:", sample_indexes.shape)    
    smootch_feature_names = ['smootch_mean_ws_{}'.format(window_size) for window_size in smootch_windows_size]
    #half_windows_size = [ws // 2 for ws in smootch_windows_size]

    acoustic_data_series = df['acoustic_data']
    
    sample_df = df.iloc[sample_indexes]
    sample_df.reset_index(inplace=True)
    sample_df.drop(columns=['index'], inplace=True) # There is need map sample_df.index -> sample_indexes
                                                    # Должно быть установелнно соотвествие индексов в sample_df
                                                    # (Где после .reset_index индексы - это
                                                    # все целые числа от 0 до sample_df.shape[0]) и sample_indexes -
                                                    # соответствующие индексы в df которые являються случайной
                                                    # выборкой из чисел о first_index до last_index и длинной 
                                                    # равной sample_df.index.shape[0] которая длинна равно sample_size 
                                                    # Соответсвенно, по видимому, in_window_begin_indexes и
                                                    # in_window_end_indexes надо вычислять как то по другому
    #print("just after create sample_df, sample_df.shape[0]:", sample_df.shape[0])
    
    #for feature_name in smootch_feature_names:
    #    sample_df[feature_name] = 0
    #begin_smootch_features_value = []
    #end_smootch_features_value = []
    
    #sample_indexes_set = set(sample_indexes)
    begin_indexes_set = set()
    end_indexes_set = set()
    
    start_time = time.time()
    #print("sample_df.shape[0] just before main loop:", sample_df.shape[0])
    sample_df_len = sample_df.shape[0]
    for window_size, feature_name in zip(smootch_windows_size, smootch_feature_names):
        
        print("\n" * 2)
        print("window_size:", window_size)
        print()
        feature_values_list = list(range(sample_size))
        print("sample_df.index.tolist()[:window_size]:\n", sample_df.index.tolist()[:window_size])
        print("df.index.tolist()[:window_size]:\n", df.index.tolist()[:window_size])

        print("sample_df.index.tolist()[-window_size:]:\n", sample_df.index.tolist()[-window_size:])
        print("df.index.tolist()[-window_size:]:\n", df.index.tolist()[-window_size:])
        
        half_window_size = window_size // 2
        ##>begin_indexes = sample_indexes[:half_window_size]  
        ##>print("begin_indexes:\n", begin_indexes)
        sample_begin_indexes = sample_indexes[:half_window_size] #? sample_df.index[:half_window_size]
        print("sample_begin_indexes:\n", sample_begin_indexes)
        #full_data_begin_indexes = df.index[:half_window_size].tolist() #df.index[sample_indexes[i]]
        full_data_begin_indexes = set(df.index[sample_begin_indexes].tolist())
        print("full_data_begin_indexes:\n", full_data_begin_indexes)
        
        #in_window_begin_indexes = [idx for idx in sample_begin_indexes if idx in full_data_begin_indexes]
        min_begin_index = min(full_data_begin_indexes)
        print("set(range(min_begin_index, min_begin_index + half_window_size))", set(range(min_begin_index, min_begin_index + half_window_size)))
        in_window_begin_indexes = full_data_begin_indexes.intersection(
            set(range(min_begin_index, min_begin_index + half_window_size))
        )
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        
        sample_end_indexes = sample_indexes[-half_window_size:]
        print("sample_end_indexes:\n", sample_end_indexes)

        #full_data_end_indexes = df.index[-half_window_size:].tolist()
        full_data_end_indexes = set(df.index[sample_end_indexes].tolist())
        print("full_data_end_indexes:", full_data_end_indexes)
        
        #in_window_end_indexes = [idx for idx in sample_end_indexes if idx in full_data_end_indexes]
        max_end_index = max(full_data_end_indexes)
        print("max_end_index:", max_end_index)
        print("max_end_index - half_window_size:", max_end_index - half_window_size)
        print("set(range(max_end_index - half_window_size, max_end_index + 1)):\n", set(range(max_end_index + 1 - half_window_size, max_end_index + 1)))
        in_window_end_indexes = full_data_end_indexes.intersection(
            set(range(max_end_index + 1 - half_window_size, max_end_index + 1))
        )
        print("in_window_end_indexes:\n", in_window_end_indexes)
        
        ##>begin_indexes = begin_indexes[begin_indexes <= half_window_size]
        ##>print("2 begin_indexes? :\n", begin_indexes)
        
        ##>end_indexes = sample_indexes[-half_window_size:]
        ##>print("2 end_indexes? :\n", end_indexes)
        print("in_window_begin_indexes:\n", in_window_begin_indexes)
        print("len(in_window_begin_indexes)", len(in_window_begin_indexes))
        print()
        if in_window_begin_indexes:
            begin_indexes_set = begin_indexes_set.union(in_window_begin_indexes)
            for i, b_idx in enumerate(sorted(tuple(in_window_begin_indexes))):
                print("i: {}, b_idx {}:".format(i, b_idx))
                print("type(b_idx):", type(b_idx))
                print("df[:window_size]:\n", df[:window_size])
                #sample_df[feature_name].iloc[b_idx] = df[b_idx]['acoustic_data'] - df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                value = df.iloc[b_idx]['acoustic_data']
                print("begin 1 value:", value)
                print("df.iloc[first_index:first_index + window_size]['acoustic_data']:", df.iloc[first_index:first_index + window_size]['acoustic_data'])
                temp = df.iloc[first_index:first_index + window_size]['acoustic_data'].mean()
                print("begin temp:", temp)
                value = value - temp
                print("begin 2 value:", value)
                feature_values_list[i] = value
        print("in_window_end_indexes:\n", in_window_end_indexes)
        print("len(in_window_end_indexes)", len(in_window_end_indexes))
        print()
        if in_window_end_indexes:
            end_indexes_set = end_indexes_set.union(in_window_end_indexes)
            for i, e_idx in enumerate(sorted(tuple(in_window_end_indexes))):
                print("i: {}, e_idx {}:".format(i, e_idx))
                print("type(e_idx):", type(e_idx))
                print("df[-window_size:]:\n", df[-window_size:])
                #sample_df[feature_name].iloc[e_idx] = df[e_idx]['acoustic_data'] - df.iloc[last_index - window_size:]['acoustic_data'].mean()
                value = df.iloc[e_idx]['acoustic_data']
                print("end 1 value:", value)
                print("df.iloc[last_index - window_size:]['acoustic_data']:\n", df.iloc[last_index - window_size:]['acoustic_data'])
                temp = df.iloc[last_index - window_size:]['acoustic_data'].mean()
                print("end temp:", temp)
                value = value - temp
                print("end 2 value:", value)
                feature_values_list[last_index - i] = value
        #print("in main loop, sample_df.shape:", sample_df.shape)
        print("sample_df_len:", sample_df_len)
        print("begin_indexes_set:\n", begin_indexes_set)
        print("len(begin_indexes_set):", len(begin_indexes_set))
        print("begin_indexes_set:\n", end_indexes_set)
        print("len(begin_indexes_set):", len(end_indexes_set))
        first_regular_idx = len(begin_indexes_set)
        last_regular_idx = sample_df_len - len(end_indexes_set)
        print("first_regular_idx:", first_regular_idx)
        print("last_regular_idx:", last_regular_idx)
        #for i in range(sample_df_len)[slice_begin:slice_end]:
        for i in range(first_regular_idx, last_regular_idx):
            sample_idx = sample_indexes[i]
            feature_values_list[i] = acoustic_data_series.iloc[sample_idx - half_window_size:sample_idx + half_window_size].mean()
        #print("sample_df.shape[0] before assign feature_values_list:", sample_df.shape[0])
        #print("len(feature_values_list):", len(feature_values_list))
        sample_df[feature_name] = feature_values_list
        
    #sample_df_indexes_set = set(sample_df.index)
    #train_indexes = sample_df_indexes_set
    holdout_df = None
    if holdout_size > 0:
        holdout_indexes = np.random.randint(0, sample_df.shape[0], holdout_size)
        #print("sample_df.index:\n", sample_df.index.tolist())
        #print("holdout_indexes:\n", holdout_indexes)
        #train_indexes = np.array(sample_df.index)
        holdout_df = sample_df.iloc[holdout_indexes]
        holdout_df.reset_index(inplace=True)
        holdout_df.drop(columns=['index'], inplace=True)
    
        #train_indexes = sorted(tuple(sample_df_indexes_set.difference(set(holdout_indexes))))
        train_indexes = sorted(tuple(set(sample_df.index).difference(set(holdout_indexes))))
        #print("train_indexes:\n", train_indexes)
        sample_df = sample_df.iloc[train_indexes]
        sample_df.reset_index(inplace=True)
        sample_df.drop(columns=['index'], inplace=True)
    print("Full calculation feature value time (with slicing) {} min:".format((time.time() - start_time) / 60))
    return sample_df, holdout_df

In [6]:
not_seen_data_df = pd.read_csv(
    '../input/train/train.csv',
    #nrows=100000000,
    names=['acoustic_data', 'time_to_failure'],
    dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32},
    skiprows=1,
    nrows=5656572
)

In [95]:
%%time
featured_not_seen_data_df, _ = add_features(
    not_seen_data_df[:1000],
    sample_size=not_seen_data_df[:1000].shape[0],
    holdout_size=0
)

df.shape: (1000, 2)
df.index[:3] RangeIndex(start=0, stop=3, step=1)
df.index[-3:] RangeIndex(start=997, stop=1000, step=1)
first_index: 0, last_index: 999
sample_indexes[:3] [0, 1, 2]
sample_indexes[-3:] [997, 998, 999]
len(sample_indexes): 1000



window_size: 3

sample_df.index.tolist()[:window_size]:
 [0, 1, 2]
df.index.tolist()[:window_size]:
 [0, 1, 2]
sample_df.index.tolist()[-window_size:]:
 [997, 998, 999]
df.index.tolist()[-window_size:]:
 [997, 998, 999]
sample_begin_indexes:
 [0]
full_data_begin_indexes:
 {0}
set(range(min_begin_index, min_begin_index + half_window_size)) {0}
in_window_begin_indexes:
 {0}
sample_end_indexes:
 [999]
full_data_end_indexes: {999}
max_end_index: 999
max_end_index - half_window_size: 998
set(range(max_end_index - half_window_size, max_end_index + 1)):
 {999}
in_window_end_indexes:
 {999}
in_window_begin_indexes:
 {0}
len(in_window_begin_indexes) 1

i: 0, b_idx 0:
type(b_idx): <class 'int'>
df[:window_size]:
    acoustic_data  time_to_failure
0  

Full calculation feature value time (with slicing) 0.013333674271901448 min:
CPU times: user 824 ms, sys: 16 ms, total: 840 ms
Wall time: 807 ms


In [84]:
featured_not_seen_data_df.shape

(1000, 5)

In [85]:
featured_not_seen_data_df.head(10)

Unnamed: 0,acoustic_data,time_to_failure,smootch_mean_ws_3,smootch_mean_ws_5,smootch_mean_ws_7
0,12.0,1.4691,3.333333,4.2,4.0
1,6.0,1.4691,9.0,-1.8,-2.0
2,8.0,1.4691,7.0,7.75,0.0
3,5.0,1.4691,6.5,6.75,7.833333
4,8.0,1.4691,6.5,7.25,7.333333
5,8.0,1.4691,8.0,7.5,7.5
6,9.0,1.4691,8.5,8.0,5.333333
7,7.0,1.4691,8.0,4.75,5.0
8,-5.0,1.4691,1.0,3.5,4.5
9,3.0,1.4691,-1.0,2.5,3.5


In [86]:
featured_not_seen_data_df.tail(20)

Unnamed: 0,acoustic_data,time_to_failure,smootch_mean_ws_3,smootch_mean_ws_5,smootch_mean_ws_7
980,3.0,1.469099,2.5,1.25,1.5
981,-4.0,1.469099,-0.5,0.5,0.666667
982,1.0,1.469099,-1.5,-0.5,-0.166667
983,-2.0,1.469099,-0.5,-1.5,0.0
984,-1.0,1.469099,-1.5,0.25,0.333333
985,3.0,1.469099,1.0,1.25,2.0
986,5.0,1.469099,4.0,3.25,2.666667
987,6.0,1.469099,5.5,4.75,4.0
988,5.0,1.469099,5.5,5.5,4.666667
989,6.0,1.469099,5.5,5.0,5.0


In [59]:
not_seen_data_df[:1000].shape

(1000, 2)

In [60]:
not_seen_data_df[:1000].index.max()

999

In [26]:
np.random.

In [30]:
random.

In [36]:
np.random.rand?

In [37]:
np.random.rand(20)

array([0.94842548, 0.15565242, 0.13225282, 0.44878307, 0.79942495,
       0.46168806, 0.16916164, 0.35274032, 0.16233877, 0.06494294,
       0.4119373 , 0.8498234 , 0.89188705, 0.86023744, 0.99186619,
       0.57907303, 0.92444036, 0.80596842, 0.35860679, 0.09529798])

In [42]:
random.sample?

In [39]:
np.random.randint?

In [49]:
random.sample(range(10), 10)

[3, 8, 0, 5, 4, 2, 7, 6, 9, 1]

In [56]:
np.random.sample?

In [87]:
temp_df = not_seen_data_df[:1000]

In [91]:
temp_df[-7:]

Unnamed: 0,acoustic_data,time_to_failure
993,6.0,1.469099
994,7.0,1.469099
995,8.0,1.469099
996,9.0,1.469099
997,7.0,1.469099
998,8.0,1.469099
999,5.0,1.469099
