# Feature Extraction
***
Here I determine which features I want to use using tsfresh's native feature selection, and then I begin the arduous process. Most of the actual feature extraction was divvied up among other computers.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyspark
import pickle

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_features
import tsfresh
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.feature_extraction.settings import from_columns

In [2]:
plt.rcParams['xtick.labelsize']=15
plt.rcParams['ytick.labelsize']=15
plt.rcParams['figure.figsize'] = (15,9)

In [2]:
rows = 30000000

In [4]:
train1 = pd.read_csv("../input/train.csv", nrows = 30000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()
train1.head(5)

Unnamed: 0,index,acoustic_data,time_to_failure
0,0,12,1.4691
1,1,6,1.4691
2,2,8,1.4691
3,3,5,1.4691
4,4,8,1.4691


In [5]:
train1.describe()

Unnamed: 0,index,acoustic_data,time_to_failure
count,30000000.0,30000000.0,30000000.0
mean,15000000.0,4.462893,6.938125
std,8660254.0,11.21602,3.417466
min,0.0,-4621.0,0.0007954798
25%,7500000.0,2.0,5.696199
50%,15000000.0,4.0,7.644399
75%,22500000.0,7.0,9.5926
max,30000000.0,3252.0,11.5408


In [6]:
rows = 150000
idlist = []
for n in range(1,201):
    idlist += [n for i in range(rows)]
train1['id'] = idlist
train1.columns = ['time', 'acoustic_data', 'time_to_failure', 'id']
cols = train1.columns.tolist()
cols = cols[-1:] + cols[:-1]
train1 = train1[cols]
y = train1['time_to_failure']
X = train1.drop(columns = 'time_to_failure')
target = y[149999::150000]
target.index = range(1,201)

In [7]:
train1.head()

Unnamed: 0,id,time,acoustic_data,time_to_failure
0,1,0,12,1.4691
1,1,1,6,1.4691
2,1,2,8,1.4691
3,1,3,5,1.4691
4,1,4,8,1.4691


In [8]:
y = train1['time_to_failure']
X = train1.drop(columns = 'time_to_failure')

In [12]:
X.head()

Unnamed: 0,id,time,acoustic_data
0,1,0,12
1,1,1,6
2,1,2,8
3,1,3,5
4,1,4,8


In [17]:
extracted_features = extract_features(X, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=EfficientFCParameters())

Feature Extraction: 100%|███████████████████████████████████████████████████████████| 40/40 [2:45:17<00:00, 124.19s/it]


In [19]:
# with open("extract1","wb") as pickle_out:
#     pickle.dump(extracted_features,pickle_out)

In [16]:
# with open("extract1","rb") as pickle_in:
#     extracted_features = pickle.load(pickle_in)

In [17]:
extracted_features.head()

variable,acoustic_data__abs_energy,acoustic_data__absolute_sum_of_changes,"acoustic_data__agg_autocorrelation__f_agg_""mean""__maxlag_40","acoustic_data__agg_autocorrelation__f_agg_""median""__maxlag_40","acoustic_data__agg_autocorrelation__f_agg_""var""__maxlag_40","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,acoustic_data__symmetry_looking__r_0.9,acoustic_data__symmetry_looking__r_0.9500000000000001,acoustic_data__time_reversal_asymmetry_statistic__lag_1,acoustic_data__time_reversal_asymmetry_statistic__lag_2,acoustic_data__time_reversal_asymmetry_statistic__lag_3,acoustic_data__value_count__value_-1,acoustic_data__value_count__value_0,acoustic_data__value_count__value_1,acoustic_data__variance,acoustic_data__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,10247.0,391980.0,-0.019617,-0.029211,0.077173,11.472805,-0.16048,-0.000171,9e-06,15.549423,...,1.0,1.0,0.259257,3.830395,-4.118365,3622.0,5741.0,8406.0,26.02111,1.0
2,31377.0,405226.0,-0.029642,-0.047355,0.095004,10.87406,-0.042025,-6.5e-05,1.3e-05,14.998821,...,1.0,1.0,-6.401732,-0.225979,2.809399,3888.0,5853.0,8557.0,43.412309,1.0
3,13573.0,418888.0,-0.021527,-0.055133,0.093452,10.050543,0.081034,0.000129,1.3e-05,13.210181,...,1.0,1.0,-1.566988,-1.367183,-2.153706,3686.0,5626.0,8242.0,48.544298,1.0
4,-20948.0,405840.0,-0.034455,-0.066201,0.089827,10.815835,-0.018169,-3e-05,1.3e-05,15.019441,...,1.0,1.0,-1.075888,-11.037708,-2.765704,3622.0,5545.0,8370.0,47.91799,1.0
5,10336.0,406869.0,-0.027299,-0.037009,0.072958,10.495975,0.013896,2.3e-05,1.3e-05,13.833931,...,1.0,1.0,-4.243117,-10.559775,-1.223389,3567.0,5443.0,8207.0,53.305855,1.0


In [20]:
impute(extracted_features);

 'acoustic_data__friedrich_coefficients__m_3__r_30__coeff_1'
 'acoustic_data__friedrich_coefficients__m_3__r_30__coeff_2'
 'acoustic_data__friedrich_coefficients__m_3__r_30__coeff_3'
 'acoustic_data__max_langevin_fixed_point__m_3__r_30'] did not have any finite values. Filling with zeros.


variable,acoustic_data__abs_energy,acoustic_data__absolute_sum_of_changes,"acoustic_data__agg_autocorrelation__f_agg_""mean""__maxlag_40","acoustic_data__agg_autocorrelation__f_agg_""median""__maxlag_40","acoustic_data__agg_autocorrelation__f_agg_""var""__maxlag_40","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","acoustic_data__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,acoustic_data__symmetry_looking__r_0.9,acoustic_data__symmetry_looking__r_0.9500000000000001,acoustic_data__time_reversal_asymmetry_statistic__lag_1,acoustic_data__time_reversal_asymmetry_statistic__lag_2,acoustic_data__time_reversal_asymmetry_statistic__lag_3,acoustic_data__value_count__value_-1,acoustic_data__value_count__value_0,acoustic_data__value_count__value_1,acoustic_data__variance,acoustic_data__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,10247.0,391980.0,-0.019617,-0.029211,0.077173,11.472805,-0.160480,-1.705898e-04,0.000009,15.549423,...,1.0,1.0,0.259257,3.830395,-4.118365,3622.0,5741.0,8406.0,26.021110,1.0
2,31377.0,405226.0,-0.029642,-0.047355,0.095004,10.874060,-0.042025,-6.538121e-05,0.000013,14.998821,...,1.0,1.0,-6.401732,-0.225979,2.809399,3888.0,5853.0,8557.0,43.412309,1.0
3,13573.0,418888.0,-0.021527,-0.055133,0.093452,10.050543,0.081034,1.288873e-04,0.000013,13.210181,...,1.0,1.0,-1.566988,-1.367183,-2.153706,3686.0,5626.0,8242.0,48.544298,1.0
4,-20948.0,405840.0,-0.034455,-0.066201,0.089827,10.815835,-0.018169,-2.995335e-05,0.000013,15.019441,...,1.0,1.0,-1.075888,-11.037708,-2.765704,3622.0,5545.0,8370.0,47.917990,1.0
5,10336.0,406869.0,-0.027299,-0.037009,0.072958,10.495975,0.013896,2.289369e-05,0.000013,13.833931,...,1.0,1.0,-4.243117,-10.559775,-1.223389,3567.0,5443.0,8207.0,53.305855,1.0
6,-10131.0,397547.0,-0.007783,-0.021690,0.086919,10.373082,-0.000802,-9.265173e-07,0.000009,13.546429,...,1.0,1.0,-1.759577,0.541614,0.088250,3665.0,5783.0,8235.0,29.529367,1.0
7,675.0,405963.0,-0.021109,-0.027548,0.093692,10.431613,0.008065,9.652277e-06,0.000010,13.985859,...,1.0,1.0,-3.986333,-4.080542,-6.621298,3799.0,5601.0,8431.0,32.351119,1.0
8,-5908.0,397772.0,-0.019882,-0.033951,0.083886,10.012702,-0.009531,-1.263670e-05,0.000011,13.125312,...,1.0,1.0,-4.568234,-4.897311,2.288032,4088.0,6498.0,9276.0,34.275077,1.0
9,-11427.0,426464.0,-0.025673,-0.038012,0.124867,11.604713,-0.040907,-7.316655e-05,0.000015,16.327527,...,1.0,1.0,4.077621,0.499473,-9.269517,4183.0,6224.0,8580.0,60.678135,1.0
10,-6746.0,413719.0,-0.030353,-0.062920,0.079779,11.318845,-0.053392,-8.293604e-05,0.000013,15.853895,...,1.0,1.0,-5.472080,-9.492600,-2.962438,4100.0,6027.0,8684.0,47.478111,1.0


In [23]:
kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(features_filtered)

In [21]:
kind_to_fc_parameters

{'acoustic_data': {'number_peaks': [{'n': 3},
   {'n': 1},
   {'n': 5},
   {'n': 10},
   {'n': 50}],
  'change_quantiles': [{'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.2},
   {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.4},
   {'f_agg': 'mean', 'isabs': True, 'qh': 0.6, 'ql': 0.0},
   {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.6},
   {'f_agg': 'mean', 'isabs': True, 'qh': 0.8, 'ql': 0.0},
   {'f_agg': 'mean', 'isabs': True, 'qh': 0.4, 'ql': 0.0},
   {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.0},
   {'f_agg': 'mean', 'isabs': True, 'qh': 0.2, 'ql': 0.0},
   {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.8},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.2},
   {'f_agg': 'var', 'isabs': False, 'qh': 0.8, 'ql': 0.0},
   {'f_agg': 'var', 'isabs': False, 'qh': 0.6, 'ql': 0.0},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.4},
   {'f_agg': 'var', 'isabs': False, 'qh': 0.4, 'ql': 0.0},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 

In [31]:
#The fdr level is the threshold of feature importance. I set it so obnoxiously low to get the right number of features.
smaller = select_features(extracted_features, target, fdr_level = .0000000000000000000000005, ml_task = 'regression')

In [32]:
smaller.shape

(200, 51)

In [23]:
smaller

variable,acoustic_data__number_peaks__n_3,acoustic_data__number_peaks__n_1,acoustic_data__number_peaks__n_5,"acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.2","acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_0.6__ql_0.0","acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.6","acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0","acoustic_data__change_quantiles__f_agg_""mean""__isabs_True__qh_0.4__ql_0.0",acoustic_data__range_count__max_1000000000000.0__min_0,...,"acoustic_data__fft_aggregated__aggtype_""variance""",acoustic_data__partial_autocorrelation__lag_8,"acoustic_data__fft_aggregated__aggtype_""centroid""",acoustic_data__spkt_welch_density__coeff_2,acoustic_data__partial_autocorrelation__lag_3,"acoustic_data__agg_linear_trend__f_agg_""min""__chunk_len_50__attr_""intercept""","acoustic_data__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.0",acoustic_data__variance,acoustic_data__standard_deviation,"acoustic_data__linear_trend__attr_""stderr"""
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,13089.0,31523.0,8640.0,2.302788,2.149905,2.156878,2.068256,2.319118,2.042939,138430.0,...,-3.904593e+08,-0.075044,24625.173081,14.983442,-0.090809,-5.284989,3.592482,26.021110,5.101089,3.040551e-07
2,13055.0,31292.0,8596.0,2.351774,2.213743,2.238072,2.212272,2.387595,2.175643,136635.0,...,-2.947181e+08,0.004184,22155.481298,29.984385,-0.170811,-5.560894,4.644371,43.412309,6.588802,3.928758e-07
3,12651.0,30706.0,8494.0,2.431366,2.325665,2.326864,2.338177,2.441643,2.308001,135749.0,...,-2.762929e+08,0.007227,21632.689820,31.040585,-0.189362,-4.104160,4.975456,48.544298,6.967374,4.153307e-07
4,12976.0,31320.0,8592.0,2.370905,2.237330,2.232143,2.220443,2.372187,2.172832,137391.0,...,-2.816678e+08,0.050166,21689.958971,64.074867,-0.181572,-5.286495,4.738911,47.917990,6.922282,4.127463e-07
5,12910.0,31302.0,8483.0,2.371184,2.249011,2.247954,2.249526,2.382595,2.212217,137178.0,...,-2.520480e+08,0.028125,20848.711759,24.289833,-0.157911,-3.906175,4.773918,53.305855,7.301086,4.353521e-07
6,12957.0,31243.0,8536.0,2.337027,2.188784,2.191130,2.122494,2.340233,2.112687,138049.0,...,-3.674822e+08,-0.065876,24101.280311,15.013053,-0.122346,-3.056196,3.882135,29.529367,5.434093,3.238814e-07
7,12909.0,31042.0,8587.0,2.370466,2.237658,2.248775,2.200572,2.387506,2.188418,136703.0,...,-3.520853e+08,-0.046083,23770.017435,18.040218,-0.148563,-4.188974,4.078571,32.351119,5.687804,3.391433e-07
8,13434.0,31692.0,8796.0,2.290183,2.140742,2.145645,2.105858,2.283054,2.108357,137348.0,...,-3.303593e+08,-0.016979,23141.981778,36.524345,-0.144895,-3.931615,4.382976,34.275077,5.854492,3.490852e-07
9,12384.0,30363.0,8367.0,2.515634,2.368621,2.383281,2.429959,2.488553,2.387901,133782.0,...,-2.611324e+08,0.011832,21181.629746,29.791634,-0.258616,-7.209276,6.156567,60.678135,7.789617,4.644726e-07
10,12798.0,30921.0,8471.0,2.395822,2.289431,2.298168,2.314484,2.424275,2.258129,135238.0,...,-2.708545e+08,0.003616,21547.164288,42.613857,-0.187804,-6.700678,4.938428,47.478111,6.890436,4.107787e-07


In [33]:
smaller_fc_parameters = tsfresh.feature_extraction.settings.from_columns(smaller)

smaller2 is the name of the 41 variables I would ultimately stick with

In [6]:
smaller2 = {'acoustic_data': {'number_peaks': [{'n': 3}, {'n': 1}, {'n': 5}, {'n': 10}],
  'change_quantiles': [{'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.2},
   {'f_agg': 'mean', 'isabs': True, 'qh': 1.0, 'ql': 0.4},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.4}],
  'range_count': [{'max': 1000000000000.0, 'min': 0}],
  'number_cwt_peaks': [{'n': 1}],
  'mean_abs_change': None,
  'absolute_sum_of_changes': None,
  'c3': [{'lag': 1}, {'lag': 2}],
  'ar_coefficient': [{'k': 10, 'coeff': 4}, {'k': 10, 'coeff': 1}],
  'partial_autocorrelation': [{'lag': 2},
   {'lag': 8},
   {'lag': 3},
   {'lag': 9},
   {'lag': 1}],
  'quantile': [{'q': 0.1}],
  'fft_aggregated': [{'aggtype': 'variance'}, {'aggtype': 'centroid'}],
  'spkt_welch_density': [{'coeff': 2}],
  'agg_linear_trend': [{'f_agg': 'min', 'chunk_len': 50, 'attr': 'intercept'},
   {'f_agg': 'mean', 'chunk_len': 5, 'attr': 'stderr'},
   {'f_agg': 'max', 'chunk_len': 50, 'attr': 'intercept'}],
  'variance': None,
  'standard_deviation': None,
  'linear_trend': [{'attr': 'stderr'}],
  'cid_ce': [{'normalize': True}],
  'autocorrelation': [{'lag': 1}]}}

In [41]:
# with open("smaller2","wb") as pickle_out:
#     pickle.dump(smaller2,pickle_out)

In [42]:
train2 = pd.read_csv("../input/train.csv", skiprows = range(1,30000001), nrows = 30000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()
train2.head(5)

Unnamed: 0,index,acoustic_data,time_to_failure
0,0,0,5.217499
1,1,3,5.217499
2,2,4,5.217499
3,3,4,5.217499
4,4,8,5.217499


I'd have to repeatedly write helper functions to help me extract features...

In [43]:
def tsfreshify(df, i):
    rows = 150000
    idlist = []
    for n in range(200*(i-1) + 1,200*(i-1) + 201):
        idlist += [n for i in range(rows)]
    df['id'] = idlist
    df.columns = ['time', 'acoustic_data', 'time_to_failure', 'id']
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    y = df['time_to_failure']
    X = df.drop(columns = 'time_to_failure')
    target = y[rows-1::rows]
    target.index = range(200*(i-1) + 1,200*(i-1) + 201)
    return X, target

In [44]:
X2, y2 = tsfreshify(train2, 2)

In [46]:
X2.head()

Unnamed: 0,id,time,acoustic_data
0,201,0,0
1,201,1,3
2,201,2,4
3,201,3,4
4,201,4,8


In [47]:
y2[:5]

201    5.179096
202    5.139798
203    5.100400
204    5.062097
205    5.022699
Name: time_to_failure, dtype: float64

In [45]:
extracted2 = extract_features(X2, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller_fc_parameters['acoustic_data'])

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [48:13<00:00, 47.49s/it]


In [48]:
extracted22 = extract_features(X2, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller2['acoustic_data'])

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 40/40 [43:55<00:00, 31.77s/it]


In [8]:
train3 = pd.read_csv("../input/train.csv", skiprows = range(1,2*rows+1), nrows = 150000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()

In [7]:
def tsfreshify2(df, i):
    rows = 150000
    idlist = []
    for n in range(1000*(i-1) + 1,1000*(i-1) + 1001):
        idlist += [n for i in range(rows)]
    df['id'] = idlist
    df.columns = ['time', 'acoustic_data', 'time_to_failure', 'id']
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    y = df['time_to_failure']
    X = df.drop(columns = 'time_to_failure')
    target = y[rows-1::rows]
    target.index = range(1000*(i-1) + 1,1000*(i-1) + 1001)
    return X, target

In [9]:
X3, y3 = tsfreshify2(train3, 2)

In [10]:
extracted3 = extract_features(X3, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller2['acoustic_data'])

Feature Extraction: 100%|███████████████████████████████████████████████████████████| 40/40 [3:48:31<00:00, 242.13s/it]


In [11]:
with open("extract3","wb") as pickle_out:
    pickle.dump(extracted3, pickle_out)

In [13]:
train4 = pd.read_csv("../input/train.csv", skiprows = range(1,3*rows+1), nrows = 150000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()

In [14]:
X4, y4 = tsfreshify2(train4, 3)

In [15]:
extracted4 = extract_features(X4, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller2['acoustic_data'])

Feature Extraction: 100%|███████████████████████████████████████████████████████████| 40/40 [3:47:22<00:00, 206.61s/it]


In [16]:
with open("extract4","wb") as pickle_out:
    pickle.dump(extracted4, pickle_out)

In [17]:
with open("y3","wb") as pickle_out:
    pickle.dump(y3, pickle_out)

In [18]:
with open("y4","wb") as pickle_out:
    pickle.dump(y4,pickle_out)

In [None]:
train1 = pd.read_csv("../input/train.csv", skiprows = range(1,0*rows+1), nrows = 150000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()

In [None]:
X1, y1 = tsfreshify2(train3, 1)

In [None]:
with open("extract1","wb") as pickle_out:
    pickle.dump(extracted1, pickle_out)
with open("y1","wb") as pickle_out:
    pickle.dump(y1, pickle_out)

In [None]:
extracted1 = extract_features(X1, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller2['acoustic_data'])

In [None]:
train2 = pd.read_csv("../input/train.csv", skiprows = range(1,1*rows+1), nrows = 150000000,
                    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64}).reset_index()

In [None]:
X2, y2 = tsfreshify2(train3, 4)

In [None]:
with open("extract2","wb") as pickle_out:
    pickle.dump(extracted2, pickle_out)
with open("y2","wb") as pickle_out:
    pickle.dump(y2, pickle_out)

In [None]:
extracted2 = extract_features(X2, column_id="id", column_sort="time",n_jobs=8, default_fc_parameters=smaller2['acoustic_data'])

In [5]:
outliers = train[(train['acoustic_data']>500)]

In [12]:
outliers.count()

acoustic_data      15278
time_to_failure    15278
dtype: int64

### Spark Analysis
***
Here is some spark analysis I did at the beginning, but I abandoned it later in favor of dask or larger EC2 instances, since I really needed to use LGBM or XGBoost, and using XGBoost on spark is a mega hassle (although possible)

In [8]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Exception: Java gateway process exited before sending its port number

In [2]:
aak = os.environ['AWS_ACCESS_KEY']
ask = os.environ['AWS_SECRET_KEY']
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.amazonaws:aws-java-sdk:1.10.34,org.apache.hadoop:hadoop-aws:2.6.0 pyspark-shell'

In [3]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()
hadoopConf = spark._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoopConf.set("fs.s3.awsAccessKeyId", aak)
hadoopConf.set("fs.s3.awsSecretAccessKey", ask)

In [6]:
df2 = spark.read_csv('s3://isaacearthquake/train.csv')
df2.show()

AttributeError: 'SparkSession' object has no attribute 'read_csv'

In [7]:
df = spark.read.format("csv").option("header", "true").load("s3://isaacearthquake/train.csv")

In [14]:
type(df)

pyspark.sql.dataframe.DataFrame

In [15]:
df.printSchema()

root
 |-- acoustic_data: string (nullable = true)
 |-- time_to_failure: string (nullable = true)



In [22]:
bob=df.rdd.countApprox()

TypeError: countApprox() missing 1 required positional argument: 'timeout'

In [24]:
df.describe().show()

KeyboardInterrupt: 