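## For every STFT time frame, take the 5 (or 10) strongest frequency components and compute the standard deviation of their frequencies (the bin positions themselves, not their amplitudes); then summarise that per-frame series by its mean, standard deviation, and linear trend.
## Do this on the full spectrum, with the DC term removed, and with the DC term plus the next frequency bin removed, giving 2 x 3 x 3 = 18 features in total.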


In [None]:
import numpy as np 
import pandas as pd
import os
from tqdm import tqdm,tqdm_notebook
from scipy.signal import stft
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV into a 2-D NumPy array, reading both named columns
# as float32.
float_data = pd.read_csv(
    "../input/train.csv",
    dtype={"acoustic_data": np.float32, "time_to_failure": np.float32},
).values

In [None]:
from sklearn.linear_model import LinearRegression

def add_trend_feature(arr, abs_values=False):
    """Return the slope of a least-squares line fitted through ``arr``.

    The x-axis is the sample position ``0, 1, ..., len(arr) - 1``.

    Args:
        arr: One-dimensional sequence of values.
        abs_values: When true, fit the line to ``abs(arr)`` instead of ``arr``.

    Returns:
        The first (and only) fitted coefficient, i.e. the change in value per
        step of the index.
    """
    values = np.abs(arr) if abs_values else arr
    # LinearRegression expects a 2-D feature matrix, hence the column reshape.
    positions = np.array(range(len(values))).reshape(-1, 1)
    model = LinearRegression()
    model.fit(positions, values)
    return model.coef_[0]
def add_trend_feature_abs(arr, abs_values=True):
    """Return the linear-trend slope of ``arr``, taking absolute values by default.

    This is the same computation as ``add_trend_feature``; only the default of
    ``abs_values`` differs, so the work is delegated rather than duplicated.

    Args:
        arr: One-dimensional sequence of values.
        abs_values: When true (the default), the line is fitted to
            ``abs(arr)``; when false, to ``arr`` itself.

    Returns:
        The slope of the least-squares line over the index ``0..len(arr) - 1``.
    """
    return add_trend_feature(arr, abs_values=abs_values)

In [None]:
def create_feature(wave):
    """Build 18 features describing the spread of the dominant STFT bins.

    For each time frame of the STFT magnitude, the strongest frequency bins
    are selected with a quantile threshold and the standard deviation of their
    bin indices is taken. That per-frame series is then reduced to its mean,
    standard deviation and linear trend.

    This is done for the ~10 strongest and the ~5 strongest bins, and for
    three spectra: the full one, and with the lowest 1 or 2 bins removed.

    Args:
        wave: One-dimensional signal passed to ``scipy.signal.stft`` with its
            default parameters.

    Returns:
        A 1-D array of 18 values. For each ``j`` in ``0, 1, 2`` (number of
        lowest bins dropped) the order is: top-10 mean, top-10 std, top-10
        trend, top-5 mean, top-5 std, top-5 trend.
    """
    # Magnitude spectrogram; axis 0 is frequency bin, axis 1 is time frame.
    fft_spe = np.abs(stft(wave)[2])

    def _top_bin_spread(spectrum, quantile):
        # Per time frame: std of the indices of bins at or above the quantile.
        # The indices are relative to `spectrum`, which is harmless because a
        # standard deviation does not change under a constant offset.
        return np.apply_along_axis(
            lambda frame: np.where(frame >= np.quantile(frame, quantile))[0].std(),
            0,
            spectrum,
        )

    f = []
    for j in range(3):
        trimmed = fft_spe[j:, :]
        # The /129 denominators assume the 129 bins of scipy's default
        # nperseg=256. A threshold at 119/129 keeps the ~10 largest bins and
        # 124/129 keeps the ~5 largest (more only if magnitudes tie). The
        # original code had these two thresholds swapped relative to the
        # "10"/"5" names.
        max_freq_seq10 = _top_bin_spread(trimmed, 119 / 129)
        max_freq_seq5 = _top_bin_spread(trimmed, 124 / 129)
        for spread in (max_freq_seq10, max_freq_seq5):
            f.append(spread.mean())
            f.append(spread.std())
            f.append(add_trend_feature(spread))
    return np.r_[f]

In [None]:
# Cut the training array into consecutive, non-overlapping windows of
# `segment` rows and compute the STFT features for each window. Any trailing
# rows that do not fill a whole window are ignored.
segment = 150000
total = len(float_data) // segment
f_list = []
target = []
for i in tqdm_notebook(range(total)):
    start = i * segment
    stop = start + segment
    # Column 0 is the signal fed to the feature extractor.
    wave = float_data[start:stop, 0]
    f_list.append(create_feature(wave))
    # The label is column 1 at the last row of the window.
    target.append(float_data[stop - 1, 1])

In [None]:
# Column names for the 18 features: an outer index 0..2 and, within each,
# six statistic labels in a fixed order.
stat_names = (
    'max_freq_10_Mean',
    'max_freq_10_Std',
    'max_freq_10_Trend',
    'max_freq_5_Mean',
    'max_freq_5_Std',
    'max_freq_5_Trend',
)
col = [f'fft_{drop}_{stat}' for drop in range(3) for stat in stat_names]

In [None]:
# Training design matrix (one row per window, one column per feature name)
# and the matching label series.
train_features = np.array(f_list)
X = pd.DataFrame(data=train_features, columns=col)
y = pd.Series(data=target)

In [None]:
# Sample submission file; its `seg_id` column is used below to locate and
# index the test segments.
submit = pd.read_csv(
    '../input/sample_submission.csv',
)

In [None]:
# Recompute the same features for every test segment, in submission order.
# NOTE(review): the test CSVs are read with pandas' inferred dtype, whereas the
# training data was loaded as float32 - confirm this difference is intended.
f_list = []
for seg_id in tqdm_notebook(submit.seg_id):
    test_frame = pd.read_csv(f'../input/test/{seg_id}.csv')
    # First column of the segment file is the signal.
    wave = test_frame.values[:, 0]
    f_list.append(create_feature(wave))

In [None]:
# Test design matrix: one row per segment id, same feature columns as X.
X_test = pd.DataFrame(
    data=f_list,
    index=submit.seg_id,
    columns=col,
)

In [None]:
# Persist the training features, training labels and test features as CSV
# files in the working directory.
for table, out_path in ((X, 'X.csv'), (y, 'y.csv'), (X_test, 'X_test.csv')):
    table.to_csv(out_path)