In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import os
print(os.listdir("../input"))
import time
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

# 数据处理

## 训练集

In [None]:
%%time
# Read the training CSV with explicit, compact dtypes for its two columns.
train_dtypes = {'acoustic_data': np.int16, 'time_to_failure': np.float64}
df_train = pd.read_csv('../input/train.csv', dtype=train_dtypes)

# Show floats with 15 digits of precision when frames are displayed.
pd.options.display.precision = 15

> [NumPy 数据类型](http://www.runoob.com/numpy/numpy-dtype.html)

In [None]:
# Print a concise summary of the training frame loaded above.
df_train.info()

# 特征工程

In [None]:
def add_trend_feature(arr, abs_values=False):
    """Return the slope of a straight line fitted to ``arr`` against its positions.

    A ``LinearRegression`` is fitted with the positions ``0 .. len(arr) - 1``
    as the single input feature and the values of ``arr`` as the target.

    Parameters
    ----------
    arr : array-like
        Values to fit; the position of each value is used as the regressor.
    abs_values : bool, default False
        When True, the fit is done on ``np.abs(arr)`` instead of ``arr``.

    Returns
    -------
    The first (and only) fitted coefficient, i.e. the slope of the trend line.
    """
    target = np.abs(arr) if abs_values else arr
    # scikit-learn expects a 2-D design matrix, hence the single-column reshape.
    positions = np.arange(len(arr)).reshape(-1, 1)
    return LinearRegression().fit(positions, target).coef_[0]

In [None]:
rows = 150000  # number of consecutive rows of df_train that form one segment
segments = int(np.floor(df_train.shape[0] / rows))


def _segment_features(x):
    """Compute every feature of one segment, keyed by feature (column) name.

    Parameters
    ----------
    x : pd.Series
        The ``acoustic_data`` column of a single segment.

    Returns
    -------
    dict
        Feature name -> value. Keys are inserted in the order in which the
        columns of ``X`` are meant to appear: global statistics first, then
        the rolling-window statistics for each window size.
    """
    values = x.values
    abs_values = np.abs(values)

    feats = {
        'ave': values.mean(),
        'std': values.std(),
        'max': values.max(),
        'min': values.min(),
        'q01': np.quantile(values, 0.01),
        'q05': np.quantile(values, 0.05),
        # Fixed: this column used the 0.75 quantile although it is named q95
        # and the test-set features use the 0.95 quantile for the same column.
        'q95': np.quantile(values, 0.95),
        'q99': np.quantile(values, 0.99),
        'abs_max': abs_values.max(),
        'abs_mean': abs_values.mean(),
        'abs_std': abs_values.std(),
        'trend': add_trend_feature(values),
        'abs_trend': add_trend_feature(values, abs_values=True),
    }

    for w in [10, 100, 1000]:
        # The first w - 1 positions of a rolling statistic are NaN; drop them.
        rolled = (
            ('roll_std', x.rolling(w).std().dropna().values),
            ('roll_mean', x.rolling(w).mean().dropna().values),
            ('roll_abs_mean', x.abs().rolling(w).mean().dropna().values),
        )
        for tag, r in rolled:
            suffix = '_' + tag + '_' + str(w)
            feats['ave' + suffix] = r.mean()
            feats['std' + suffix] = r.std()
            feats['max' + suffix] = r.max()
            feats['min' + suffix] = r.min()
            feats['q01' + suffix] = np.quantile(r, 0.01)
            feats['q05' + suffix] = np.quantile(r, 0.05)
            feats['q95' + suffix] = np.quantile(r, 0.95)
            feats['q99' + suffix] = np.quantile(r, 0.99)

    return feats


# Collect one record per segment and build the frames once at the end, rather
# than writing each value into the frame with a separate .loc assignment.
records = []
targets = []
for segment in tqdm(range(segments)):
    seg = df_train.iloc[segment * rows: segment * rows + rows]
    # The target of a segment is the time_to_failure of its last row.
    targets.append(seg['time_to_failure'].values[-1])
    records.append(_segment_features(seg['acoustic_data']))

# Pin the column order explicitly so it does not depend on how the installed
# pandas version orders the keys of a list of dicts.
feature_names = list(records[0]) if records else None
X = pd.DataFrame(records, index=range(segments), columns=feature_names, dtype=np.float64)
Y = pd.DataFrame({'time_to_failure': targets}, index=range(segments), dtype=np.float64)

In [None]:
# Fit the standardiser on the full training feature table, then apply it to
# that same table. The fitted `scaler` is reused later for the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 30% of the segments for validation; the fixed random_state makes
# the split repeatable.
split = train_test_split(X_scaled, Y, test_size=0.3, random_state=42)
X_train, X_valid, y_train, y_valid = split

# Wrap the training features in a DataFrame labelled with X's column names.
X_train = pd.DataFrame(X_train, columns=X.columns)

## lgb训练模型

In [None]:
# train = lgb.Dataset(X_train,y_train)
# valid = lgb.Dataset(X_valid,y_valid,reference = train)

# Candidate hyper-parameter values for a grid search over the regressor.
# Each key must be the exact name of an estimator parameter.
# Fixed: the last key was misspelled 'n_eatimators', so the number of trees
# would not have been tuned.
parameters = {
    'max_depth': [8, 15, 20, 25, 30, 35],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 1500, 2000],
}

In [None]:
# Hyper-parameters of the regressor, gathered in one mapping and unpacked below.
# NOTE(review): `eta` and `seed` look like LightGBM aliases (learning rate /
# random seed) rather than named arguments of the scikit-learn wrapper --
# confirm which learning rate is actually in effect for the installed version.
# NOTE(review): `subsample` presumably needs a non-zero bagging frequency
# (`subsample_freq`) to have any effect -- verify against the LightGBM docs.
model_kwargs = dict(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42,
)
estimator = lgb.LGBMRegressor(**model_kwargs)

# Fit on the training split and evaluate MAE on the held-out validation split.
# NOTE(review): newer LightGBM releases may not accept `verbose` in fit() --
# confirm against the installed version.
estimator.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='mae',
    verbose=True,
)
# gsearch = GridSearchCV(estimator, param_grid=parameters, cv=5, scoring='neg_mean_absolute_error')
# gsearch.fit(X_train,y_train.values.flatten())

# print("Best CV score: {:.4f}".format(gsearch.best_score_))
# print(gsearch.best_params_)


## 测试集

In [None]:
# Load the sample submission, indexed by segment id.
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

# Empty float feature table for the test set: one row per submission entry,
# with the same columns as the training feature table X.
X_test = pd.DataFrame(
    index=submission.index,
    columns=X.columns,
    dtype=np.float64,
)

In [None]:
# Display the index of X_test (the seg_id labels taken from the sample submission).
X_test.index

In [None]:
# Fill X_test one segment at a time: each test segment lives in its own CSV
# file named after its seg_id.
for seg_id in X_test.index:
    seg = pd.read_csv('../input/test/' + seg_id + '.csv')

    x = seg['acoustic_data']  # pd series
    raw = x.values
    magnitude = np.abs(raw)

    # Whole-segment statistics, keyed by the column they are written to.
    feats = {
        'ave': raw.mean(),
        'std': raw.std(),
        'max': raw.max(),
        'min': raw.min(),
        'q01': np.quantile(raw, 0.01),
        'q05': np.quantile(raw, 0.05),
        'q95': np.quantile(raw, 0.95),
        'q99': np.quantile(raw, 0.99),
        'abs_max': magnitude.max(),
        'abs_mean': magnitude.mean(),
        'abs_std': magnitude.std(),
        'trend': add_trend_feature(raw),
        'abs_trend': add_trend_feature(raw, abs_values=True),
    }

    # Rolling-window statistics: for every window size, summarise the rolling
    # std, rolling mean and rolling mean of absolute values (NaN warm-up dropped).
    for w in [10, 100, 1000]:
        rolled = (
            ('roll_std', x.rolling(w).std().dropna().values),
            ('roll_mean', x.rolling(w).mean().dropna().values),
            ('roll_abs_mean', x.abs().rolling(w).mean().dropna().values),
        )
        for tag, r in rolled:
            suffix = '_' + tag + '_' + str(w)
            feats['ave' + suffix] = r.mean()
            feats['std' + suffix] = r.std()
            feats['max' + suffix] = r.max()
            feats['min' + suffix] = r.min()
            feats['q01' + suffix] = np.quantile(r, 0.01)
            feats['q05' + suffix] = np.quantile(r, 0.05)
            feats['q95' + suffix] = np.quantile(r, 0.95)
            feats['q99' + suffix] = np.quantile(r, 0.99)

    # Write the values into this segment's row, one column at a time.
    for name, value in feats.items():
        X_test.loc[seg_id, name] = value

In [None]:
# Scale the test features with the scaler that was fitted on the training features X.
X_test_scaled = scaler.transform(X_test)

## 预测

In [None]:
# Predict on the scaled test features and store the result in the submission frame.
predictions = estimator.predict(X_test_scaled)
submission['time_to_failure'] = predictions

# Write the submission (with its seg_id index) to the working directory.
submission.to_csv('submission.csv')

# 特征重要性

In [None]:
# Plot the fitted model's feature importances (at most 90 features) on a tall figure.
fig, ax = plt.subplots(figsize=(12, 18))
plot_importance(estimator, ax=ax, height=0.4, max_num_features=90)