In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the entries of the input directory so the available data files are visible in the output.
print(os.listdir("../input"))
import time
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

# 数据处理

## 训练集

In [None]:
%%time
df_train=pd.read_csv('../input/train.csv',dtype = {'acoustic_data': np.int16,'time_to_failure': np.float64})
pd.options.display.precision = 15

> [NumPy 数据类型](http://www.runoob.com/numpy/numpy-dtype.html)

In [None]:
# Print column dtypes, row count and memory usage of the loaded training frame.
df_train.info()

In [None]:
# Summary statistics of both columns; shown as the cell's rich output.
df_train.describe()

In [None]:
# Cut the training signal into consecutive, non-overlapping segments of `rows`
# samples (any incomplete tail is dropped) and summarise each segment with four
# statistics of the acoustic signal. The target of a segment is the
# time_to_failure of its last sample.
rows = 150000
segments = df_train.shape[0] // rows

feature_names = ['ave', 'std', 'max', 'min']
feature_records = []
targets = []

for segment in tqdm(range(segments)):
    start = segment * rows
    seg = df_train.iloc[start:start + rows]
    signal = seg['acoustic_data'].values

    feature_records.append(
        [signal.mean(), signal.std(), signal.max(), signal.min()]
    )
    # Use the last time_to_failure value of the segment as its label.
    targets.append(seg['time_to_failure'].values[-1])

# Build both frames once, with the same index, columns and dtype as before.
X = pd.DataFrame(feature_records, index=range(segments),
                 columns=feature_names, dtype=np.float64)
Y = pd.DataFrame(targets, index=range(segments),
                 columns=['time_to_failure'], dtype=np.float64)

In [None]:
# Standardise the four features; the fitted scaler is kept so the test features
# can be transformed with the same statistics later on.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=['ave', 'std', 'max', 'min'],
)

In [None]:
# Hold out 30% of the segments for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scaled,
    Y,
    test_size=0.3,
    random_state=42,
)

## 训练模型

In [None]:
# LightGBM regressor for time_to_failure, monitored on the validation split with MAE.
#
# Hyperparameters are given with LGBMRegressor's own argument names:
#   * `learning_rate` / `random_state` replace the XGBoost-style aliases `eta` /
#     `seed`. Aliases travel through **kwargs next to the wrapper's default
#     `learning_rate`, in which case LightGBM may keep the canonical key and
#     discard the alias with a warning.
#     NOTE(review): confirm against the installed LightGBM version.
#   * `subsample_freq=1` is required for `subsample=0.8` to have any effect:
#     LightGBM only performs row bagging when the bagging frequency is non-zero
#     (the wrapper's default is 0, i.e. disabled).
model = LGBMRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    subsample_freq=1,
    learning_rate=0.3,
    random_state=42)

# Targets are passed as 1-D Series rather than one-column DataFrames, which is
# the shape scikit-learn style estimators expect for a single regression target.
# NOTE(review): the `verbose` fit argument exists in the LightGBM release this
# notebook was written for; newer releases replace it with logging callbacks.
model.fit(
    X_train,
    y_train['time_to_failure'],
    eval_metric='mae',
    eval_set=[(X_valid, y_valid['time_to_failure'])],
    verbose=True
)


## 测试集

In [None]:
# The sample submission provides the test segment ids; they become the index of
# both the submission frame and the (initially empty) test feature frame.
submission = pd.read_csv(
    '../input/sample_submission.csv',
    index_col='seg_id',
)
X_test = pd.DataFrame(
    index=submission.index,
    columns=X.columns,
    dtype=np.float64,
)

In [None]:
# Compute the same four summary statistics for every test segment; each segment
# is stored in its own CSV file named after its seg_id.
for seg_id in X_test.index:
    signal = pd.read_csv('../input/test/' + seg_id + '.csv')['acoustic_data'].values

    segment_stats = {
        'ave': signal.mean(),
        'std': signal.std(),
        'max': signal.max(),
        'min': signal.min(),
    }
    for stat_name, stat_value in segment_stats.items():
        X_test.loc[seg_id, stat_name] = stat_value

In [None]:
# Apply the scaler fitted on the training features (no re-fitting) and wrap the
# result in a frame with the same column names the model was trained on.
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=['ave', 'std', 'max', 'min'],
)

## 预测

In [None]:
# Predict time_to_failure for every test segment (row order follows
# submission.index, which X_test was built from) and write the submission file.
predictions = model.predict(X_test_scaled)
submission['time_to_failure'] = predictions
submission.to_csv('../submission.csv')

# 特征重要性

In [None]:
# Plot the fitted model's feature importances, limited to at most 4 features
# (the model is trained on four: ave, std, max, min).
plot_importance(model,max_num_features=4)