In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
%matplotlib notebook

In [2]:
# Load the four training files and label their 26 columns.
# The names are: engine_id, cycle, setting1..setting3, then s1..s21.
column_name = (['engine_id', 'cycle', 'setting1', 'setting2', 'setting3']
               + ['s' + str(i) for i in range(1, 22)])


def _read_train_file(path):
    '''
    Read one whitespace-delimited, header-less training file.

    `sep=r'\s+'` is the documented replacement for the deprecated
    `delim_whitespace=True`, and `names=` labels the columns while parsing
    instead of overwriting `.columns` afterwards.
    '''
    return pd.read_table(path, header=None, sep=r'\s+', names=column_name)


train_FD001 = _read_train_file("./CMAPSSData/train_FD001.txt")
train_FD002 = _read_train_file("./CMAPSSData/train_FD002.txt")
train_FD003 = _read_train_file("./CMAPSSData/train_FD003.txt")
train_FD004 = _read_train_file("./CMAPSSData/train_FD004.txt")

In [31]:
# Print the dtypes and non-null counts of every training frame.
# Iterating over the frames themselves avoids eval() on built-up variable names.
for train_frame in (train_FD001, train_FD002, train_FD003, train_FD004):
    train_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
engine_id    20631 non-null int64
cycle        20631 non-null int64
setting1     20631 non-null float64
setting2     20631 non-null float64
setting3     20631 non-null float64
s1           20631 non-null float64
s2           20631 non-null float64
s3           20631 non-null float64
s4           20631 non-null float64
s5           20631 non-null float64
s6           20631 non-null float64
s7           20631 non-null float64
s8           20631 non-null float64
s9           20631 non-null float64
s10          20631 non-null float64
s11          20631 non-null float64
s12          20631 non-null float64
s13          20631 non-null float64
s14          20631 non-null float64
s15          20631 non-null float64
s16          20631 non-null float64
s17          20631 non-null int64
s18          20631 non-null int64
s19          20631 non-null float64
s20          20631 non-null float64

We can see that these data files contain no null values, which is good: we do not need to impute anything ourselves.

Their shapes (rows, columns) are:
* train_FD001 : (20631, 26)
* train_FD002 : (53759, 26)
* train_FD003 : (24720, 26)
* train_FD004 : (61249, 26)

## 数据处理：
1. 去掉'engine_id'
2. 根据'cycle'确定RUL（比如：最后一个cycle的RUL为0，因为训练集的数据都是从正常一直跑到故障的）
$$ RUL[0] = max(cycle) - 1 $$
$$ ... $$
$$ RUL[-1] = 0 $$
3. 将四个训练集按axis = 0接到一起，**顺序不打乱**，这样我们得到一个新的训练集 train，其大小为(160359, 25)，其最后一列为RUL
4. 对每一列的feature都进行scale，使其满足标准正态分布
$$scaled\_train =  {train - train\_mean \over train\_std\_deviation} $$

$$scaled\_test = {test - train\_mean \over train\_std\_deviation} $$

In [3]:
def compute_rul_of_one_id(train_FD00X_of_one_id):
    '''
    Compute the remaining useful life (RUL) for the rows of a single engine.

    The largest value in the 'cycle' column is treated as the failure cycle,
    so each row's RUL is that maximum minus the row's own cycle; the row with
    the largest cycle therefore gets 0.

    Parameters
    ----------
    train_FD00X_of_one_id : DataFrame
        Rows belonging to one engine_id; must contain a 'cycle' column.

    Returns
    -------
    list
        One RUL value per input row, in input row order. An empty input
        yields an empty list.
    '''
    cycles = train_FD00X_of_one_id['cycle']
    if cycles.empty:
        # Nothing to label; the built-in max() used previously raised here.
        return []
    # Vectorized maximum instead of iterating the Series in Python.
    failure_cycle = cycles.max()
    return (failure_cycle - cycles).tolist()

def compute_rul_of_one_file(train_FD00X):
    '''
    Compute the remaining useful life (RUL) for every row of a training frame.

    For each engine_id the largest 'cycle' is treated as the failure cycle,
    and a row's RUL is that maximum minus the row's own cycle.

    Parameters
    ----------
    train_FD00X : DataFrame
        Frame with 'engine_id' and 'cycle' columns; may hold many engines.

    Returns
    -------
    list
        One RUL value per row, in the same order as the rows of the input,
        so it can be assigned directly as a new column.
    '''
    cycles = train_FD00X['cycle']
    # transform('max') broadcasts each engine's last cycle back onto its own
    # rows. The result therefore stays aligned with the input rows even when
    # engine ids are unsorted or interleaved, which iterating over
    # set(engine_id) did not guarantee.
    failure_cycle = train_FD00X.groupby('engine_id')['cycle'].transform('max')
    return (failure_cycle - cycles).tolist()

In [4]:
# Add an 'RUL' column to each of the four training frames.
# The frames are modified in place; looping over the objects themselves
# avoids eval() on built-up variable names.
for train_frame in (train_FD001, train_FD002, train_FD003, train_FD004):
    train_frame['RUL'] = compute_rul_of_one_file(train_frame)

In [32]:
# Re-index the four frames so their row labels run consecutively from one
# frame into the next (FD001 starts at 0, FD002 starts where FD001 ends, ...).
# Offsets are derived from the actual frame lengths rather than hard-coded
# row counts, so the cell still works if the input files change.
next_start = 0
for train_frame in (train_FD001, train_FD002, train_FD003, train_FD004):
    row_count = len(train_frame)
    train_frame.index = range(next_start, next_start + row_count)
    next_start += row_count

In [39]:
# Stack the four frames row-wise under a hierarchical index whose outer level
# is the data-set label: ['FD001', 'FD002', 'FD003', 'FD004'].
dataset_keys = ['FD001', 'FD002', 'FD003', 'FD004']
frames = [train_FD001, train_FD002, train_FD003, train_FD004]
# Passing a mapping to concat uses its keys (in insertion order) as `keys`.
train = pd.concat(dict(zip(dataset_keys, frames)))

In [59]:
# Show the rows of the FD001 data set that belong to engine 1.
train.loc['FD001'].query('engine_id == 1')

Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.4190,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.00,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.80,8.4294,0.03,393,2388,100.0,38.90,23.4044,187
5,1,6,-0.0043,-0.0001,100.0,518.67,642.10,1584.47,1398.37,14.62,...,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669,186
6,1,7,0.0010,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.10,23.3774,185
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106,184
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.80,14.62,...,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066,183
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694,182


## 保存成csv格式，并读取
未解决问题：读取之后hierarchical index变了

In [41]:
# Write the concatenated frame to disk as CSV (note: the file name has no
# extension). The next cell reads the index back from the first two columns.
train.to_csv('train_FD001_to_4')

In [50]:
# Read the saved file back, rebuilding the two-level index from the first
# two columns of the CSV.
train_all = pd.read_csv('train_FD001_to_4', index_col =[0,1])

In [51]:
# Preview the first rows only; the full frame has far too many rows to display.
train_all.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,engine_id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s13,s14,s15,s16,s17,s18,s19,s20,s21,RUL
Unnamed: 0_level_1,Unnamed: 1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FD001,0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.70,1400.60,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.00,39.06,23.4190,191
FD001,1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.00,39.00,23.4236,190
FD001,2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.20,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.00,38.95,23.3442,189
FD001,3,1,4,0.0007,0.0000,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.00,38.88,23.3739,188
FD001,4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.80,8.4294,0.03,393,2388,100.00,38.90,23.4044,187
FD001,5,1,6,-0.0043,-0.0001,100.0,518.67,642.10,1584.47,1398.37,14.62,...,2388.03,8132.85,8.4108,0.03,391,2388,100.00,38.98,23.3669,186
FD001,6,1,7,0.0010,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388.03,8132.32,8.3974,0.03,392,2388,100.00,39.10,23.3774,185
FD001,7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388.03,8131.07,8.4076,0.03,391,2388,100.00,38.97,23.3106,184
FD001,8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.80,14.62,...,2388.05,8125.69,8.3728,0.03,392,2388,100.00,39.05,23.4066,183
FD001,9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388.06,8129.38,8.4286,0.03,393,2388,100.00,38.95,23.4694,182
