# Exploratory data analysis (EDA) of the CMAPSS dataset

### Dataset Overview

| Dataset | Train Trajectories | Test Trajectories | Operating Conditions | Fault Modes                             |
|---------|--------------------|-------------------|----------------------|------------------------------------------|
| FD001   | 100                | 100               | 1 (Sea Level)        | 1 (HPC Degradation)                      |
| FD002   | 260                | 259               | 6                    | 1 (HPC Degradation)                      |
| FD003   | 100                | 100               | 1 (Sea Level)        | 2 (HPC Degradation, Fan Degradation)     |
| FD004   | 248                | 249               | 6                    | 2 (HPC Degradation, Fan Degradation)     |


### Import libraries

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory holding the CMAPSS text files. The path is relative, so it resolves
# against the current working directory; the trailing slash matters because
# file names are appended to it with plain string concatenation.
data_dir = "../data/"

### Import data 

In [54]:
# Leading columns: unit id, cycle counter and the three operating settings.
column_names = ["unit", "cycles"]
column_names += [f"op_{k}" for k in (1, 2, 3)]
# Sensor columns s_1 .. s_21; colnum is the exclusive upper bound of the range.
colnum = 22
column_names += [f"s_{k}" for k in range(1, colnum)]

In [62]:
def _read_cmapss_table(filename):
    """Read one whitespace-delimited trajectory file from ``data_dir``.

    Columns are labelled positionally from ``column_names``; the returned
    DataFrame has one row per line of the file.
    """
    # Split on runs of whitespace rather than on every single space: with
    # sep=' ' any padding or trailing blank in a row becomes a phantom empty
    # column (presumably why the data previously needed a dropna pass).
    frame = pd.read_csv(data_dir + filename, sep=r"\s+", header=None)
    # Only discard columns that are empty throughout, so a single missing
    # reading cannot silently remove a real column and shift every label.
    frame = frame.dropna(axis=1, how="all")
    frame.columns = column_names[:frame.shape[1]]
    return frame


train = _read_cmapss_table("train_FD001.txt")
test = _read_cmapss_table("test_FD001.txt")

# RUL labels that accompany the test set. With sep=' ' a trailing blank on each
# line yields a second, empty field; because only one name is supplied, pandas
# would then use the real values as the index and leave 'RUL' as NaN.
y_test = pd.read_csv(
    data_dir + "RUL_FD001.txt", sep=r"\s+", header=None, names=["RUL"]
)

eng_num = train["unit"].nunique()
print("Number of units is %s" % (eng_num))

Number of units is 100


#### Get more information on the data

In [64]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): the recorded output below lists no op_3 column, so it appears
# to have been captured after op_3 was dropped — re-run the cells in order to
# confirm.
train.describe()

Unnamed: 0,unit,cycles,op_1,op_2,s_1,s_2,s_3,s_4,s_5,s_6,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,518.67,642.680934,1590.523119,1408.933782,14.62,21.609803,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,0.500053,6.13115,9.000605,5.3292e-15,0.001389,...,0.737553,0.071919,19.076176,0.037505,3.469531e-18,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,518.67,641.21,1571.04,1382.25,14.62,21.6,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,518.67,642.325,1586.26,1402.36,14.62,21.61,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,518.67,642.64,1590.1,1408.04,14.62,21.61,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,518.67,643.0,1594.38,1414.555,14.62,21.61,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,518.67,644.53,1616.91,1441.49,14.62,21.61,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


In [None]:
# Operating setting 3 never changes in this data, so the column is removed.
train = train.drop(columns=["op_3"])
train.head(n=5)

Unnamed: 0,unit,cycles,op_1,op_2,s_1,s_2,s_3,s_4,s_5,s_6,...,s_12,s_13,s_14,s_15,s_16,s_17,s_18,s_19,s_20,s_21
0,1,1,-0.0007,-0.0004,518.67,641.82,1589.7,1400.6,14.62,21.61,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,518.67,642.15,1591.82,1403.14,14.62,21.61,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,518.67,642.35,1587.99,1404.2,14.62,21.61,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,518.67,642.37,1582.85,1406.22,14.62,21.61,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [51]:
# for i in range(3):
#     engine_id = i + 1
#     df_subset = train[train["unit"] == engine_id]
#     plt.plot(df_subset["cycles"], df_subset["s_3"], label=f"Engine {engine_id}")
# plt.xlabel("Cycle")
# plt.ylabel("Sensor 3")
# plt.title("Sensor Trend")
# plt.legend()