# Dataset Exploration -  N-CMAPSS DS02

The new C-MAPSS dataset DS02 from NASA provides degradation trajectories of 9 turbofan engines with unknown and different initial health conditions, covering complete flights and two failure modes (HPT efficiency degradation, and HPT efficiency degradation combined with LPT efficiency and capacity degradation). The data were synthetically generated with the Commercial Modular Aero-Propulsion System Simulation (C-MAPSS) dynamical model. The data contain multivariate sensor readings of the complete run-to-failure trajectories; the records therefore stop at the cycle/time at which the engine failed. A total of 6.5M time stamps is available.

Copyright (c) by Manuel Arias.

In [4]:
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
%matplotlib inline

In [43]:
# Data files are very large and stored in HDF5 (.h5) format.
# The following function opens the file and returns its contents as pandas DataFrames plus the target array.
def read_h5_file(filename):
    """Load an N-CMAPSS HDF5 file and return its contents as DataFrames.

    For each of the groups W, X_s, X_v, T, Y and A the ``<group>_dev`` and
    ``<group>_test`` datasets are read and concatenated along axis 0
    (development rows first, test rows after). The column labels stored in
    ``<group>_var`` are converted to lists of unicode strings and used as
    DataFrame column names. A summary of shapes, types and label samples is
    printed along the way.

    Args:
        filename: Path of the HDF5 file to open (read-only).

    Returns:
        Tuple ``(df_W, df_X_s, df_X_v, df_T, df_A, Y)``: five pandas
        DataFrames built from the concatenated W, X_s, X_v, T and A arrays,
        and ``Y`` as a NumPy array (the RUL target, per the dataset naming
        used in this notebook).

    Raises:
        KeyError: If one of the expected datasets is missing from the file.
    """
    # Wall-clock timer: perf_counter includes the time spent waiting on file
    # I/O, which process_time (CPU time only) leaves out.
    start = time.perf_counter()

    data_groups = ('W', 'X_s', 'X_v', 'T', 'Y', 'A')
    label_groups = ('W', 'X_s', 'X_v', 'T', 'A')   # Y has no *_var dataset here

    data = {}
    labels = {}
    with h5py.File(filename, 'r') as hdf:
        for group in data_groups:
            # hdf[key] raises a KeyError naming the missing dataset, whereas
            # hdf.get(key) returns None and np.array(None) would only fail
            # later with an unrelated-looking error.
            dev = np.array(hdf[group + '_dev'])      # development set
            test = np.array(hdf[group + '_test'])    # test set
            data[group] = np.concatenate((dev, test), axis=0)

        for group in label_groups:
            # Materialise the stored names first, then cast them to unicode
            # ('U20') and turn the array into a plain list.
            raw_names = np.array(hdf[group + '_var'])
            labels[group] = list(np.array(raw_names, dtype='U20'))

    W, X_s, X_v = data['W'], data['X_s'], data['X_v']
    T, Y, A = data['T'], data['Y'], data['A']
    W_var, X_s_var, X_v_var = labels['W'], labels['X_s'], labels['X_v']
    T_var, A_var = labels['T'], labels['A']

    print('')
    print("Operation time (min): ", (time.perf_counter() - start) / 60)

    # Data arrays
    print("\nData Points")
    print("W shape: ", str(W.shape), type(W))
    print("X_s shape: ", str(X_s.shape), type(X_s))
    print("X_v shape: ", str(X_v.shape), type(X_v))
    print("T shape: ", str(T.shape), type(T))
    print("A shape: ", str(A.shape), type(A))

    # Feature labels
    print("\nLabels")
    print(f"W_var type: {type(W_var)} shape: {len(W_var)}, sample: {W_var[:5]}")
    print("X_s_var shape: ", type(X_s_var), len(X_s_var), X_s_var[:5])
    print("X_v_var shape: ", type(X_v_var), len(X_v_var), X_v_var[:5])
    print("T_var shape: ", type(T_var), len(T_var), T_var[:5])
    print("A_var shape: ", type(A_var), len(A_var), A_var[:5])

    # Target
    print("\nTarget")
    print("Y shape: ", type(Y), len(Y))

    # Wrap each array in a DataFrame whose columns carry the stored names
    df_W = DataFrame(data=W, columns=W_var)
    df_X_s = DataFrame(data=X_s, columns=X_s_var)
    df_X_v = DataFrame(data=X_v, columns=X_v_var)
    df_T = DataFrame(data=T, columns=T_var)
    df_A = DataFrame(data=A, columns=A_var)

    return df_W, df_X_s, df_X_v, df_T, df_A, Y

In [44]:
# Set-up: location of the N-CMAPSS DS02 HDF5 file to explore
filename = '/Volumes/share/Datasets/17_TurbofanEngineDegradationSimulationDataSet2/data_set/N-CMAPSS_DS02-006.h5'
df_W, df_X_s, df_X_v, df_T, df_A, Y = read_h5_file(filename)

# Report the type and shape of every DataFrame returned by the loader
print("\nDataframes from datapoints and labels")
for _label, _frame in (
    ('df_w', df_W),
    ('df_X_s', df_X_s),
    ('df_X_v', df_X_v),
    ('df_T', df_T),
    ('df_A', df_A),
):
    print(_label, type(_frame), _frame.shape)


Operation time (min):  0.060886650000000014

Data Points
W shape:  (6517190, 4) <class 'numpy.ndarray'>
X_s shape:  (6517190, 14) <class 'numpy.ndarray'>
X_v shape:  (6517190, 14) <class 'numpy.ndarray'>
T shape:  (6517190, 10) <class 'numpy.ndarray'>
A shape:  (6517190, 4) <class 'numpy.ndarray'>

Labels
W_var type: <class 'list'> shape: 4, sample: ['alt', 'Mach', 'TRA', 'T2']
X_s_var shape:  <class 'list'> 14 ['T24', 'T30', 'T48', 'T50', 'P15']
X_v_var shape:  <class 'list'> 14 ['T40', 'P30', 'P45', 'W21', 'W22']
T_var shape:  <class 'list'> 10 ['fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod']
A_var shape:  <class 'list'> 4 ['unit', 'cycle', 'Fc', 'hs']

Target
Y shape:  <class 'numpy.ndarray'> 6517190

Dataframes from datapoints and labels
df_w <class 'pandas.core.frame.DataFrame'> (6517190, 4)
df_X_s <class 'pandas.core.frame.DataFrame'> (6517190, 14)
df_X_v <class 'pandas.core.frame.DataFrame'> (6517190, 14)
df_T <class 'pandas.core.frame.DataFrame'> (