In [5]:
import CMAPSAuxFunctions
import random
import numpy as np
import pandas as pd
import math
import time

from IPython.display import display, HTML
%matplotlib notebook

In [6]:
# Sliding-window configuration and input file locations for this cell.
windowSize = 30
stride = 1
data_file_train = '../CMAPSSData/train_FD001.txt'
data_file_test = '../CMAPSSData/test_FD001.txt'
selected_features = ['T24', 'T30', 'T50', 'P30', 'Nf', 'Nc', 'Ps30', 'phi', 'NRf', 'NRc', 
                     'BPR', 'htBleed', 'W31', 'W32']


df_train = CMAPSAuxFunctions.load_into_df(data_file_train)
df_test = CMAPSAuxFunctions.load_into_df(data_file_test)


# time.clock() was removed in Python 3.8; time.perf_counter() is the
# supported clock for measuring elapsed intervals.
start = time.perf_counter()
X_train, y_train, X_cv, y_cv = CMAPSAuxFunctions.create_windowed_data(df_train, selected_features, 'train', 
                                                                            time_window = windowSize, stride = stride,
                                                                           crossValidationRatio = 0.1)
end = time.perf_counter()
print("Elapsed time: {}".format(str(end-start)))

start = time.perf_counter()
X_test, y_test, _, _ = CMAPSAuxFunctions.create_windowed_data(df_test, selected_features, 'test', 
                                                                            time_window = windowSize, stride = stride)
end = time.perf_counter()
print("Elapsed time: {}".format(str(end-start)))


Elapsed time: 1.5431569999999994
Elapsed time: 0.9871559999999997


In [7]:
# Collect the (title, X, y) triples once so both report sections iterate
# over the same datasets in the same order.
datasets = [("Training data (X, y)", X_train, y_train)]
if X_cv is not None:
    # The cross-validation split is optional; report it only when present.
    datasets.append(("Cross-Validation data (X, y)", X_cv, y_cv))
datasets.append(("Testing data (X, y)", X_test, y_test))

print("Printing shapes\n")
for title, X_part, y_part in datasets:
    print(title)
    print(X_part.shape)
    print(y_part.shape)

print("Printing first 5 elements\n")
for title, X_part, y_part in datasets:
    print(title)
    print(X_part[:5,:])
    print(y_part[:5,:])

Printing shapes

Training data (X, y)
(15916, 420)
(15916, 1)
Cross-Validation data (X, y)
(10, 420)
(10, 1)
Testing data (X, y)
(100, 420)
(100, 1)
Printing first 5 elements

Training data (X, y)
[[  641.82    1589.7     1400.6    ...,   390.        39.05      23.411 ]
 [  642.15    1591.82    1403.14   ...,   392.        38.94      23.3353]
 [  642.35    1587.99    1404.2    ...,   392.        39.02      23.4999]
 [  642.35    1582.79    1401.87   ...,   392.        38.83      23.3506]
 [  642.37    1582.85    1406.22   ...,   392.        38.81      23.3092]]
[[ 125.]
 [ 125.]
 [ 125.]
 [ 125.]
 [ 125.]]
Cross-Validation data (X, y)
[[  642.91    1594.7     1414.93   ...,   393.        38.88      23.2242]
 [  642.92    1594.11    1402.33   ...,   394.        38.61      23.2125]
 [  642.5     1595.22    1407.99   ...,   395.        38.66      23.1985]
 [  642.64    1589.85    1410.59   ...,   394.        38.41      23.2056]
 [  642.27    1578.13    1407.15   ...,   391.        38.98  

In [13]:
num_engines = 90
splittingRatio = 0.15

# Engine ids run from 1 to num_engines inclusive; shuffle them in place.
shuffledEngines = [engine_id for engine_id in range(1, num_engines + 1)]
random.shuffle(shuffledEngines)
print(shuffledEngines)

# Position in the shuffled list at which the two groups are separated.
i = int(splittingRatio * num_engines)

test_engines, crossVal_engines = shuffledEngines[:i], shuffledEngines[i:]

for engine_group in (test_engines, crossVal_engines):
    print(engine_group)

[70, 11, 12, 61, 79, 19, 20, 51, 85, 13, 63, 45, 32, 82, 46, 4, 78, 65, 50, 58, 23, 35, 28, 21, 49, 41, 62, 18, 47, 69, 55, 29, 27, 39, 53, 81, 15, 26, 7, 5, 8, 36, 90, 34, 9, 83, 80, 30, 87, 40, 54, 67, 2, 48, 17, 10, 16, 75, 25, 42, 71, 86, 22, 38, 52, 72, 3, 60, 88, 44, 73, 37, 64, 66, 57, 76, 77, 1, 89, 14, 24, 56, 68, 84, 6, 59, 74, 33, 43, 31]
[70, 11, 12, 61, 79, 19, 20, 51, 85, 13, 63, 45, 32]
[82, 46, 4, 78, 65, 50, 58, 23, 35, 28, 21, 49, 41, 62, 18, 47, 69, 55, 29, 27, 39, 53, 81, 15, 26, 7, 5, 8, 36, 90, 34, 9, 83, 80, 30, 87, 40, 54, 67, 2, 48, 17, 10, 16, 75, 25, 42, 71, 86, 22, 38, 52, 72, 3, 60, 88, 44, 73, 37, 64, 66, 57, 76, 77, 1, 89, 14, 24, 56, 68, 84, 6, 59, 74, 33, 43, 31]


In [65]:
def compute_training_RUL(df_row, *args):
    """Return the capped remaining-life value for a single row.

    ``args[0]`` is a per-unit vector indexed with ``int(df_row['Unit Number']) - 1``;
    ``df_row['Cycle']`` is subtracted from the looked-up entry. ``args[1]`` is
    the upper cap: it is returned whenever the difference exceeds it, otherwise
    the difference itself is returned.
    """
    rul_vector, constRUL = args[0], args[1]

    unit_index = int(df_row['Unit Number']) - 1
    remaining = rul_vector[unit_index] - df_row['Cycle']

    return constRUL if remaining > constRUL else remaining


def get_X_y_from_df(df, time_window, features, num_units, dataset_type, stride=1):
    """Build flattened time-window samples and their targets from a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns listed in `features` plus 'Unit Number' and 'RUL'.
    time_window : int
        Number of consecutive rows that make up one sample.
    features : list of str
        Columns whose values are placed in each sample.
    num_units : int
        Number of units to process, taken in order of first appearance of
        their 'Unit Number' in `df`.
    dataset_type : str
        'train' or 'cross_validation' produces every window of every unit
        (advancing by `stride`); any other value produces only the last
        `time_window` rows of each unit.
    stride : int, optional
        Step, in rows, between the starts of consecutive windows.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, len(features) * time_window)
        Each row is one window flattened row by row.
    y : numpy.ndarray, shape (n_samples, 1)
        In sliding mode, the 'RUL' value of the last row of each window.
        In last-window mode no target is computed and y is filled with NaN
        (it used to be uninitialised memory).

    Raises
    ------
    ValueError
        In last-window mode, if a unit has fewer than `time_window` rows.
    """

    n_x = len(features)
    sliding = dataset_type in ('train', 'cross_validation')

    df_unit_values = []
    targets_unit = []
    engine_windows = []

    engineNumbers = df['Unit Number'].unique()

    # Cache each unit's values and count its windows so that the output
    # arrays can be allocated once with the right size.
    for j in range(num_units):

        unit_id = engineNumbers[j]
        df_unit = df.loc[df['Unit Number'] == unit_id]
        n_rows = df_unit.shape[0]

        if not sliding and n_rows < time_window:
            raise ValueError(
                "Unit {} has {} rows, fewer than time_window={}".format(unit_id, n_rows, time_window))

        df_unit_values.append(df_unit[features].values)
        targets_unit.append(df_unit['RUL'].values)
        # A unit shorter than the window contributes no sample; clamp at 0 so
        # a negative count cannot shrink the allocation below what is written.
        engine_windows.append(max(0, (n_rows - time_window) // stride + 1))

    n_m = sum(engine_windows) if sliding else num_units

    X = np.empty([n_m, n_x*time_window])
    # NaN marks "no target available" instead of returning arbitrary memory.
    y = np.full([n_m, 1], np.nan)

    k = 0

    # Fill the sample matrix unit by unit.
    for i in range(num_units):

        if sliding:
            for j in range(engine_windows[i]):
                first = j*stride
                last = first + time_window
                X[k, :] = df_unit_values[i][first:last, :].reshape(-1)
                y[k] = targets_unit[i][last - 1]
                k = k + 1
        else:
            X[k, :] = df_unit_values[i][-time_window:, :].reshape(-1)
            k = k + 1

    return X, y


def retrieve_and_reshape_data(from_file, selected_features, dataset_type, time_window=10, constRUL=125, unit_number=None, stride=1, crossValidationRatio=0):
    '''Read a whitespace-separated data file and turn it into windowed arrays.

    The file is read without a header, its first 26 columns are named (see the
    table below), an 'RUL' column is generated through generate_df_withRUL and
    the result is windowed with get_X_y_from_df.

    Parameters:
    from_file            - path of the file to read
    selected_features    - names of the columns used as features
    dataset_type         - passed on to get_X_y_from_df; a cross-validation
                           split is only made when this is 'train'
    time_window          - number of rows per window
    constRUL             - cap passed on to generate_df_withRUL
    unit_number          - if not None, keep only rows of this unit and
                           relabel them as unit 1
    stride               - step between consecutive windows
    crossValidationRatio - value in [0, 1] passed to split_dataFrames;
                           0 disables the split

    Returns (X, y, X_crossVal, y_crossVal). The last two are None when no
    cross-validation split is made. If crossValidationRatio is outside [0, 1]
    an error message is printed and None is returned.

    Column index, name, description and units of the sensor columns:

    5    T2        - Total temperature at fan inlet      R
    6    T24       - Total temperature at lpc outlet     R
    7    T30       - Total temperature at hpc outlet     R
    8    T50       - Total temperature at LPT outlet     R
    9    P2        - Pressure at fan inlet               psia
    10   P15       - Total pressure in bypass-duct       psia
    11   P30       - Total pressure at HPC outlet        psia
    12   Nf        - Physical fan speed                  rpm
    13   Nc        - Physical core speed                 rpm
    14   epr       - Engine Pressure ratio (P50/P2)      --
    15   Ps30      - Static Pressure at HPC outlet       psia
    16   phi       - Ratio fuel flow to Ps30             pps/psi
    17   NRf       - corrected fan speed                 rpm
    18   NRc       - Corrected core speed                rpm
    19   BPR       - Bypass ratio                        --
    20   farB      - Burner fuel-air ratio               --
    21   htBleed   - Bleed enthalpy                      --
    22   Nf_dmd    - Demanded fan speed                  rpm
    23   PCNfR_dmd - Demanded corrected fan speed        rpm
    24   W31       - HPT coolant bleed                   lbm/s
    25   W32       - LPT coolant bleed                   lbm/s
    '''

    X_crossVal, y_crossVal = None, None

    if crossValidationRatio < 0 or crossValidationRatio > 1 :
        print("Error, cross validation must be between 0 and 1")
        return

    # Raw string: a backslash-s inside a normal literal is an invalid escape sequence.
    df = pd.read_csv(from_file, sep=r'\s+', header=None)

    col_names = {0:'Unit Number', 1:'Cycle', 2:'Op. Settings 1', 3:'Op. Settings 2', 4:'Op. Settings 3', 5:'T2',
                6:'T24', 7:'T30', 8:'T50', 9:'P2', 10:'P15', 11:'P30', 12:'Nf', 13:'Nc', 14:'epr', 15:'Ps30', 
                16:'phi', 17:'NRf', 18:'NRc', 19:'BPR', 20:'farB', 21:'htBleed', 22:'Nf_dmd', 23:'PCNfR_dmd', 
                24:'W31', 25:'W32'}

    df = df.rename(columns=col_names)

    #In case a specific unit number is needed
    if unit_number is not None:
        # Copy the filtered rows so the relabelling below writes to an
        # independent frame rather than to a slice of the original.
        df = df[df['Unit Number'] == unit_number].copy()
        df['Unit Number'] = 1

    df_rul, num_units = generate_df_withRUL(df, selected_features, constRUL)

    #Split for cross-validation
    if crossValidationRatio != 0 and dataset_type == 'train': 
        df_train, df_crossVal, num_train, num_crossVal = split_dataFrames(df_rul, crossValidationRatio)

        # Truncate each held-out unit; the returned vector becomes the
        # cross-validation target.
        df_crossVal, rul_crossVal = generate_cross_validation_from_df(df_crossVal, time_window)

        X, y = get_X_y_from_df(df_train, time_window, selected_features, num_train, 
                               dataset_type, stride=stride)

        # 'test' mode keeps only the last window of each truncated unit.
        X_crossVal, _ = get_X_y_from_df(df_crossVal, time_window, selected_features, num_crossVal, 
                                        'test', stride=stride)

        y_crossVal = rul_crossVal
    else:
        X, y = get_X_y_from_df(df_rul, time_window, selected_features, num_units, dataset_type, stride=stride)

    return X, y, X_crossVal, y_crossVal


def generate_df_withRUL(df, selected_features, constRUL):
    """Compute a capped 'RUL' column and keep only the selected columns.

    For every row, RUL is (number of rows of that row's unit) - 'Cycle',
    limited from above by `constRUL`. The unit's row count is looked up by
    its actual 'Unit Number', so unit ids need not be 1..N or contiguous.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Unit Number', 'Cycle' and every column in
        `selected_features`. It is not modified.
    selected_features : sequence of str
        Feature columns to keep.
    constRUL : number
        Upper cap applied to the RUL values.

    Returns
    -------
    (pandas.DataFrame, int)
        A new frame with columns `selected_features` + ['Unit Number', 'RUL'],
        and the number of distinct units.
    """

    grouped_by_unit = df.groupby('Unit Number')
    num_units = grouped_by_unit.ngroups

    # Row-aligned group size: each row gets the row count of its own unit.
    rows_per_unit = grouped_by_unit['Cycle'].transform('size')
    rul = (rows_per_unit - df['Cycle']).clip(upper=constRUL)

    selected_features_rul = list(selected_features) + ['Unit Number', 'RUL']
    # assign() returns a new frame, leaving the caller's dataframe untouched.
    df_selected_features = df.assign(RUL=rul)[selected_features_rul]

    return df_selected_features, num_units


def split_dataFrames(df, splittingRatio):
    """Randomly split a dataframe by unit into training and cross-validation parts.

    The distinct values of 'Unit Number' present in `df` are shuffled with
    the `random` module; the first int(splittingRatio * n_units) of them
    form the cross-validation set and the remainder the training set. Using
    the ids actually present (rather than 1..max) keeps the returned counts
    correct when ids are not contiguous.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Unit Number' column.
    splittingRatio : float
        Fraction of units assigned to cross-validation.

    Returns
    -------
    (df_train, df_crossVal, num_train, num_crossVal)
        The two row subsets of `df` and the number of units in each.
    """

    # Sorted so that, for ids 1..N, the list shuffled is exactly [1, ..., N].
    shuffledEngines = sorted(df['Unit Number'].unique().tolist())
    random.shuffle(shuffledEngines)

    num_engines = len(shuffledEngines)
    num_crossVal = int(splittingRatio*num_engines)
    num_train = num_engines - num_crossVal

    crossVal_engines = shuffledEngines[:num_crossVal]
    train_engines = shuffledEngines[num_crossVal:]

    df_train = df[df['Unit Number'].isin(train_engines)]
    df_crossVal = df[df['Unit Number'].isin(crossVal_engines)]

    return (df_train, df_crossVal, num_train, num_crossVal)


def generate_cross_validation_from_df(df, window_size):
    """Truncate every unit at a random row to build a cross-validation set.

    For each 'Unit Number' group, a cut point is drawn with
    random.randint(window_size, group_size) (both ends inclusive), the first
    `cut` rows are kept, and group_size - cut is recorded as that unit's
    target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Unit Number' column.
    window_size : int
        Minimum number of rows kept per unit.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        A new dataframe rebuilt from the kept rows' values (same columns as
        `df`, fresh 0..n-1 index) and an array of shape (n_units, 1) with the
        number of rows removed from each unit, in groupby order.

    Raises
    ------
    ValueError
        If a unit has fewer than `window_size` rows.
    """

    groupedByUnit = df.groupby('Unit Number')
    ruls = np.zeros((groupedByUnit.ngroups, 1))
    truncated_units = []

    #Truncate readings up to a random number larger than window size but less than total size
    for count, (engineNumber, df_unit) in enumerate(groupedByUnit):
        total_rows = len(df_unit)
        if total_rows < window_size:
            raise ValueError(
                "Unit {} has {} rows, fewer than window_size={}".format(engineNumber, total_rows, window_size))

        truncateAt = random.randint(window_size, total_rows)
        ruls[count] = total_rows - truncateAt
        truncated_units.append(df_unit.values[:truncateAt])

    # Concatenate once at the end instead of re-copying the growing array
    # on every iteration.
    data = np.concatenate(truncated_units) if truncated_units else []

    df_truncated = pd.DataFrame(data=data, columns=df.columns)

    return df_truncated, ruls

In [66]:
# Sliding-window configuration and input file locations for this cell.
windowSize = 30
stride = 1
data_file_train = '../CMAPSSData/train_FD001.txt'
data_file_test = '../CMAPSSData/test_FD001.txt'
selected_features = ['T24', 'T30', 'T50', 'P30', 'Nf', 'Nc', 'Ps30', 'phi', 'NRf', 'NRc', 
                     'BPR', 'htBleed', 'W31', 'W32']


# Training windows plus a cross-validation split taken from the training file.
X_train, y_train, X_cv, y_cv = retrieve_and_reshape_data(data_file_train, selected_features, 'train', 
                                                         time_window = windowSize, stride = stride,
                                                         crossValidationRatio = 0.2)

# The test arrays must come from the test file; previously they were built
# from the training file, so "test" data overlapped the training data.
# crossValidationRatio is omitted because it is only used for 'train'.
X_test, y_test, _, _ = retrieve_and_reshape_data(data_file_test, selected_features, 'test', 
                                                 time_window = windowSize, stride = stride)

3 74 [ 105.]
6 184 [ 4.]
10 69 [ 153.]
12 157 [ 13.]
15 99 [ 108.]
23 74 [ 94.]
25 202 [ 28.]
28 165 [ 0.]
30 134 [ 60.]
31 103 [ 131.]
45 36 [ 122.]
49 106 [ 109.]
56 35 [ 240.]
59 146 [ 85.]
60 88 [ 84.]
71 199 [ 9.]
74 87 [ 79.]
75 34 [ 195.]
92 134 [ 207.]
93 118 [ 37.]


In [67]:
# Collect the (title, X, y) triples once so both report sections iterate
# over the same datasets in the same order.
datasets = [("Training data (X, y)", X_train, y_train)]
if X_cv is not None:
    # The cross-validation split is optional; report it only when present.
    datasets.append(("Cross-Validation data (X, y)", X_cv, y_cv))
datasets.append(("Testing data (X, y)", X_test, y_test))

print("Printing shapes\n")
for title, X_part, y_part in datasets:
    print(title)
    print(X_part.shape)
    print(y_part.shape)

print("Printing first 5 elements\n")
for title, X_part, y_part in datasets:
    print(title)
    print(X_part[:5,:])
    print(y_part[:5,:])

Printing shapes

Training data (X, y)
(14204, 420)
(14204, 1)
Cross-Validation data (X, y)
(20, 420)
(20, 1)
Testing data (X, y)
(100, 420)
(100, 1)
Printing first 5 elements

Training data (X, y)
[[  641.82    1589.7     1400.6    ...,   390.        39.05      23.411 ]
 [  642.15    1591.82    1403.14   ...,   392.        38.94      23.3353]
 [  642.35    1587.99    1404.2    ...,   392.        39.02      23.4999]
 [  642.35    1582.79    1401.87   ...,   392.        38.83      23.3506]
 [  642.37    1582.85    1406.22   ...,   392.        38.81      23.3092]]
[[ 125.]
 [ 125.]
 [ 125.]
 [ 125.]
 [ 125.]]
Cross-Validation data (X, y)
[[  642.15    1582.99    1398.33   ...,   392.        38.93      23.3845]
 [  642.67    1589.15    1419.23   ...,   397.        38.58      22.9949]
 [  642.12    1583.04    1393.76   ...,   392.        38.98      23.3893]
 [  642.93    1592.17    1417.31   ...,   394.        38.37      23.203 ]
 [  642.18    1590.23    1398.83   ...,   393.        38.97  