<a href="https://colab.research.google.com/github/francoisdoanp/MLTBP/blob/master/Project_MLv2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine learning - Final Project**

# Turbofan engine degradation dataset (NASA)

# Data Preparation

**Importing necessary packages**



In [0]:

import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
import math
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from sklearn.preprocessing import PolynomialFeatures
from keras import regularizers
import keras.backend as k

**Importing the Turbofan engine degradation dataset.**

**Files are located in the following Github repository: https://github.com/francoisdoanp/MLTBP**

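We have 4 training datasets, each of which contains information about a fleet of engines (one hundred or more per dataset), all of the same type. Thus, we will combine the four training sets into a single training set, and likewise the four test sets into a single test set.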

The training and test sets have 26 columns: ID, Time (Cycles), 3 columns for operational settings and 21 sensor measurements.

The training and testing sets have the same format, while the validation sets only contain the real RUL (remaining useful life).

For more information on the data, consult the readme at the following address: https://github.com/francoisdoanp/MLTBP/blob/master/readme.txt

In [0]:
url_base = 'https://raw.githubusercontent.com/francoisdoanp/MLTBP/master/'

# File names of the four subsets (FD001-FD004): training, test and true-RUL files.
file_train_1, file_train_2, file_train_3, file_train_4 = (
    'train_FD001.txt', 'train_FD002.txt', 'train_FD003.txt', 'train_FD004.txt')

file_test_1, file_test_2, file_test_3, file_test_4 = (
    'test_FD001.txt', 'test_FD002.txt', 'test_FD003.txt', 'test_FD004.txt')

file_valid_1, file_valid_2, file_valid_3, file_valid_4 = (
    'RUL_FD001.txt', 'RUL_FD002.txt', 'RUL_FD003.txt', 'RUL_FD004.txt')


# Training and test files are space separated and have no header row.
pt1, pt2, pt3, pt4 = [
    pd.read_csv(url_base + file_name, sep=' ', header=None)
    for file_name in (file_train_1, file_train_2, file_train_3, file_train_4)
]

pte1, pte2, pte3, pte4 = [
    pd.read_csv(url_base + file_name, sep=' ', header=None)
    for file_name in (file_test_1, file_test_2, file_test_3, file_test_4)
]

# The RUL files are read with the default separator (a single column per file).
pv1, pv2, pv3, pv4 = [
    pd.read_csv(url_base + file_name, header=None)
    for file_name in (file_valid_1, file_valid_2, file_valid_3, file_valid_4)
]


# Updating ids
# Column 0 holds the engine id. Each subset is shifted so that ids stay unique
# once the subsets are stacked. Note that the test offsets (359 / 459) differ
# from the training offsets (360 / 460).
# NOTE(review): presumably the FD002 test file has one engine fewer than the
# FD002 training file - confirm against the data.

pt2[0] = pt2[0] + 100
pt3[0] = pt3[0] + 360
pt4[0] = pt4[0] + 460

pte2[0] = pte2[0] + 100
pte3[0] = pte3[0] + 359
pte4[0] = pte4[0] + 459


# Joining the dataframes

train_pd = pd.concat([pt1, pt2, pt3, pt4], ignore_index=True)
test_pd = pd.concat([pte1, pte2, pte3, pte4], ignore_index=True)
valid_pd = pd.concat([pv1, pv2, pv3, pv4], ignore_index=True)

# Columns 26 and 27 are discarded.
# NOTE(review): presumably these are empty columns created by trailing
# separators in the raw files - verify.
train_pd = train_pd.drop(columns=train_pd.columns[[26, 27]])
test_pd = test_pd.drop(columns=test_pd.columns[[26, 27]])


# Assigning labels to Dataframe's columns based on the Readme

column_names = (['id', 'Time (Cycles)', 'OS1', 'OS2', 'OS3']
                + [f'S{i}' for i in range(1, 22)])

train_pd.columns = list(column_names)
test_pd.columns = list(column_names)
valid_pd.columns = ['RUL']

#Loading scaler

scaler = StandardScaler()
  

**Adding variables Condition and Fault mode**

Note:

**Condition (ONE)** and **Fault ONE** are binary variables.

When Condition(ONE) = 1 (true), it means that the condition is at Sea Level

When Condition(ONE) = 0 (false), the condition is NOT at sea level, so the data belong to the second condition: SIX.

When Fault ONE = 1 (true), it means that the fault mode is ONE (HPC Degradation)

When Fault ONE = 0 (false), it means that the fault mode is TWO (HPC Degradation and Fan degradation)

In [0]:
# Adding variables Condition and fault modes

def value_condition_train(row):
  """Return the 'Condition (One)' flag for one row of the training set.

  Ids up to 100 and ids in (360, 460] map to 1; every other id maps to 0.
  """
  engine_id = row['id']
  in_first_range = engine_id <= 100
  in_third_range = 360 < engine_id <= 460
  return 1 if (in_first_range or in_third_range) else 0
  
def value_fault_train(row):
  """Return the 'Fault ONE' flag for one row of the training set.

  Ids up to and including 360 map to 1; larger ids map to 0.
  """
  return 1 if row['id'] <= 360 else 0
  
def value_condition_test(row):
  """Return the 'Condition (One)' flag for one row of the test set.

  Ids up to 100 and ids in (359, 459] map to 1; every other id maps to 0.
  """
  engine_id = row['id']
  if engine_id <= 100:
    return 1
  if 359 < engine_id <= 459:
    return 1
  return 0
  
def value_fault_test(row):
  """Return the 'Fault ONE' flag for one row of the test set.

  Ids up to and including 359 map to 1; larger ids map to 0.
  """
  has_fault_one = row['id'] <= 359
  return 1 if has_fault_one else 0


# Derive the two binary columns row by row from the engine id; the training
# and test frames use different id ranges, hence the separate helper functions.
for frame, condition_fn, fault_fn in (
    (train_pd, value_condition_train, value_fault_train),
    (test_pd, value_condition_test, value_fault_test),
):
  frame['Condition (One)'] = frame.apply(condition_fn, axis=1)
  frame['Fault ONE'] = frame.apply(fault_fn, axis=1)

display(train_pd)

At this stage, we create the ground-truth remaining useful life (RUL) for the training set.

Important note: In the training set, the last cycle (represented in the table by 'Time (Cycles)') is when the engine is considered unusable. However, in the test set, the last cycle IS NOT when the engine is considered unusable; the engine will fail at a later time. The true RUL at the last recorded cycle of each test engine is therefore provided separately, in valid_pd.

In [0]:
#Adding column for remaining useful life (RUL)

# In the training set the last recorded cycle of an engine is the cycle at
# which it is considered unusable, so
#   RUL = (last cycle of that engine) - (current cycle).
# transform('max') broadcasts each engine's last cycle back onto its rows,
# which avoids the merge / rename round-trip and the positional `axis`
# argument of DataFrame.drop (keyword-only in pandas 2.x).
last_cycle = train_pd.groupby('id')['Time (Cycles)'].transform('max')
train_pd['RUL'] = last_cycle - train_pd['Time (Cycles)']

# Selected by label instead of by position so that the targets do not
# silently change if a column is added or removed upstream.
y_train = train_pd['RUL']
y_train_id = train_pd[['id', 'RUL']]


In [0]:
def model_score(y_true, y_pred, a1=10, a2=13):
  """Asymmetric exponential score of RUL predictions (lower is better).

  With d = predicted RUL - true RUL, each prediction contributes
  exp(-d / a1) - 1 when d < 0 and exp(d / a2) - 1 otherwise, so a perfect
  prediction contributes exactly 0. The contributions are summed.

  Parameters
  ----------
  y_true : pandas Series or single-column DataFrame
      True RUL values. They are aligned with `y_pred` on the index, so the
      index should be the default 0..n-1 range used for `y_pred`.
  y_pred : array-like
      Predicted RUL values, one per row (1-D, or 2-D with a single column).
  a1, a2 : float, optional
      Scale constants for the d < 0 and d >= 0 branches respectively.
      NOTE(review): some write-ups of this score use 13 for the d < 0 branch
      and 10 for the d >= 0 branch - confirm which convention is intended.

  Returns
  -------
  float
      Sum of the penalties; NaN if the inputs do not align row for row.
  """
  pred_df = pd.DataFrame(y_pred)
  test_err = pd.concat([y_true, pred_df], axis=1, ignore_index=True)
  test_err.columns = ['RUL', 'Pred_RUL']

  d = (test_err['Pred_RUL'] - test_err['RUL']).to_numpy(dtype=float)

  # np.expm1(x) is already exp(x) - 1; subtracting a further 1 (as the earlier
  # version did) shifted every term by -1 and made a perfect prediction
  # score -1 instead of 0.
  penalties = np.where(d < 0, np.expm1(-d / a1), np.expm1(d / a2))
  return float(penalties.sum())

**Preparing data**

In [0]:
# Scaling data

# Columns at positions 2..25 are the operational settings and the sensor
# readings; id, 'Time (Cycles)' and the columns after position 25 are left
# unscaled.
feature_cols = list(train_pd.columns[2:26])

# Fit the scaler on the training data only ...
train_pd_scaled = train_pd.copy()
train_pd_scaled[feature_cols] = scaler.fit_transform(train_pd[feature_cols])

# ... and reuse the training mean / variance for the test data. Re-fitting on
# the test set would standardise it with different statistics, so the same raw
# sensor value would map to different model inputs in train and test.
test_pd_scaled = test_pd.copy()
test_pd_scaled[feature_cols] = scaler.transform(test_pd[feature_cols])

pd.set_option('display.max_columns', None)
# None means "no limit"; the old -1 sentinel is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

print(train_pd_scaled)


In [0]:
# Removing RUL, id and 'Time (Cycles)' columns, as we do not want these features to be in our predictors
# NOTE(review): `train_RA_pd` is not defined in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel - restore the cell that builds it or remove this one.

train_pd_ra = train_RA_pd.copy()
train_pd_ra = train_pd_ra.drop(['id', 'RUL','Time (Cycles)'],axis=1)



In [0]:
# Removing RUL, id and 'Time (Cycles)' columns, as we do not want these features to be in our predictors

train_pd_lm = train_pd_scaled.drop(columns=['id', 'RUL', 'Time (Cycles)'])

# Keeping only last time cycle for each id
# (the string alias 'max' is used instead of the builtin `max`, which pandas
# has deprecated as an argument to transform)

idx = test_pd_scaled.groupby('id')['Time (Cycles)'].transform('max') == test_pd_scaled['Time (Cycles)']
test_pd_lm = test_pd_scaled[idx]

# NOTE(review): `test_RA_pd` is not defined in any cell visible in this
# notebook - confirm the cell that builds it still exists.
idx4 = test_RA_pd.groupby('id')['Time (Cycles)'].transform('max') == test_RA_pd['Time (Cycles)']
test_pd_ra = test_RA_pd[idx4]

# Removing id and 'Time (Cycles)' columns

test_pd_lm = test_pd_lm.drop(columns=['id', 'Time (Cycles)'])
test_pd_ra = test_pd_ra.drop(columns=['id', 'Time (Cycles)'])

# Targets matching the rolling-average frame: the first 10 rows of every
# engine are dropped.
# NOTE(review): presumably 10 is the rolling-window length used to build
# train_RA_pd - confirm.
y_train_ra = y_train_id.groupby('id').apply(lambda group: group.iloc[10:]).reset_index(drop=True)
y_train_ra = y_train_ra.drop(['id'], axis=1)


# **Model 3.2: Neural Network with LSTM architecture**

In [0]:
# Using Azure tutorial for predictive maintenance for data manipulation
# Reference: https://github.com/Azure/lstms_for_predictive_maintenance/blob/master/Deep%20Learning%20Basics%20for%20Predictive%20Maintenance.ipynb

# Picking a sequence length - This will be the window of time in which the LSTM will gather data from
# (number of consecutive rows in each input window; test engines with fewer rows than this
# are excluded when the test sequences are built further down)

sequence_length = 50


# This function will reshape our data so it can be usable with Keras (Samples, time window, features)
def gen_sequence(id_df, seq_length, seq_cols):
  """Yield sliding windows of `seq_length` consecutive rows of `id_df[seq_cols]`.

  Windows start at rows 0 .. n - seq_length - 1, where n is the number of
  rows, so the window that ends on the final row is not produced. Nothing is
  yielded when n <= seq_length. Each window is a 2-D array of shape
  (seq_length, len(seq_cols)).
  """
  values = id_df[seq_cols].values
  window_count = values.shape[0] - seq_length
  for first_row in range(window_count):
    yield values[first_row:first_row + seq_length, :]

# Reference column names: time and operational settings, then the 21 sensors,
# then the two binary condition / fault flags.

sensor_cols = [f'S{i}' for i in range(1, 22)]
other_cols = ['Condition (One)', 'Fault ONE']
sequence_cols = ['Time (Cycles)', 'OS1', 'OS2', 'OS3'] + sensor_cols + other_cols

# Generating sequences: one list of windows per engine, in order of first
# appearance of the engine id.

seq_gen = (
    list(gen_sequence(train_pd_scaled.loc[train_pd_scaled['id'] == engine_id],
                      sequence_length, sequence_cols))
    for engine_id in train_pd_scaled['id'].unique()
)

# Stack the per-engine windows into one array and convert it to float32

seq_array = np.concatenate(list(seq_gen)).astype(np.float32)
print(seq_array.shape)

def gen_labels(id_df, seq_length, label):
  """Return the rows of `id_df[label]` from position `seq_length` onwards.

  `label` is a list of column names, so the result is a 2-D array with one
  column per name; it is empty when `id_df` has at most `seq_length` rows.
  """
  label_values = id_df[label].values
  return label_values[seq_length:, :]

# One label array per engine, in the same engine order as the sequences.
label_gen = []
for engine_id in train_pd_scaled['id'].unique():
  engine_rows = train_pd_scaled.loc[train_pd_scaled['id'] == engine_id]
  label_gen.append(gen_labels(engine_rows, sequence_length, ['RUL']))

label_array = np.concatenate(label_gen).astype(np.float32)
print(label_array.shape)

(124909, 50, 27)
(124909, 1)


In [0]:
# Building the RNN

nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

# Four stacked LSTM layers, each followed by dropout. Only the first layer of
# a Sequential model needs `input_shape`; the following layers take their
# input shape from the previous layer's output.
model = Sequential()
model.add(LSTM(units=100, return_sequences=True, input_shape=(sequence_length, nb_features)))
model.add(Dropout(0.2))
model.add(LSTM(units=100, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=100, return_sequences=True))
model.add(Dropout(0.2))
# The last LSTM returns only its final output so the Dense head sees one
# vector per sample.
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Linear output for regression on the RUL.
model.add(Dense(units=nb_out))
model.add(Activation('linear'))
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()


In [0]:
# Fitting the RNN

# Stop training on the validation loss.
# NOTE(review): patience=0 ends training at the first epoch whose val_loss
# does not improve - confirm this is intended.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='min',
)

model.fit(
    seq_array,
    label_array,
    epochs=100,
    batch_size=200,
    validation_split=0.05,
    verbose=1,
    callbacks=[early_stopping],
)

In [0]:
# Adding RUL to test set

# Last recorded cycle of every test engine (one row per id, sorted by id).
max_hid = pd.DataFrame(test_pd.groupby('id')['Time (Cycles)'].max()).reset_index()
max_hid.columns = ['id','max']

# valid_pd holds the true RUL at the last recorded cycle of each test engine;
# row i is taken to belong to engine id i + 1. Adding the last recorded cycle
# gives the cycle at which the engine actually fails.
truth_nn = valid_pd.copy()
truth_nn.columns = ['truth']
truth_nn['id'] = truth_nn.index + 1
# The addition below is aligned on the row index, so both frames must have
# exactly one row per test engine.
assert len(truth_nn) == len(max_hid), 'valid_pd must have one row per test engine'
truth_nn['truth'] = truth_nn['truth'] + max_hid['max']

# Start from the *scaled* test frame: the LSTM is trained on train_pd_scaled,
# so feeding it raw sensor values would give meaningless predictions.
# 'Time (Cycles)' is not among the scaled columns, so the RUL arithmetic below
# is unaffected.
test_pd_nn = test_pd_scaled.merge(truth_nn, on=['id'], how='left')
test_pd_nn['RUL'] = test_pd_nn['truth'] - test_pd_nn['Time (Cycles)']
test_pd_nn = test_pd_nn.drop(columns='truth')

print(test_pd_nn)

In [0]:
# Preparing test set

# Rows of each test engine, in order of first appearance of the engine id.
test_engine_ids = test_pd_nn['id'].unique()
rows_by_engine = [test_pd_nn[test_pd_nn['id'] == engine_id] for engine_id in test_engine_ids]

# Only engines with at least `sequence_length` rows can provide a full window.
y_mask = [len(engine_rows) >= sequence_length for engine_rows in rows_by_engine]

# One window per kept engine: its last `sequence_length` rows.
seq_array_test = np.asarray([
    engine_rows[sequence_cols].values[-sequence_length:]
    for engine_rows, keep in zip(rows_by_engine, y_mask)
    if keep
]).astype(np.float32)

# Matching labels: the RUL at the last row of each kept engine, as a column vector.
last_rul = test_pd_nn.groupby('id')['RUL'].nth(-1)
label_array_test = last_rul[y_mask].values.reshape(-1, 1).astype(np.float32)

print(seq_array_test.shape)
print(label_array_test.shape)

In [0]:
# Evaluating the model on the test set

y_pred_lstm = model.predict(seq_array_test, batch_size=32, verbose=1)

# scikit-learn convention: (y_true, y_pred)
result_lstm = metrics.mean_absolute_error(label_array_test, y_pred_lstm)

print(f'The mean absolute error for the LSTM on the test set is {result_lstm}.' )

# Score against label_array_test rather than valid_pd: predictions exist only
# for the engines with at least `sequence_length` rows, whereas valid_pd has a
# row for every test engine, so the two would not line up row for row.
# model_score expects a pandas object for the true values.
score_lstm = model_score(pd.DataFrame(label_array_test), y_pred_lstm)

print(f'The score of the LSTM model is: {(score_lstm)}')
