<a href="https://colab.research.google.com/github/francoisdoanp/MLTBP/blob/master/Project_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine learning - Final Project**

# Turbofan engine degradation dataset (NASA)

# Data Preparation

**Importing necessary packages**



In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
import math

**Importing the Turbofan engine degradation dataset.**

**Files are located in the following Github repository: https://github.com/francoisdoanp/MLTBP**

We have 4 training datasets, which contains information about one hundred engines, all of the same type. Thus, we will combine the training and test data sets. 

The training and test sets have 21 columns: ID, Time (Cycles), 3 columns for operational settings and 21 sensor measurements.

The training and testing sets have the same format, while the validation sets only contain the real RUL (remaining useful life).

For more information on the data, consult the read me at the following address:https://github.com/francoisdoanp/MLTBP/blob/master/readme.txt

In [8]:
url_base = 'https://raw.githubusercontent.com/francoisdoanp/MLTBP/master/'

file_train_1 = 'train_FD001.txt'
file_train_2 = 'train_FD002.txt'
file_train_3 = 'train_FD003.txt'
file_train_4 = 'train_FD004.txt'

file_test_1 = 'test_FD001.txt'
file_test_2 = 'test_FD002.txt'
file_test_3 = 'test_FD003.txt'
file_test_4 = 'test_FD004.txt'

file_valid_1 = 'RUL_FD001.txt'
file_valid_2 = 'RUL_FD002.txt'
file_valid_3 = 'RUL_FD003.txt'
file_valid_4 = 'RUL_FD004.txt'


pt1 = pd.read_csv(url_base + file_train_1, sep=' ', header=None)
pt2 = pd.read_csv(url_base + file_train_2, sep=' ', header=None)
pt3 = pd.read_csv(url_base + file_train_3, sep=' ', header=None)
pt4 = pd.read_csv(url_base + file_train_4, sep=' ', header=None)

pte1 = pd.read_csv(url_base + file_test_1, sep=' ', header=None)
pte2 = pd.read_csv(url_base + file_test_2, sep=' ', header=None)
pte3 = pd.read_csv(url_base + file_test_3, sep=' ', header=None)
pte4 = pd.read_csv(url_base + file_test_4, sep=' ', header=None)

pv1 = pd.read_csv(url_base + file_valid_1, header=None)
pv2 = pd.read_csv(url_base + file_valid_2, header=None)
pv3 = pd.read_csv(url_base + file_valid_3, header=None)
pv4 = pd.read_csv(url_base + file_valid_4, header=None)


# Updating unit numbers (1416 motor in total)
#Labeled event(failure) data
pt2[0] = pt2[0].apply(lambda x: x+100)
pt3[0] = pt3[0].apply(lambda x: x+360)
pt4[0] = pt4[0].apply(lambda x: x+460)

#Censored data (no event)
pte1[0] = pte1[0].apply(lambda x: x+708)
pte2[0] = pte2[0].apply(lambda x: x+808)
pte3[0] = pte3[0].apply(lambda x: x+1067)
pte4[0] = pte4[0].apply(lambda x: x+1167)


# Joining the dataframes
train_pd = pd.concat([pt1,pt2,pt3,pt4])
test_pd = pd.concat([pte1,pte2,pte3,pte4])
valid_pd = pd.concat([pv1,pv2,pv3,pv4], ignore_index=True)

train_pd = train_pd.drop(train_pd.columns[[26,27]], axis='columns')
test_pd = test_pd.drop(test_pd.columns[[26,27]], axis='columns')

#Survival_pd = pd.concat([train_pd,test_pd])


# Assigning labels to Dataframe's columns based on the Readme

train_pd.columns = ['Unit Number', 'Time (Cycles)', 'OS1', 'OS2', 'OS3', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21']
test_pd.columns = ['Unit Number', 'Time (Cycles)', 'OS1', 'OS2', 'OS3', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21']
#Survival_pd.columns = ['Unit Number', 'Time (Cycles)', 'OS1', 'OS2', 'OS3', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10', 'S11', 'S12', 'S13', 'S14', 'S15', 'S16', 'S17', 'S18', 'S19', 'S20', 'S21']
valid_pd.columns = ['RUL']

# Keep only the last cycle of each unit numbers
#Survival_pd = Survival_pd.groupby('Unit Number').max()

#display(Survival_pd)

#Loading scaler

scaler = StandardScaler()
  

Unnamed: 0_level_0,Time (Cycles),OS1,OS2,OS3,S1,S2,S3,S4,S5,S6,...,S12,S13,S14,S15,S16,S17,S18,S19,S20,S21
Unit Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,192,0.0047,0.0005,100.0,518.67,644.21,1605.44,1432.52,14.62,21.61,...,522.86,2388.35,8140.58,8.5227,0.03,398,2388,100.0,39.18,23.4999
2,287,0.0076,0.0006,100.0,518.67,643.94,1610.10,1431.17,14.62,21.61,...,523.26,2388.26,8175.57,8.5377,0.03,398,2388,100.0,39.24,23.6005
3,179,0.0058,0.0005,100.0,518.67,643.93,1606.50,1438.51,14.62,21.61,...,523.18,2388.20,8255.34,8.5363,0.03,399,2388,100.0,39.23,23.5181
4,189,0.0059,0.0006,100.0,518.67,644.53,1612.11,1434.12,14.62,21.61,...,522.48,2388.17,8259.42,8.5462,0.03,399,2388,100.0,39.21,23.5074
5,269,0.0055,0.0005,100.0,518.67,644.02,1609.41,1434.59,14.62,21.61,...,523.04,2388.23,8215.19,8.5410,0.03,398,2388,100.0,39.29,23.5503
6,188,0.0044,0.0005,100.0,518.67,644.12,1607.63,1434.92,14.62,21.61,...,522.21,2388.36,8128.37,8.5358,0.03,397,2388,100.0,39.10,23.4517
7,259,0.0068,0.0005,100.0,518.67,644.27,1607.70,1436.55,14.62,21.61,...,523.00,2388.27,8158.87,8.5627,0.03,398,2388,100.0,39.22,23.5133
8,150,0.0053,0.0005,100.0,518.67,644.02,1610.35,1432.31,14.62,21.61,...,522.13,2388.30,8131.45,8.5639,0.03,397,2388,100.0,39.04,23.3798
9,201,0.0050,0.0005,100.0,518.67,644.04,1607.04,1433.83,14.62,21.61,...,523.12,2388.56,8289.63,8.5282,0.03,399,2388,100.0,39.27,23.5349
10,222,0.0063,0.0005,100.0,518.67,644.35,1613.62,1436.29,14.62,21.61,...,523.04,2388.25,8183.55,8.5312,0.03,398,2388,100.0,39.43,23.5493


**Adding variables Conditons and fault mode**

Note:

**Condition (ONE)** and **Fault ONE** are binary variables.

When Condition(ONE) = 1 (true), it means that the condition is at Sea Level

When Condition(ONE) = 0 (false), it means NO, the condition IS NOT AT SEA LEVEL, and thus is the  second condition; SIX.

When Fault ONE = 1 (true), it means that the fault modes is one (HPC Degradation)

When Fault ONE = 0 (false), it means that the fault mode is TWO (HPC Degradation and Fan degradation)

##### Adding variables Condition and fault modes

def value_condition_train(row):
  if (row['Unit Number'] <= 100):
    return 1
  elif (row['Unit Number'] <= 360) & (row['Unit Number'] > 100):
    return 0
  elif (row['Unit Number'] <= 460) & (row['Unit Number'] > 360):
    return 1
  else:
    return 0
  
def value_fault_train(row):
  if (row['Unit Number'] <= 100):
    return 1
  elif (row['Unit Number'] <= 360) & (row['Unit Number'] > 100):
    return 1
  elif (row['Unit Number'] <= 460) & (row['Unit Number'] > 360):
    return 0
  else:
    return 0

def value_event_train(row):
  return 1
  
def value_condition_test(row):
  if (row['Unit Number'] <= 808):
    return 1
  elif (row['Unit Number'] <= 1067) & (row['Unit Number'] > 808):
    return 0
  elif (row['Unit Number'] <= 1077) & (row['Unit Number'] > 1067):
    return 1
  else:
    return 0
  
def value_fault_test(row):
  if (row['Unit Number'] <= 808):
    return 1
  elif (row['Unit Number'] <= 1067) & (row['Unit Number'] > 808):
    return 1
  elif (row['Unit Number'] <= 1167) & (row['Unit Number'] > 1067):
    return 0
  else:
    return 0

def value_event_test(row):
  return 0


train_pd['Condition (One)'] = train_pd.apply(value_condition_train, axis=1)
train_pd['Fault ONE'] = train_pd.apply(value_fault_train,axis=1)
train_pd['Event'] = train_pd.apply(value_event_train,axis=1)

test_pd['Condition (One)'] = test_pd.apply(value_condition_test, axis=1)
test_pd['Fault ONE'] = test_pd.apply(value_fault_test,axis=1)
test_pd['Event'] = test_pd.apply(value_event_test,axis=1)

Survival_pd = pd.concat([train_pd,test_pd])

# Keep only the last cycle of each unit numbers
#Survival_pd = Survival_pd.iloc[Survival_pd.groupby(['Unit Number'])['Time (Cycles)'].idxmax()]

display(Survival_pd)


##### corr = train_pd.groupby(['RUL']).corr()
corr.style.background_gradient(cmap='coolwarm')

**How do we know when an engine fails?**

We look at the last time cycle for every unit number.



In [93]:
train_pd.groupby(['Unit Number'])['Time (Cycles)'].max().hist(bins=30, grid=False)
plt.xlabel('Number of Cycles')
plt.ylabel('Frequency')


NameError: ignored

#**Model 1: Multiple Linear Regression (Not time-sensitive)**






**Preparing data**

In [0]:
# Scaling data

train_pd_scaled = train_pd.copy()
train_pd_scaled.iloc[:,2:26] = scaler.fit_transform(train_pd.iloc[:,2:26])

test_pd_lm = test_pd.copy()
test_pd_lm.iloc[:,2:26] = scaler.fit_transform(test_pd.iloc[:,2:26])

pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', -1)



In [0]:
# Removing RUL and Unit Number columns, as we do not want these features to be in our predictors

train_pd_lm = train_pd_scaled.copy()
train_pd_lm = train_pd_lm.drop(['RUL', 'Unit Number'],axis=1)

# Keeping only last time cycle for each unit number

idx = test_pd_lm.groupby(['Unit Number'])['Time (Cycles)'].transform(max) == test_pd_lm['Time (Cycles)']
test_pd_lm = test_pd_scaled[idx]

# Removing Unit Number column

test_pd_lm = test_pd_lm.drop(['Unit Number'], axis=1)

**Fitting Linear Model**

In [103]:
# Fitting Linear model

reg = LinearRegression().fit(train_pd_lm, y_train)

y_pred = reg.predict(test_pd_lm)

acc_score_lm =  metrics.mean_squared_error(valid_pd, y_pred)

print(f'The square root of the mean squared error for the linear model is: {math.sqrt(acc_score_lm)}.')


The square root of the mean squared error for the linear model is: 54.8991731927702.


In [0]:
print(adjusted_error(valid_pd, y_pred))