## Predictive Maintenance of NASA Turbofan Engines

The goal of this project is to predict the RUL (Remaining Useful Life) of turbofan engines. The dataset can be obtained from https://www.kaggle.com/datasets/behrad3d/nasa-cmaps.

### 1. Overview of the dataset

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
# Setting to show all rows; uncomment the line below to enable it

# pd.set_option('display.max_rows', None)

In [34]:
# Column names for the data frames: two index columns, three setting
# columns, then sensor_1 .. sensor_21.
index_names = ['unit_number', 'time_cycles']
setting_names = [f'setting_{k}' for k in (1, 2, 3)]
sensor_names = [f'sensor_{k}' for k in range(1, 22)]
col_names = [*index_names, *setting_names, *sensor_names]

In [35]:
# Human-readable sensor descriptions with their units, 21 entries.
# Presumably listed in sensor order (entry 0 -> sensor_1); confirm where
# sensor_dictionary gets filled.
dict_list = [
    '(Fan inlet temperature) (◦R)',
    '(LPC outlet temperature) (◦R)',
    '(HPC outlet temperature) (◦R)',
    '(LPT outlet temperature) (◦R)',
    '(Fan inlet Pressure) (psia)',
    '(bypass-duct pressure) (psia)',
    '(HPC outlet pressure) (psia)',
    '(Physical fan speed) (rpm)',
    '(Physical core speed) (rpm)',
    # NOTE(review): this label is missing a closing parenthesis; kept verbatim.
    '(Engine pressure ratio(P50/P2)',
    '(HPC outlet Static pressure) (psia)',
    '(Ratio of fuel flow to Ps30) (pps/psia)',
    '(Corrected fan speed) (rpm)',
    '(Corrected core speed) (rpm)',
    '(Bypass Ratio) ',
    '(Burner fuel-air ratio)',
    '(Bleed Enthalpy)',
    '(Required fan speed)',
    '(Required fan conversion speed)',
    '(High-pressure turbines Cool air flow)',
    '(Low-pressure turbines Cool air flow)',
]

# Starts empty; nothing in this cell populates it.
sensor_dictionary = dict()

In [36]:
def _read_cmaps(file_name, names):
    """Read one header-less, whitespace-delimited file from ``data/CMaps/``.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``data/CMaps/`` (e.g. ``'train_FD001.txt'``).
    names : list of str
        Column names to assign to the parsed columns.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``names`` as its columns.
    """
    # The separator is a raw string: a backslash-s in a normal string literal
    # is an invalid escape sequence and triggers a warning on recent Pythons.
    return pd.read_csv(
        'data/CMaps/' + file_name,
        header=None,
        index_col=False,
        sep=r'\s+',
        names=names,
    )


# Training sets, one per sub-dataset.
df_train_fd001 = _read_cmaps('train_FD001.txt', col_names)
df_train_fd002 = _read_cmaps('train_FD002.txt', col_names)
df_train_fd003 = _read_cmaps('train_FD003.txt', col_names)
df_train_fd004 = _read_cmaps('train_FD004.txt', col_names)

# Test sets share the same column layout as the training sets.
df_test_fd001 = _read_cmaps('test_FD001.txt', col_names)
df_test_fd002 = _read_cmaps('test_FD002.txt', col_names)
df_test_fd003 = _read_cmaps('test_FD003.txt', col_names)
df_test_fd004 = _read_cmaps('test_FD004.txt', col_names)

# RUL files are read into a single column named 'RUL'.
y_fd001 = _read_cmaps('RUL_FD001.txt', ['RUL'])
y_fd002 = _read_cmaps('RUL_FD002.txt', ['RUL'])
y_fd003 = _read_cmaps('RUL_FD003.txt', ['RUL'])
y_fd004 = _read_cmaps('RUL_FD004.txt', ['RUL'])

In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the FD002 training set.
df_train_fd002.describe()

Unnamed: 0,unit_number,time_cycles,setting_1,setting_2,setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,...,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0,53759.0
mean,131.082981,109.154746,23.998407,0.572056,94.04602,472.910207,579.672399,1419.971013,1205.442024,8.031986,...,266.069034,2334.557253,8066.597682,9.329654,0.023326,348.309511,2228.806358,97.756838,20.789296,12.473423
std,74.463862,69.180569,14.747376,0.310016,14.237735,26.389707,37.289399,105.946341,119.123428,3.613839,...,137.659507,128.068271,84.83795,0.749335,0.004711,27.754515,145.32798,5.364067,9.869331,5.921615
min,1.0,1.0,0.0,0.0,60.0,445.0,535.53,1243.73,1023.77,3.91,...,129.12,2027.61,7848.36,8.3357,0.02,303.0,1915.0,84.93,10.18,6.0105
25%,68.0,52.0,10.0046,0.2507,100.0,445.0,549.57,1352.76,1123.655,3.91,...,131.52,2387.9,8062.14,8.6778,0.02,331.0,2212.0,100.0,10.91,6.5463
50%,131.0,104.0,25.0013,0.7,100.0,462.54,555.98,1369.18,1138.89,7.05,...,183.2,2388.08,8082.54,9.3109,0.02,335.0,2223.0,100.0,14.88,8.9292
75%,195.0,157.0,41.998,0.84,100.0,491.19,607.34,1499.37,1306.85,10.52,...,371.26,2388.17,8127.195,9.3869,0.03,369.0,2324.0,100.0,28.47,17.0832
max,260.0,378.0,42.008,0.842,100.0,518.67,644.52,1612.88,1439.23,14.62,...,523.37,2390.48,8268.5,11.0669,0.03,399.0,2388.0,100.0,39.34,23.5901


In [38]:
# Work on copies so the frames read from disk stay untouched.
train_fd001, train_fd002, train_fd003, train_fd004 = (
    raw_frame.copy()
    for raw_frame in (df_train_fd001, df_train_fd002, df_train_fd003, df_train_fd004)
)

In [8]:
# Number of rows in the FD004 training frame.
train_fd004.shape[0]

61249

In [9]:
# Number of rows in the FD003 training frame.
train_fd003.shape[0]

24720

In [10]:
# Number of rows in the FD002 training frame.
train_fd002.shape[0]

53759

In [11]:
# Number of rows in the FD001 training frame.
train_fd001.shape[0]

20631

In [39]:
def add_rul_column(df):
    """Return a copy of ``df`` with an added ``rul`` column.

    For every row, ``rul`` is the largest ``time_cycles`` value recorded for
    that row's ``unit_number`` minus the row's own ``time_cycles``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``unit_number`` and ``time_cycles`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus ``rul``; ``df`` itself is
        not modified.
    """
    # Last recorded cycle of each unit, indexed by unit_number.
    last_cycle_per_unit = (
        df.groupby(by='unit_number')['time_cycles']
        .max()
        .to_frame(name='max_time_cycle')
    )

    # Broadcast each unit's last cycle onto all of its rows.
    with_last_cycle = df.merge(
        last_cycle_per_unit,
        left_on='unit_number',
        right_index=True,
    )
    with_last_cycle["rul"] = (
        with_last_cycle["max_time_cycle"] - with_last_cycle['time_cycles']
    )

    # The helper column is only needed to compute rul.
    return with_last_cycle.drop("max_time_cycle", axis=1)

In [80]:
def add_rul_column_test(df, dfy):
    """Compute, for each test unit, the cycle at which it fails.

    The failure cycle (``failed_rul``) is the unit's last observed cycle in
    ``df`` plus the ``RUL`` value given for that unit in ``dfy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Test observations with ``unit_number`` and ``time_cycles`` columns.
    dfy : pandas.DataFrame
        Frame with an ``RUL`` column. The row with index label ``i`` is
        matched to ``unit_number == i + 1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``unit_number`` and ``failed_rul``, one row per unit present
        in both inputs, ordered by ``unit_number``. Neither input is modified.
    """
    # Derive the unit number from the row label of the RUL frame.
    y_test = dfy.assign(unit_number=dfy.index + 1)

    # Last observed cycle per unit, computed once.
    last_cycle = (
        df.groupby('unit_number')['time_cycles']
        .max()
        .rename('max_time_cycles')
        .reset_index()
    )

    merged = last_cycle.merge(y_test, how='inner', on='unit_number')
    merged["failed_rul"] = merged["RUL"] + merged["max_time_cycles"]

    # Only the unit id and its failure cycle are returned.
    return merged.drop(["RUL", "max_time_cycles"], axis=1)

In [41]:
# Attach the per-row rul column to each training subset.
train_fd001, train_fd002, train_fd003, train_fd004 = map(
    add_rul_column,
    (train_fd001, train_fd002, train_fd003, train_fd004),
)

In [14]:
# Write each training subset (now including rul) to CSV, without the row index.
for subset_tag, labelled_train in (
    ("fd001", train_fd001),
    ("fd002", train_fd002),
    ("fd003", train_fd003),
    ("fd004", train_fd004),
):
    labelled_train.to_csv(f"data/train_{subset_tag}_with_rul.csv", index=False)

In [89]:
def _append_rul(test_frame, failure_cycles):
    """Return ``test_frame`` with a ``rul`` column derived from ``failure_cycles``.

    ``failure_cycles`` supplies a ``failed_rul`` value per ``unit_number``;
    ``rul`` is that value minus the row's ``time_cycles``. The helper column
    ``failed_rul`` is removed from the result.
    """
    joined = test_frame.merge(failure_cycles, how='inner', on='unit_number')
    joined['rul'] = joined['failed_rul'] - joined['time_cycles']
    return joined.drop(["failed_rul"], axis=1)


# Work on copies of the frames read from disk.
test_fd001 = df_test_fd001.copy()
test_fd002 = df_test_fd002.copy()
test_fd003 = df_test_fd003.copy()
test_fd004 = df_test_fd004.copy()

# Per-unit failure cycle for each subset.
f_test_fd001 = add_rul_column_test(test_fd001, y_fd001)
f_test_fd002 = add_rul_column_test(test_fd002, y_fd002)
f_test_fd003 = add_rul_column_test(test_fd003, y_fd003)
f_test_fd004 = add_rul_column_test(test_fd004, y_fd004)

# Per-row rul for each subset.
test_fd001 = _append_rul(test_fd001, f_test_fd001)
test_fd002 = _append_rul(test_fd002, f_test_fd002)
test_fd003 = _append_rul(test_fd003, f_test_fd003)
test_fd004 = _append_rul(test_fd004, f_test_fd004)

In [90]:
# Write each test subset (now including rul) to CSV, without the row index.
for subset_tag, labelled_test in (
    ("fd001", test_fd001),
    ("fd002", test_fd002),
    ("fd003", test_fd003),
    ("fd004", test_fd004),
):
    labelled_test.to_csv(f"data/test_{subset_tag}_with_rul.csv", index=False)