The regression part of the NASA RUL task is to predict the remaining useful life (RUL).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Column names applied to the raw train/test files: id, cycle, three operating
# settings (op1..op3) and 23 "sensor" slots (sensor1..sensor23).
# sensor22 and sensor23 are dropped again a few cells further down.
columns = (
    ["id", "cycle"]
    + [f"op{n}" for n in range(1, 4)]
    + [f"sensor{n}" for n in range(1, 24)]
)

In [3]:
# Read the space-separated text files into DataFrames.  The train and test
# files are parsed the same way and get the same column names.
train, test = (
    pd.read_csv(path, sep=" ", names=columns)
    for path in ("archive/CMaps/train_FD001.txt", "archive/CMaps/test_FD001.txt")
)
# The RUL file has no header row, so it is read without column names.
test_result = pd.read_csv("archive/CMaps/RUL_FD001.txt", header=None)

In [4]:
# (rows, columns) of each loaded frame, in load order.
tuple(frame.shape for frame in (train, test, test_result))

((20631, 28), (13096, 28), (100, 1))

In [5]:
# Give the single column of the RUL file a readable name, then summarise it.
test_result = test_result.set_axis(["rul"], axis="columns")
test_result.describe()

Unnamed: 0,rul
count,100.0
mean,75.52
std,41.76497
min,7.0
25%,32.75
50%,86.0
75%,112.25
max,145.0


In [6]:
# Summary statistics, transposed so that each column of `train` becomes a row.
train_summary = train.describe()
train_summary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
cycle,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
op1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
op2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
op3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sensor1,20631.0,518.67,0.0,518.67,518.67,518.67,518.67,518.67
sensor2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
sensor3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
sensor4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
sensor5,20631.0,14.62,1.7764e-15,14.62,14.62,14.62,14.62,14.62


In [7]:
# Remove the sensor22/sensor23 columns.
# Rebinding `train` (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: the in-place version raises KeyError the second
# time it is executed because the columns are already gone.
train = train.drop(columns=["sensor22", "sensor23"], errors="ignore")
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
cycle,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
op1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
op2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
op3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sensor1,20631.0,518.67,0.0,518.67,518.67,518.67,518.67,518.67
sensor2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
sensor3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
sensor4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
sensor5,20631.0,14.62,1.7764e-15,14.62,14.62,14.62,14.62,14.62


In [8]:
# Drop all columns with only 1 unique value.
# `train_df` is built as a NEW frame: the previous `train_df = train` was only
# an alias, so the in-place drops also stripped these columns from `train`.
constant_cols = [col for col in train.columns if train[col].nunique() == 1]
train_df = train.drop(columns=constant_cols)
train_df.nunique()

id           100
cycle        362
op1          158
op2           13
sensor2      310
sensor3     3012
sensor4     4051
sensor6        2
sensor7      513
sensor8       53
sensor9     6403
sensor11     159
sensor12     427
sensor13      56
sensor14    6078
sensor15    1918
sensor17      13
sensor20     120
sensor21    4745
dtype: int64

In [9]:
def Calculate_RUL(df):
    """Return a new frame equal to ``df`` with a ``RUL`` column added.

    For every row, ``RUL`` is the largest ``cycle`` recorded for that row's
    ``id`` minus the row's own ``cycle``; the last recorded cycle of each
    ``id`` therefore gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and ``cycle`` columns.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` plus ``RUL``.  ``df`` itself is not modified.
    """
    last_cycle = df.groupby('id')['cycle'].max().rename('max_time_cycle')
    # Attach each id's final cycle to its rows; this helper column is temporary.
    with_last = df.merge(last_cycle, left_on='id', right_index=True)
    with_last["RUL"] = with_last["max_time_cycle"] - with_last['cycle']
    return with_last.drop(columns="max_time_cycle")
# Add the RUL target column to the training frame and check the new shape.
train_df = train_df.pipe(Calculate_RUL)
train_df.shape

(20631, 20)

In [10]:
# Correlation of every column with the RUL target, weakest (by magnitude) first.
rul_corr = train_df.corr()['RUL']
rul_corr.sort_values(key=abs)

op2        -0.001948
op1        -0.003198
id          0.078753
sensor6    -0.128348
sensor14   -0.306769
sensor9    -0.390102
sensor13   -0.562569
sensor8    -0.563968
sensor3    -0.584520
sensor17   -0.606154
sensor2    -0.606484
sensor20    0.629428
sensor21    0.635662
sensor15   -0.642667
sensor7     0.657223
sensor12    0.671983
sensor4    -0.678948
sensor11   -0.696228
cycle      -0.736241
RUL         1.000000
Name: RUL, dtype: float64

In [11]:
# taking 30% correlation as threshold: these four columns have |corr with RUL| < 0.3.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell
# re-runnable; the in-place drop raises KeyError on a second execution.
low_corr_cols = ['op2', 'op1', 'id', 'sensor6']
train_df = train_df.drop(columns=low_corr_cols, errors="ignore")
train_df.shape

(20631, 16)

In [12]:
# Now the training process: split the frame into target and features.
Y = train_df['RUL']
X = train_df.drop(columns=['RUL'])
# Feature names without "cycle"; these are the columns that get scaled later.
X_cols = X.columns.drop("cycle").to_list()

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): this is a row-level random split and `id` is no longer among the
# features, so rows belonging to the same `id` can end up on both sides.  The
# hold-out scores may therefore be optimistic - consider a split grouped by `id`.
split = train_test_split(X, Y, test_size=0.25, random_state=168)
X_train, X_test, y_train, y_test = split

In [15]:
# Scale every feature except "cycle".  The scaler is fitted on the training
# rows only and then applied unchanged to the hold-out rows.
scaler = MinMaxScaler()
scaled_train = pd.DataFrame(
    scaler.fit_transform(X_train[X_cols]),
    columns=X_cols,
    index=X_train.index,
)
scaled_test = pd.DataFrame(
    scaler.transform(X_test[X_cols]),
    columns=X_cols,
    index=X_test.index,
)

In [16]:
# Re-attach the unscaled "cycle" column (aligned on the row index) as the last feature.
scaled_train = scaled_train.assign(cycle=X_train['cycle'])
scaled_test = scaled_test.assign(cycle=X_test['cycle'])

In [17]:
# Fit a 100-tree random forest with a fixed seed; `fit` returns the estimator
# itself, so construction and training can be chained.
regressor = RandomForestRegressor(n_estimators=100, random_state=168).fit(scaled_train, y_train)
regressor

In [18]:
# Score the model on the hold-out rows.
y_pred = regressor.predict(scaled_test)

mse = mean_squared_error(y_test, y_pred)   # Mean Squared Error
rmse = np.sqrt(mse)                        # Root Mean Squared Error
r2 = r2_score(y_test, y_pred)              # R-squared
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error

# Report in the same order and with the same labels as before.
for label, value in (("MSE:", mse), ("RMSE:", rmse), ("R-squared:", r2), ("MAE:", mae)):
    print(label, value)

MSE: 1271.404697363319
RMSE: 35.65676229501662
R-squared: 0.722295490812783
MAE: 24.815025203567274


In [19]:
# Largest and smallest RUL value among the hold-out targets.
rul_high, rul_low = y_test.max(), y_test.min()
rul_high, rul_low

(361, 0)

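The RUL values in the hold-out split range from 0 to 361 cycles, and the MAE is about 24.8 cycles, i.e. roughly 7% of that range — about 93% "accuracy" in this loose, range-normalised sense. Note that this is an informal summary rather than a standard regression metric; the R-squared of 0.72 reported above is the more conventional measure of fit.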