# PHM Data Challenge

This year’s data challenge is on estimating the health of helicopter turbine engines.

In [1]:
!pip install pycaret



In [2]:
# Libraries
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

### Load Data

In [3]:
X_train = pd.read_csv('X_train.csv')
Y_train = pd.read_csv('y_train.csv')
test = pd.read_csv('X_test.csv')

### Data Modeling

In [4]:
# create train model
Y_train['faulty'].reset_index(drop=True, inplace=True)
train = pd.concat([X_train,Y_train['faulty']], axis=1)

In [5]:
# review train model
train.head()

Unnamed: 0,id,trq_measured,oat,mgt,pa,ias,np,ng,faulty
0,0,54.1,2.0,544.5,212.1408,74.5625,89.18,99.64,1
1,1,49.625,24.22231,578.4844,1625.64,30.35596,99.55273,91.3866,0
2,2,52.0,7.0,566.1,1912.925,65.625,100.14,90.96,1
3,3,62.4,7.25,560.1,277.0632,54.8125,90.64,100.28,0
4,4,62.9,23.25,593.7,53.6448,73.4375,99.91,92.17,0


In [6]:
# Drop first column id
train.drop('id', axis=1, inplace=True)

In [7]:
# rescale all variables except the target variable
df_scale = train.loc[:, train.columns!='faulty']
scaler = preprocessing.MinMaxScaler()
df_scale = scaler.fit_transform(df_scale)
df_scale = pd.DataFrame(df_scale)
df_scale.reset_index(drop=True, inplace=True)

In [8]:
# combine rescaled value
train['faulty'].reset_index(drop=True, inplace=True)
train = pd.concat([df_scale,train['faulty']], axis=1)

In [9]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,faulty
0,0.437796,0.379132,0.301445,0.127206,0.542026,0.251964,0.859157,1
1,0.388105,0.775611,0.434144,0.554775,0.22067,0.878715,0.12344,0
2,0.414477,0.468339,0.385787,0.641675,0.477056,0.914199,0.085412,1
3,0.52996,0.472799,0.362358,0.146844,0.398455,0.340181,0.916207,0
4,0.535512,0.758263,0.493557,0.079263,0.533848,0.900302,0.193273,0


In [10]:
# ML libraries
from pycaret.classification import *

### Predict Health State

In [11]:
exp_class101 = setup(data = train, target = 'faulty', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,faulty
2,Target type,Binary
3,Original data shape,"(742625, 8)"
4,Transformed data shape,"(742625, 8)"
5,Transformed train set shape,"(519837, 8)"
6,Transformed test set shape,"(222788, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


In [12]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9994,1.0,0.9991,0.9993,0.9992,0.9987,0.9987,48.711
rf,Random Forest Classifier,0.9992,1.0,0.9988,0.9991,0.999,0.9983,0.9983,133.63
xgboost,Extreme Gradient Boosting,0.9977,1.0,0.9972,0.997,0.9971,0.9951,0.9951,5.219
knn,K Neighbors Classifier,0.9976,0.9996,0.9972,0.9969,0.997,0.995,0.995,9.042
dt,Decision Tree Classifier,0.9964,0.9963,0.9954,0.9957,0.9956,0.9926,0.9926,4.132
lightgbm,Light Gradient Boosting Machine,0.9915,0.9997,0.9874,0.9915,0.9894,0.9823,0.9823,24.62
gbc,Gradient Boosting Classifier,0.9591,0.992,0.9385,0.9592,0.9487,0.9147,0.9148,112.04
ada,Ada Boost Classifier,0.9334,0.9793,0.9037,0.9291,0.9162,0.8609,0.8612,23.29
svm,SVM - Linear Kernel,0.9222,0.9704,0.8837,0.9204,0.9015,0.8373,0.8379,1.096
lr,Logistic Regression,0.9207,0.973,0.8947,0.9073,0.9009,0.8348,0.8349,2.477


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [13]:
print(best)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)


NameError: name 'et' is not defined

In [16]:
# selected model
et = create_model('et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9993,1.0,0.999,0.9993,0.9992,0.9986,0.9986
1,0.9994,1.0,0.9992,0.9992,0.9992,0.9987,0.9987
2,0.9994,1.0,0.9992,0.9993,0.9993,0.9988,0.9988
3,0.9995,1.0,0.9992,0.9994,0.9993,0.9989,0.9989
4,0.9994,1.0,0.9994,0.9992,0.9993,0.9988,0.9988
5,0.9994,1.0,0.9991,0.9994,0.9992,0.9987,0.9987
6,0.9994,1.0,0.999,0.9996,0.9993,0.9988,0.9988
7,0.9992,1.0,0.9987,0.9994,0.999,0.9984,0.9984
8,0.9994,1.0,0.9992,0.9994,0.9993,0.9988,0.9988
9,0.9992,1.0,0.999,0.9991,0.9991,0.9984,0.9984


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# plot residuals
plot_model(et, plot = 'residuals')

In [None]:
# plot error
plot_model(best, plot = 'error')

In [None]:
# plot feature importance
plot_model(best, plot = 'feature')

In [17]:
tuned_model = tune_model(et)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


KeyboardInterrupt: 

In [19]:
final_model = finalize_model(et) #finalize_model(tuned_model)

In [20]:
save_model(final_model,'Final Model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['0', '1', '2', '3', '4', '5', '6'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[],
                                     transformer=...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterio

In [21]:
saved_final_model = load_model('Final Model')

Transformation Pipeline and Model Successfully Loaded


In [22]:
test.head()

Unnamed: 0,id,trq_measured,oat,mgt,pa,ias,np,ng
0,0,76.2,29.5,648.0,303.276,0.0,99.83,96.77
1,1,63.3,18.75,595.4,464.82,96.5625,100.01,93.61
2,2,87.3,2.5,644.4,503.5296,119.4375,99.92,96.87
3,3,85.4,0.25,630.7,458.4192,121.25,100.04,96.04
4,4,73.1,21.25,625.9,626.364,111.4375,100.17,95.67


In [23]:
# classification
# Drop first column id
test.drop('id', axis=1, inplace=True)

In [24]:
scaler = preprocessing.MinMaxScaler()
df_scale = scaler.fit_transform(test)
df_scale = pd.DataFrame(df_scale)
df_scale.reset_index(drop=True, inplace=True)
test = df_scale

In [25]:
unseen_predictions = predict_model(final_model, data=test)
unseen_predictions.head(30)

Unnamed: 0,0,1,2,3,4,5,6,prediction_label,prediction_score
0,0.628253,0.960938,0.683389,0.342064,0.0,0.209877,0.662745,0,0.81
1,0.388476,0.625,0.39019,0.479512,0.716273,0.265432,0.352941,0,0.57
2,0.834572,0.117188,0.663322,0.512448,0.885953,0.237654,0.672549,1,0.92
3,0.799257,0.046875,0.586957,0.474066,0.899397,0.274691,0.591176,1,0.94
4,0.570632,0.703125,0.560201,0.61696,0.826611,0.314815,0.554902,0,0.61
5,0.795539,0.265625,0.712932,0.596732,0.891052,0.268519,0.717647,1,0.94
6,0.330855,0.734375,0.25641,0.316131,0.29439,0.333333,0.25098,0,0.76
7,0.310409,0.421875,0.083612,0.230031,0.501159,0.296296,0.052941,0,0.74
8,0.598513,0.46875,0.406355,0.430238,0.831247,0.302469,0.42549,1,0.59
9,0.689591,0.492188,0.483835,0.284751,0.0,0.228395,0.467647,0,0.54


In [1]:
### Validation

In [5]:
validation = pd.read_csv("X_validation.csv")

In [6]:
validation.head()

Unnamed: 0,id,trq_measured,oat,mgt,pa,ias,np,ng
0,0,56.5,19.0,553.9,276.7584,73.6875,99.81,91.07
1,1,86.2,6.75,657.9,657.4536,122.875,100.03,97.61
2,2,54.0,21.75,559.6,263.3472,18.125,99.57,90.62
3,3,55.4,20.75,566.8,751.0272,84.75,99.92,91.16
4,4,51.3,19.5,554.2,755.904,68.5625,99.99,90.09


In [7]:
# classification
# Drop first column id
validation.drop('id', axis=1, inplace=True)

In [8]:
scaler = preprocessing.MinMaxScaler()
df_scale = scaler.fit_transform(validation)
df_scale = pd.DataFrame(df_scale)
df_scale.reset_index(drop=True, inplace=True)
validation = df_scale

In [None]:
# load model
saved_final_model = load_model('Final Model')

In [None]:
unseen_predictions = predict_model(final_model, data=test)
unseen_predictions.head(30)