# Feature engineering

In [1]:
import os
import sys
import pickle
import h5py
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import math
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.model_selection import train_test_split

# Parent of the current working directory; presumably the project root when
# the notebook is launched from its own sub-folder (verify if the layout changes).
base_dir = os.path.dirname(os.getcwd())
print(base_dir)
# Insert at position 1 (after sys.path[0]) so the `package` imports below resolve.
sys.path.insert(1, base_dir)
from package.api import DB as api
import package.utils as utils

%matplotlib inline
%load_ext autoreload
%autoreload 2


print(tf.__version__)
# list_physical_devices returns an empty list on CPU-only hosts, so test the
# list itself instead of indexing into it (gpu[0] raised IndexError there).
gpu = tf.config.list_physical_devices('GPU')
has_gpu = len(gpu) > 0
print(f"[INFO] GPU?: <{has_gpu}> {gpu}")
if has_gpu:
    # Memory growth has to be configured identically on every visible GPU;
    # enabling it on the first device only is rejected by TensorFlow on
    # multi-GPU hosts once the devices are initialised.
    for device in gpu:
        tf.config.experimental.set_memory_growth(device, True)

/home/oem/phm2021_data_challenge
2.7.0
[INFO] GPU?: <True> [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [2]:
# WARNING: `params` holds the database credentials in plain text.
# Never print or display it in a notebook that will be saved or shared.
params = utils.get_aws_secret("/secret/cmapss")
try:
    db, cur = api.connect(params)
finally:
    # Drop the credentials from the kernel namespace even when the
    # connection attempt fails, so they cannot linger after an error.
    del params
db.set_session(autocommit=True)

[INFO] connecting to db.
[INFO] connected.


## train model to learn telemetry data on HS=1 only
- input: summary_data
- output: telemetry_data

### Get all units

In [3]:
# Query the unit table for group_id 1-2 and Fc 1-3 with datasets=['all'].
# For a quicker run on a single dataset, pass e.g. datasets=['DS01-005'].
units = api._get_units(
    group_id=[1, 2],
    Fc=[1, 2, 3],
    datasets=['all'],
    db=db,
)
units.head()

Unnamed: 0,id,serial_number,age,eol,rul,group_id,Fc,unit,dataset
0,1,9EvKtW13,0.0,100.0,100.0,1,1,1,DS01-005
1,2,UecVuQWo,0.0,75.0,75.0,1,3,2,DS01-005
2,3,sroN68E2,0.0,100.0,100.0,1,2,3,DS01-005
3,4,dGvfoDFi,0.0,95.0,95.0,1,1,4,DS01-005
4,5,mhpHf016,0.0,89.0,89.0,1,3,5,DS01-005


### use all data for this
- downsample the data by a factor of 10

In [8]:
# Downsampling factor handed to the data API for every table fetch.
downsample = 10

# Fetch the summary table for the selected units, then cast to float32 and
# round to three decimals. Note the cast also turns id/asset_id into floats.
summary_data = (
    api._get_data(
        units=list(units.id),
        downsample=downsample,
        table='summary_tb',
        db=db,
    )
    .astype(np.float32)
    .round(3)
)
print(len(summary_data))
summary_data.head()

6338311


Unnamed: 0,id,asset_id,cycle,hs,alt,Mach,TRA,T2
0,10.0,1.0,1.0,1.0,3081.0,0.379,70.4,522.302
1,20.0,1.0,1.0,1.0,3153.0,0.384,70.576,522.428
2,30.0,1.0,1.0,1.0,3229.0,0.391,70.576,522.645
3,40.0,1.0,1.0,1.0,3305.0,0.394,70.576,522.651
4,50.0,1.0,1.0,1.0,3393.0,0.397,70.664,522.53


In [9]:
# Fetch the telemetry table for the same units and downsampling factor.
# The bookkeeping columns are dropped here so only sensor channels remain.
telemetry_data = (
    api._get_data(
        units=list(units.id),
        downsample=downsample,
        table='telemetry_tb',
        drop_cols=['id', 'asset_id', 'dt', 'rn'],
        db=db,
    )
    .astype(np.float32)
    .round(3)
)
telemetry_data.head()

Unnamed: 0,Wf,Nf,Nc,T24,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50
0,4.619,2142.986,8693.203,618.329,1470.481,1849.662,1268.918,19.417,14.472,19.713,24.398,394.432,400.931,15.944
1,4.624,2144.852,8694.997,618.606,1471.05,1850.533,1268.849,19.424,14.472,19.72,24.418,394.728,401.23,15.919
2,4.62,2145.311,8695.133,618.815,1471.088,1850.242,1268.181,19.43,14.48,19.726,24.431,394.555,401.06,15.887
3,4.611,2145.352,8694.137,618.802,1470.763,1849.517,1267.365,19.411,14.468,19.706,24.41,393.944,400.444,15.847
4,4.604,2145.904,8693.531,618.722,1470.574,1849.327,1266.863,19.378,14.441,19.673,24.374,393.353,399.843,15.805


### we don't need degradation data for feature engineering

In [None]:
# degradation_data = api._get_data(units=list(units.id),
#                                  downsample=downsample,
#                                  table='degradation_tb',
#                                  drop_cols=['asset_id', 'rn'],
#                                  db=db).astype(np.float32)
# degradation_data.head()

### merge the dataframes, and only use hs=1, set precision to 3

In [10]:
# Show three decimals when displaying frames. The fully-qualified key is
# required: the bare 'precision' pattern also matches
# 'styler.format.precision' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.precision', 3)
# Column-wise concat pairs rows by index label.
# NOTE(review): assumes both tables come back with identical row order and
# length for the same units/downsample - confirm against api._get_data.
df = pd.concat([summary_data, telemetry_data], axis=1)
# Keep only the rows flagged hs == 1.
df = df[df['hs'] == 1].round(3)
df.head()

Unnamed: 0,id,asset_id,cycle,hs,alt,Mach,TRA,T2,Wf,Nf,...,T30,T48,T50,P15,P2,P21,P24,Ps30,P40,P50
0,10.0,1.0,1.0,1.0,3081.0,0.379,70.4,522.302,4.619,2142.986,...,1470.481,1849.662,1268.918,19.417,14.472,19.713,24.398,394.432,400.931,15.944
1,20.0,1.0,1.0,1.0,3153.0,0.384,70.576,522.428,4.624,2144.852,...,1471.05,1850.533,1268.849,19.424,14.472,19.72,24.418,394.728,401.23,15.919
2,30.0,1.0,1.0,1.0,3229.0,0.391,70.576,522.645,4.62,2145.311,...,1471.088,1850.242,1268.181,19.43,14.48,19.726,24.431,394.555,401.06,15.887
3,40.0,1.0,1.0,1.0,3305.0,0.394,70.576,522.651,4.611,2145.352,...,1470.763,1849.517,1267.365,19.411,14.468,19.706,24.41,393.944,400.444,15.847
4,50.0,1.0,1.0,1.0,3393.0,0.397,70.664,522.53,4.604,2145.904,...,1470.574,1849.327,1266.863,19.378,14.441,19.673,24.374,393.353,399.843,15.805


### y_labels are the target columns, t_labels are the training columns
- we need to keep the id, asset_id and cycle columns as auxiliary information; these will be separated out from the training data in a subsequent step

In [11]:
# Target columns come from the telemetry table, input columns from the
# summary table. Capture the names before the source frames are released.
y_labels = [*telemetry_data.columns]
print(y_labels)
t_labels = [*summary_data.columns]
print(t_labels)
# The merged frame `df` already holds the data, so free the originals.
del summary_data
del telemetry_data

['Wf', 'Nf', 'Nc', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50']
['id', 'asset_id', 'cycle', 'hs', 'alt', 'Mach', 'TRA', 'T2']


### Train test split

In [12]:
# Project-level split helper (utils.train_test_split, not the scikit-learn
# function of the same name). The verbose output lists the units per split.
(
    train_df, y_train,
    val_df, y_val,
    test_df, y_test,
) = utils.train_test_split(
    df=df,
    y_labels=y_labels,
    t_labels=t_labels,
    train_pct=.75,
    val_pct=.10,
    test_pct=.15,
    verbose=True,
)

train, val, test set counts: 67, 9, 14
train units: [43.0, 62.0, 39.0, 70.0, 3.0, 65.0, 29.0, 63.0, 22.0, 71.0, 64.0, 80.0, 78.0, 12.0, 51.0, 23.0, 52.0, 54.0, 8.0, 6.0, 42.0, 66.0, 45.0, 47.0, 46.0, 73.0, 41.0, 2.0, 87.0, 85.0, 90.0, 67.0, 9.0, 76.0, 31.0, 24.0, 81.0, 72.0, 44.0, 19.0, 25.0, 20.0, 55.0, 35.0, 17.0, 74.0, 7.0, 50.0, 28.0, 57.0, 56.0, 11.0, 37.0, 4.0, 5.0, 33.0, 26.0, 1.0, 49.0, 13.0, 77.0, 82.0, 30.0, 83.0, 14.0, 16.0, 68.0]
val units: [15.0, 69.0, 40.0, 21.0, 53.0, 89.0, 59.0, 79.0, 27.0]
test units: [32.0, 10.0, 34.0, 61.0, 75.0, 36.0, 48.0, 84.0, 58.0, 18.0, 38.0, 88.0, 60.0, 86.0]


### now remove the auxiliary information; it may be used later for mapping the data back to specific units (or other purposes)

In [13]:
# The merged frame is no longer needed once the splits exist.
del df

# Bookkeeping columns that are kept aside rather than fed to the model.
AUX_COLS = ('id', 'asset_id', 'cycle', 'hs')


def _pop_aux(frame, cols=AUX_COLS):
    """Remove `cols` from `frame` in place and return them as one DataFrame.

    Because the columns are popped, calling this twice on the same frame
    raises KeyError: the columns are already gone after the first call.
    """
    return pd.concat([frame.pop(col) for col in cols], axis=1)


train_aux_df = _pop_aux(train_df)
val_aux_df = _pop_aux(val_df)
test_aux_df = _pop_aux(test_df)

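### Standardize the data (zero mean, unit variance)
- convert to numpy
- Surprise! use keras.layers.BatchNormalization() and don't worry about the rest :) (note: it rescales each feature to roughly zero mean and unit variance per batch, not to a fixed [-1, 1] range)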

In [14]:
# Convert each split to a float32 ndarray. Every DataFrame is deleted right
# after its conversion so the frame and its array copy do not stay alive
# together longer than necessary.
X_train = np.array(train_df, dtype=np.float32)
del train_df
X_val = np.array(val_df, dtype=np.float32)
del val_df
X_test = np.array(test_df, dtype=np.float32)
del test_df

### set up grid search on the hyper parameters
- import the wrapper
- define a build function

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor