# PREPROCESSING FILE
- loads the datasets you choose,
- creates a dataframe
- resamples
- saves
#### Do this here, then use another notebook for individual tasks
### imports

In [44]:
import os
import sys
import pickle
import h5py
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import random
import gc
import multiprocessing as mp
import tensorflow as tf
import json

# Parent of the kernel's current working directory; the data paths used
# later in this notebook are built relative to it.
base_dir = os.path.dirname(os.getcwd())

# Put that directory on the import path so the `package.*` imports below
# can be resolved.
sys.path.insert(1, base_dir)
from package.api import DB as api
import package.utils as utils

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


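# Load the N-CMAPSS data (DS01–DS08)
- each HDF5 file contains two sets: dev and test
- as written, the loop ends with a `break`, so only the first file in `fnames` is loaded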

In [2]:
# Directory (under base_dir) that holds the N-CMAPSS HDF5 files.
h5_dir = 'data_h5'
fnames = [
    'N-CMAPSS_DS01-005.h5',
    'N-CMAPSS_DS03-012.h5',
    'N-CMAPSS_DS04.h5',
    'N-CMAPSS_DS05.h5',
    'N-CMAPSS_DS06.h5',
    'N-CMAPSS_DS07.h5',
    'N-CMAPSS_DS08a-009.h5',
    'N-CMAPSS_DS08c-008.h5'
]

# Suffixes of the two splits stored in every file (e.g. "A_dev", "A_test").
sets = ['dev', 'test']

# One frame per (file, set) is collected here and concatenated once after the
# loop; growing `df` with pd.concat on every pass re-copies all earlier rows.
frames = []
# Running counter: every distinct `unit` value of each (file, set) receives
# the next integer as its `ui`.
ui = 0

for filename in fnames:
    print(filename)
    h5_path = os.path.join(base_dir, h5_dir, filename)
    # 'N-CMAPSS_DS01-005.h5' -> 'DS01-005'
    dataset_name = filename.split('_')[1].split('.')[0]
    for _set in sets:
        print(_set)
        with h5py.File(h5_path, 'r') as hdf:
            a_data = np.array(hdf.get(f"A_{_set}"))
            w_data = np.array(hdf.get(f"W_{_set}"))
            x_data = np.array(hdf.get(f"X_s_{_set}"))
            xv_data = np.array(hdf.get(f"X_v_{_set}"))
            t_data = np.array(hdf.get(f"T_{_set}"))
            y_data = np.array(hdf.get(f"Y_{_set}"))

            # Column names are stored as byte strings in the "*_var" datasets.
            a_labels = [l.decode('utf-8') for l in list(np.array(hdf.get('A_var')))]
            w_labels = [l.decode('utf-8') for l in list(np.array(hdf.get('W_var')))]
            x_labels = [l.decode('utf-8') for l in list(np.array(hdf.get('X_s_var')))]
            xv_labels = [l.decode('utf-8') for l in list(np.array(hdf.get('X_v_var')))]
            t_labels = [l.decode('utf-8') for l in list(np.array(hdf.get('T_var')))]

        df_a = pd.DataFrame(data=a_data, columns=a_labels)
        df_a['ui'] = -1  # placeholder, overwritten for every unit below
        df_a['dataset'] = dataset_name
        df_w = pd.DataFrame(data=w_data, columns=w_labels)
        df_x = pd.DataFrame(data=x_data, columns=x_labels)
        df_xv = pd.DataFrame(data=xv_data, columns=xv_labels)
        df_t = pd.DataFrame(data=t_data, columns=t_labels)
        df_y = pd.DataFrame(data=y_data, columns=['y'])
        print(f"<{filename}> : {pd.unique(df_a.unit)}")
        for n in list(pd.unique(df_a.unit)):
            df_a.loc[df_a['unit'] == n, 'ui'] = ui
            ui = ui + 1

        # Side-by-side join of the six blocks; they all carry the same default
        # 0..n-1 row index, so the rows line up one to one.
        frames.append(pd.concat([df_a, df_y, df_w, df_x, df_xv, df_t], axis=1))

        del df_a, df_w, df_x, df_xv, df_t, df_y, a_data, w_data, t_data, x_data, y_data
    # NOTE(review): this `break` ends the loop after the first entry of
    # `fnames`, so only that file is loaded. Remove it to process every file.
    break

# Every per-set frame has its own 0..n-1 index, so stacking them as-is would
# repeat index labels. ignore_index=True gives the combined frame one unique
# RangeIndex, which avoids duplicate-index problems in later steps.
df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
del frames

N-CMAPSS_DS01-005.h5
dev
<N-CMAPSS_DS01-005.h5> : [1. 2. 3. 4. 5. 6.]
test
<N-CMAPSS_DS01-005.h5> : [ 7.  8.  9. 10.]


### get the labels

In [3]:
# The labels read from 'T_var' become the y labels, and `t_labels` is rebound
# to the W labels followed by the X_s labels.
# NOTE: this rebinding is not idempotent. Re-running the cell without first
# re-running the loading cell would copy the already-rebound list into
# `y_labels`.
y_labels = t_labels
t_labels = [*w_labels, *x_labels]
print(y_labels)
print(t_labels)
print(xv_labels)

csv_dir = 'data_csv'

# Write each label list to its own text file, one label per line.
for label_file, label_list in (
    ('v_labels.txt', xv_labels),
    ('y_labels.txt', y_labels),
    ('t_labels.txt', t_labels),
):
    with open(os.path.join(base_dir, csv_dir, label_file), "w") as f:
        f.writelines(f"{l}\n" for l in label_list)

['fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod']
['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']
['T40', 'P30', 'P45', 'W21', 'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC', 'phi']


## Create the augmented auxiliary data by aggregating over units

In [4]:
# Column-wise maximum of the auxiliary columns for every `ui`, with `ui`
# turned back into an ordinary column afterwards.
aux_cols = ['ui', 'Fc', 'unit', 'dataset', 'cycle']
df_max = df[aux_cols].groupby('ui').max().reset_index()
df_max.head()

Unnamed: 0,ui,Fc,unit,dataset,cycle
0,0,1.0,1.0,DS01-005,100.0
1,1,3.0,2.0,DS01-005,75.0
2,2,2.0,3.0,DS01-005,100.0
3,3,1.0,4.0,DS01-005,95.0
4,4,3.0,5.0,DS01-005,89.0


True

In [163]:
# Call the project's `_create_asset_type` helper for the engine/turbine type.
# `db` and `cur` come from the `api.connect(...)` cell; the return value is
# shown as the cell output.
api._create_asset_type(
    asset_type='engine',
    subtype='turbine',
    description='N-CMAPSS dataset unit',
    db=db,
    cur=cur,
)

1

In [156]:
# Fetch the field names of `asset_tb` as a list and drop the 'id' entry.
# NOTE(review): `db` is created by the `api.connect(...)` cell further down,
# so that cell has to be run before this one.
asset_tb_cols = api.get_fields('asset_tb', as_list=True, db=db)
# list.remove raises ValueError when 'id' is not in the list.
asset_tb_cols.remove('id')
asset_tb_cols

['owner',
 'type_id',
 'process_id',
 'serial_number',
 'common_name',
 'age',
 'eol',
 'rul',
 'units']

In [157]:
# Fetch the field names of `engine_tb` as a list and display them.
# NOTE(review): `db` is created by the `api.connect(...)` cell further down,
# so that cell has to be run before this one.
engine_tb_cols = api.get_fields('engine_tb', as_list=True, db=db)
engine_tb_cols

['id', 'group_id', 'unit', 'dataset']

In [150]:
# THESE ARE YOUR CREDENTIALS IN PLAIN TEXT!
params = utils.get_aws_secret("/secret/ncmapssdb")
#print(params)
db, cur =  api.connect(params)
db.set_session(autocommit=True)
del(params)

[INFO] connecting to db.
[INFO] connected.


In [38]:
# Fetch the table listing of the connected database. Ending the cell with the
# bare name lets Jupyter use the object's rich display instead of the
# plain-text output of print().
db_tables = api.get_tables(db)
db_tables

        table_name
0  process_type_tb
1       process_tb
2    asset_type_tb
3         asset_tb
4        engine_tb
5         group_tb
6       summary_tb
7     telemetry_tb
8   degradation_tb
9          test_tb


In [168]:
# Look up the engine/turbine asset type through the project API and print the
# returned value followed by its Python type.
asset_type_id = api._get_asset_type(
    asset_type='engine',
    subtype='turbine',
    db=db,
)
print(asset_type_id, type(asset_type_id), sep='\n')

1
<class 'int'>


In [174]:
# Field values handed to the project's `_create_asset` helper.
# NOTE(review): the serial number is hard-coded; the recorded output of this
# cell shows a unique-constraint violation for it on a repeated run.
asset_fields = dict(
    type_id=asset_type_id,
    common_name='ncmapss unit',
    age=0,
    eol=100,
    rul=100,
    units='cycles',
    serial_number='sd3kg0dk00',
)
api._create_asset(**asset_fields, db=db, cur=cur)

UniqueViolation: duplicate key value violates unique constraint "asset_tb_serial_number_key"
DETAIL:  Key (serial_number)=(sd3kg0dk00) already exists.


1


In [68]:
# Call the project's batch-insert helper with the open connection and cursor.
# NOTE(review): `tb`, `cols` and `values` are not assigned anywhere in the
# visible part of this notebook, so this cell fails on a fresh kernel unless
# they are defined elsewhere. Confirm where they come from.
api.batch_insert(db, tb, cols, values, cur)

True

In [147]:
# Call the project's `_create_asset_type` helper for the engine/turbine type,
# keep what it returns in `asset_type_id`, and print it.
asset_type_id = api._create_asset_type(
    asset_type='engine',
    subtype='turbine',
    description='N-CMAPSS dataset unit',
    db=db,
    cur=cur,
)
print(asset_type_id)

1


# DO ONCE, then proceed below
## LOAD the dataset and resample it, then SAVE it
- Resampling before the save/load round trip kept raising duplicate-index errors, so the data is saved first, then reloaded and resampled here

In [None]:
# Release the in-memory frame before reading the saved copy. Rebinding to None
# (instead of `del df`) also works on a fresh kernel where `df` does not exist.
df = None
# Build the path like the label files written earlier in this notebook:
# `csv_dir` has no trailing separator, so plain string concatenation would
# produce 'data_csvdf08_all.csv' relative to the working directory.
df = pd.read_csv(os.path.join(base_dir, csv_dir, 'df08_all.csv'))
# Drop the first CSV column (presumably the index written when the frame was
# saved; confirm against the saving code).
df = df.drop(columns=[df.columns[0]])
# Interpret the row number as elapsed seconds so the frame can be resampled.
df.index = pd.to_timedelta(df.index, unit='s')
# Lowercase 's' is the current alias for seconds; the uppercase 'S' spelling
# is deprecated in recent pandas releases.
df = df.resample('10s').interpolate(method='time')
df.head()