# PREPROCESSING FILE
- loads the datasets you choose,
- creates a dataframe
- resamples
- saves
#### Do this here, then use another notebook for individual tasks
### imports

In [1]:
import os
import sys
import pickle
import h5py
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
import random
import gc
import string
import multiprocessing as mp
import tensorflow as tf
import json

# Project root = parent directory of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — TODO confirm).
base_dir = os.path.dirname(os.getcwd())

# Put the project root on the module search path so the local `package` imports below resolve.
sys.path.insert(1, base_dir)
from package.api import DB as api
import package.utils as utils

%matplotlib inline
%load_ext autoreload
%autoreload 2

## load all data

In [36]:
# Load the selected N-CMAPSS .h5 files into a single dataframe `df`.
#
# For every file and every set ('dev', 'test') the auxiliary (A), target (Y),
# scenario (W), sensor (X_s), virtual sensor (X_v) and degradation (T) arrays are
# joined column-wise; the per-set frames are then stacked row-wise.
# Besides `df`, the cell leaves the decoded column labels of the last file read
# (a_labels, w_labels, x_labels, v_labels, t_labels) in scope for later cells.
h5_dir = 'data_h5'
fnames = [
    'N-CMAPSS_DS01-005.h5',
    'N-CMAPSS_DS03-012.h5',
    'N-CMAPSS_DS04.h5',
    'N-CMAPSS_DS05.h5',
    'N-CMAPSS_DS06.h5',
    'N-CMAPSS_DS07.h5',
    'N-CMAPSS_DS08a-009.h5',
    'N-CMAPSS_DS08c-008.h5'
]

sets = ['dev', 'test']


def _read_labels(hdf, key):
    """Return the byte-string labels stored under `key`, decoded as UTF-8.

    Indexing with `hdf[key]` raises KeyError for a missing dataset instead of
    silently producing an array wrapping None, as `hdf.get(key)` would.
    """
    return [label.decode('utf-8') for label in np.array(hdf[key])]


# Per-(file, set) frames are collected and concatenated once at the end;
# growing `df` with a pd.concat per iteration re-copies every row already loaded.
frames = []

# Running counter: `unit` numbers are re-mapped to an id that keeps increasing
# across sets and files.
asset_id = 1

for filename in fnames:
    print(filename)
    dataset_name = filename.split('_')[1].split('.')[0]  # e.g. 'DS01-005'

    # Open each file once; the labels are read a single time per file.
    with h5py.File(os.path.join(base_dir, h5_dir, filename), 'r') as hdf:
        a_labels = _read_labels(hdf, 'A_var')
        w_labels = _read_labels(hdf, 'W_var')
        x_labels = _read_labels(hdf, 'X_s_var')
        v_labels = _read_labels(hdf, 'X_v_var')
        t_labels = _read_labels(hdf, 'T_var')

        for _set in sets:
            print(_set)
            df_a = pd.DataFrame(data=np.array(hdf[f"A_{_set}"]), columns=a_labels)

            # Give every unit of this set the next free asset id, in order of appearance.
            units = pd.unique(df_a['unit'])
            print(f"<{filename}> : {units}")
            df_a['asset_id'] = df_a['unit'].map(
                {unit: asset_id + offset for offset, unit in enumerate(units)}
            )
            asset_id += len(units)
            df_a['dataset'] = dataset_name

            frames.append(pd.concat(
                [
                    df_a,
                    pd.DataFrame(data=np.array(hdf[f"Y_{_set}"]), columns=['y']),
                    pd.DataFrame(data=np.array(hdf[f"W_{_set}"]), columns=w_labels),
                    pd.DataFrame(data=np.array(hdf[f"X_s_{_set}"]), columns=x_labels),
                    pd.DataFrame(data=np.array(hdf[f"X_v_{_set}"]), columns=v_labels),
                    pd.DataFrame(data=np.array(hdf[f"T_{_set}"]), columns=t_labels),
                ],
                axis=1,
            ))
            del df_a

    ####### NOTICE THE BREAK HERE!! only loading first dataset for testing purposes!!
    # Remove the break to load every file listed in `fnames`.
    break

# ignore_index=True gives the stacked frame a unique 0..n-1 index. Without it every
# set keeps its own 0-based labels, so the index contains duplicates — and the later
# cell that converts the index to datetimes would produce duplicate timestamps.
df = pd.concat(frames, axis=0, ignore_index=True)
del frames

# The .h5 arrays are floating point; these columns are identifiers / counters.
for col in ('asset_id', 'unit', 'cycle', 'hs', 'Fc'):
    df[col] = df[col].astype(int)

N-CMAPSS_DS01-005.h5
dev
<N-CMAPSS_DS01-005.h5> : [1. 2. 3. 4. 5. 6.]
test
<N-CMAPSS_DS01-005.h5> : [ 7.  8.  9. 10.]


## get the labels

In [3]:
# The degradation ("T_var") labels become the targets; the telemetry labels are
# rebuilt as the scenario (W) labels followed by the sensor (X_s) labels.
# NOTE: the cell is not idempotent — re-running it makes y_labels the telemetry list.
y_labels, t_labels = t_labels, [*w_labels, *x_labels]

for label_group in (y_labels, t_labels, v_labels):
    print(label_group)

['fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod']
['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']
['T40', 'P30', 'P45', 'W21', 'W22', 'W25', 'W31', 'W32', 'W48', 'W50', 'SmFan', 'SmLPC', 'SmHPC', 'phi']


## create the augmented auxiliary data by aggregating over units

In [4]:
# One row per asset: flight class (stored as group_id), unit number, source dataset,
# and the first / last cycle seen for that asset.
df_aux = df.groupby('asset_id', as_index=False).agg(
    group_id=('Fc', 'max'),
    unit=('unit', 'max'),
    dataset=('dataset', 'max'),
    age=('cycle', 'min'),
    eol=('cycle', 'max'),
)

# Shift the first observed cycle down by one so the age starts at 0.0 (float).
df_aux['age'] = df_aux['age'] - 1.0
df_aux.head()

Unnamed: 0,asset_id,group_id,unit,dataset,age,eol
0,1,1,1,DS01-005,0.0,100
1,2,3,2,DS01-005,0.0,75
2,3,2,3,DS01-005,0.0,100
3,4,1,4,DS01-005,0.0,95
4,5,3,5,DS01-005,0.0,89


## connect to db

In [5]:
# WARNING: the object returned here holds your credentials in plain text.
# Never print it; it is deleted as soon as the connection has been configured.
db_params = utils.get_aws_secret("/secret/ncmapssdb")
db, cur = api.connect(db_params)
db.set_session(autocommit=True)
del db_params

[INFO] connecting to db.
[INFO] connected.


## create asset type

In [10]:
# Create the asset type used by every N-CMAPSS engine unit and print what the call returns.
asset_type = api._create_asset_type(
    asset_type='engine',
    subtype='ncmapss',
    description='turbine engine from N-CMAPSS dataset unit',
    db=db,
    cur=cur,
)
print(asset_type)

   id    type  subtype                                description
0   1  engine  ncmapss  turbine engine from N-CMAPSS dataset unit


## create assets and components
#### this could be rewritten as a function for use with df_aux.apply()....

In [11]:
# Create one asset record plus its engine component for every row of df_aux.
# (Could be rewritten as a function for use with df_aux.apply().)
for i in range(len(df_aux)):
    # Look the row up once instead of re-slicing df_aux for every field.
    row = df_aux.iloc[i]

    asset = api._create_asset(
        type_id=int(asset_type.id.values[0]),
        common_name='ncmapss unit',
        age=float(row['age']),
        eol=float(row['eol']),
        # remaining useful life = end of life minus current age, in cycles
        rul=float(row['eol'] - row['age']),
        units='cycles',
        serial_number=utils.generate_serial_number(length=8),
        db=db,
        cur=cur,
    )
    print(asset)

    component = api._create_component(
        asset=asset,
        group_id=row['group_id'],
        unit=row['unit'],
        dataset=row['dataset'],
        db=db,
        cur=cur,
    )
    print(component)

   id  type_id     owner process_id serial_number   common_name  age    eol  \
0   1        1  darrahts       None      huS6ayLm  ncmapss unit  0.0  100.0   

     rul   units  
0  100.0  cycles  
None
   id  type_id     owner process_id serial_number   common_name  age   eol  \
0   2        1  darrahts       None      LPL4Nwf7  ncmapss unit  0.0  75.0   

    rul   units  
0  75.0  cycles  
None
   id  type_id     owner process_id serial_number   common_name  age    eol  \
0   3        1  darrahts       None      k2FthJ7H  ncmapss unit  0.0  100.0   

     rul   units  
0  100.0  cycles  
None
   id  type_id     owner process_id serial_number   common_name  age   eol  \
0   4        1  darrahts       None      hbpNtu9H  ncmapss unit  0.0  95.0   

    rul   units  
0  95.0  cycles  
None
   id  type_id     owner process_id serial_number   common_name  age   eol  \
0   5        1  darrahts       None      E9JxF6DG  ncmapss unit  0.0  89.0   

    rul   units  
0  89.0  cycles  
None
  

### convert index to datetime 
- given there is no time information with the provided data, set the interval at your discretion (ex: 1 second)

# TIM HERE!! TODO: grab the last record from the db and start the index from that value
### the "id" column should only start at 1 for the very first load

In [37]:
# First id to assign. TODO: replace with the last id stored in the db + 1,
# e.g. get_last_id(implement_function_here)
start_id = 1

# Interpret the integer row index as seconds since the Unix epoch and move it
# into a regular 'dt' column.
df.index = pd.to_datetime(df.index, unit='s', origin='unix').rename('dt')
df.reset_index(inplace=True)

# The fresh 0-based index, shifted by start_id, becomes the 'id' column.
df.index = (df.index + start_id).rename('id')
df.reset_index(inplace=True)
df.head()

Unnamed: 0,id,dt,unit,cycle,Fc,hs,asset_id,dataset,y,alt,...,fan_eff_mod,fan_flow_mod,LPC_eff_mod,LPC_flow_mod,HPC_eff_mod,HPC_flow_mod,HPT_eff_mod,HPT_flow_mod,LPT_eff_mod,LPT_flow_mod
0,1,1970-01-01 00:00:00,1,1,1,1,1,DS01-005,99,3013.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.000604,0.0,0.0,0.0
1,2,1970-01-01 00:00:01,1,1,1,1,1,DS01-005,99,3020.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.000604,0.0,0.0,0.0
2,3,1970-01-01 00:00:02,1,1,1,1,1,DS01-005,99,3025.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.000604,0.0,0.0,0.0
3,4,1970-01-01 00:00:03,1,1,1,1,1,DS01-005,99,3035.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.000604,0.0,0.0,0.0
4,5,1970-01-01 00:00:04,1,1,1,1,1,DS01-005,99,3043.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.000604,0.0,0.0,0.0


# NOTE: it is up to you to ensure you do not exceed your system memory when using batch insert
- the data tables are broken down slightly differently than they are presented in the dataset; see the table schema
- summary_tb: id (auto generated), asset_id, cycle, alt, Mach, TRA, T2
- telemetry_tb: dt (timestamp or datetime), all of the telemetry columns
- degradation_tb: dt (timestamp or datetime), all of the degradation columns and health state

In [38]:
# Ask the database for the column names of the three data tables, in one pass.
summary_cols, telemetry_cols, degradation_cols = (
    api.get_fields(table_name, as_list=True, db=db)
    for table_name in ('summary_tb', 'telemetry_tb', 'degradation_tb')
)

print(f"summary_cols: {summary_cols}")
print(f"telemetry_cols: {telemetry_cols}")
print(f"degradation_cols: {degradation_cols}")
print("NOTE: the 'id' column will be added back, telemetry and degradation relation on summary id")

summary_cols: ['id', 'asset_id', 'cycle', 'hs', 'alt', 'Mach', 'TRA', 'T2']
telemetry_cols: ['id', 'dt', 'Wf', 'Nf', 'Ne', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50']
degradation_cols: ['id', 'fan_eff_mod', 'fan_flow_mod', 'LPC_eff_mod', 'LPC_flow_mod', 'HPC_eff_mod', 'HPC_flow_mod', 'HPT_eff_mod', 'HPT_flow_mod', 'LPT_eff_mod', 'LPT_flow_mod']
NOTE: the 'id' column will be added back, telemetry and degradation relation on summary id


## insert summary data first, since telemetry and degradation index off it

In [23]:
# Write the summary columns of df to summary_tb in a single batch.
# The whole selection is handed over at once, so mind the memory note above.
api.batch_insert(df[summary_cols], 'summary_tb', db, cur)

executing query now...


## Insert telemetry and degradation data

## Misc usage

In [163]:
# Scratch call: create an 'engine' / 'turbine' asset type.
# The recorded output of this cell is 1 (presumably the new row's id — TODO confirm).
api._create_asset_type(asset_type='engine', subtype='turbine', description='N-CMAPSS dataset unit', db=db, cur=cur)

1

In [156]:
# Column names of asset_tb without 'id'.
# list.remove raises ValueError if 'id' is not among the returned fields.
asset_tb_cols = api.get_fields('asset_tb', as_list=True, db=db)
asset_tb_cols.remove('id')
asset_tb_cols

['owner',
 'type_id',
 'process_id',
 'serial_number',
 'common_name',
 'age',
 'eol',
 'rul',
 'units']

In [157]:
# Column names of engine_tb, displayed as the cell result.
engine_tb_cols = api.get_fields('engine_tb', as_list=True, db=db)
engine_tb_cols

['id', 'group_id', 'unit', 'dataset']

In [278]:
# List the tables that currently exist in the connected database.
db_tables = api.get_tables(db)
print(db_tables)

          table_name
0    process_type_tb
1         process_tb
2           group_tb
3         summary_tb
4       telemetry_tb
5     degradation_tb
6  engine_ncmapss_tb
7      asset_type_tb
8           asset_tb


In [328]:
# Scratch cell: reset the row counter and peek at the auxiliary frame.
i = 0
df_aux.head()

Unnamed: 0,group_id,unit,dataset,age,eol
0,1.0,1.0,DS01-005,1.0,100.0
1,3.0,2.0,DS01-005,1.0,75.0
2,2.0,3.0,DS01-005,1.0,100.0
3,1.0,4.0,DS01-005,1.0,95.0
4,3.0,5.0,DS01-005,1.0,89.0


In [239]:
# Look up a single asset by serial number; the recorded output shows the call
# issuing `select * from asset_tb where "serial_number" = ...`.
api._get_asset(serial_number='sd3kg0dk00', db=db)

select * from asset_tb where "serial_number" = 'sd3kg0dk00';
<connection object at 0x00000220A9C1C378; dsn: 'user=darrahts password=xxx dbname=ncmapss_db host=10.166.1.192 port=5432', closed: 0>


Unnamed: 0,id,type_id,owner,process_id,serial_number,common_name,age,eol,rul,units
0,1,1,darrahts,,sd3kg0dk00,ncmapss unit,0.0,100.0,100.0,cycles


In [216]:
# Check for the table named "<type>_<subtype>_tb", built from the first row of asset_type.
api.table_exists(f"{asset_type.type.values[0]}_{asset_type.subtype.values[0]}_tb", db)

In [68]:
# NOTE(review): this looks like a stale scratch call. Its argument order
# (db, table, cols, values, cur) differs from the summary insert, which passes
# (dataframe, table, db, cur), and tb / cols / values are not defined in any
# visible cell. Confirm against the current api.batch_insert signature before running.
api.batch_insert(db, tb, cols, values, cur)

True

In [None]:
# Fetch the asset type by (type, subtype) and inspect both the value and its Python type.
# NOTE(review): despite the variable name, the result may be a full record rather
# than a bare id — confirm against api._get_asset_type.
asset_type_id = api._get_asset_type(asset_type='engine', subtype='ncmapss', db=db)
print(asset_type_id)
print(type(asset_type_id))

In [273]:
# Fetch the asset type by id. In the recorded run id 3 did not exist and an empty
# DataFrame came back. This rebinds `asset_type`, which earlier cells rely on.
asset_type = api._get_asset_type(type_id=3, db=db)
print(asset_type)
print(type(asset_type))

Empty DataFrame
Columns: [id, type, subtype, description]
Index: []
<class 'pandas.core.frame.DataFrame'>
