# PREPROCESSING FILE
- loads the datasets you choose,
- creates a dataframe
- resamples
- saves
#### Do this here, then use another notebook for individual tasks
### imports

In [1]:
import os
import pickle
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
import scipy.stats as stats
import math
import random
import gc
import multiprocessing as mp
import tensorflow as tf
%matplotlib inline

# load all data for DS08
- two files, each containing a dev set and a test set

In [2]:
# Folder holding the raw HDF5 files, the files to read, and the splits stored
# in each file.
h5_dir = 'data_h5/'
fnames = ['N-CMAPSS_DS08a-009.h5', 'N-CMAPSS_DS08c-008.h5']
sets = ['dev', 'test']

df = pd.DataFrame()
# Running counter behind the `ui` column: the raw `unit` numbers repeat across
# files, so every unit of every (file, split) pair gets the next free integer.
ui = 0
# One frame per (file, split). They are concatenated once after the loop
# instead of growing `df` inside it, which re-copied all earlier rows each time.
frames = []

for filename in fnames:
    print(filename)
    for _set in sets:
        print(_set)
        with h5py.File(h5_dir+filename, 'r') as hdf:
            # hdf[key] raises KeyError when a dataset is missing; hdf.get(key)
            # would return None and only fail later with an unrelated error.
            a_data = np.array(hdf[f"A_{_set}"])
            w_data = np.array(hdf[f"W_{_set}"])
            x_data = np.array(hdf[f"X_s_{_set}"])
            xv_data = np.array(hdf[f"X_v_{_set}"])
            t_data = np.array(hdf[f"T_{_set}"])
            y_data = np.array(hdf[f"Y_{_set}"])

            # Column names are stored as bytes; later cells use these lists,
            # so they stay as notebook-level variables.
            a_labels = [l.decode('utf-8') for l in np.array(hdf['A_var'])]
            w_labels = [l.decode('utf-8') for l in np.array(hdf['W_var'])]
            x_labels = [l.decode('utf-8') for l in np.array(hdf['X_s_var'])]
            xv_labels = [l.decode('utf-8') for l in np.array(hdf['X_v_var'])]
            t_labels = [l.decode('utf-8') for l in np.array(hdf['T_var'])]

        df_a = DataFrame(data=a_data, columns=a_labels)
        # factorize numbers the units 0, 1, 2, ... in order of first appearance
        # (the same order pd.unique reports); offsetting by `ui` continues the
        # numbering from the previous split.
        unit_codes, units = pd.factorize(df_a['unit'].to_numpy())
        df_a['ui'] = unit_codes + ui
        ui += len(units)
        # 'N-CMAPSS_DS08a-009.h5' -> 'DS08a-009'
        df_a['dataset'] = filename.split('_')[1].split('.')[0]
        df_w = DataFrame(data=w_data, columns=w_labels)
        df_x = DataFrame(data=x_data, columns=x_labels)
        df_xv = DataFrame(data=xv_data, columns=xv_labels)
        df_t = DataFrame(data=t_data, columns=t_labels)
        df_y = DataFrame(data=y_data, columns=['y'])
        print(f"<{filename}> : {units}")

        frames.append(pd.concat([df_a, df_y, df_w, df_x, df_xv, df_t], axis=1))

        # Release the per-split intermediates (including xv_data and the unit
        # codes) before the next split is read.
        del df_a, df_w, df_x, df_xv, df_t, df_y, unit_codes
        del a_data, w_data, x_data, xv_data, t_data, y_data

if frames:
    # ignore_index gives the combined frame one unique 0..n-1 index. Without it
    # every split keeps its own 0..n-1 labels, so the index contains duplicates,
    # which index-based operations such as resampling can reject.
    df = pd.concat(frames, axis=0, ignore_index=True)
del frames


N-CMAPSS_DS08a-009.h5
dev
<N-CMAPSS_DS08a-009.h5> : [1. 2. 3. 4. 5. 6. 7. 8. 9.]
test
<N-CMAPSS_DS08a-009.h5> : [10. 11. 12. 13. 14. 15.]
N-CMAPSS_DS08c-008.h5
dev
<N-CMAPSS_DS08c-008.h5> : [1. 2. 3. 4. 5. 6.]
test
<N-CMAPSS_DS08c-008.h5> : [ 7.  8.  9. 10.]


In [None]:
# Distinct unit ids present in the loaded frame, in order of first appearance.
df['ui'].unique()

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [2]:
# NOTE(review): this rebinds `df` to a previously saved CSV, replacing whatever
# frame was in memory. No visible cell in this notebook writes this file name;
# presumably it was produced in another session. Confirm where it comes from.
df = pd.read_csv('data_csv/df08_all_resampled_interp_h0_fc1.csv')

# Preview the first rows of the frame that was just read.
df.head()

In [3]:
# Distinct unit ids present in the frame read from CSV, in order of first appearance.
df['ui'].unique()

array([ 0,  2,  8,  9, 14], dtype=int64)

In [12]:
# Write the X_v column names (read from `X_v_var` in the loading cell) to
# v_labels.txt, one name per line.
# `csv_dir` is assigned here because the only other cell that defines it sits
# further down the notebook; without this, a top-to-bottom run fails here with
# NameError.
csv_dir = 'data_csv/'
# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(csv_dir, exist_ok=True)
with open(csv_dir + 'v_labels.txt', "w") as f:
    f.writelines(f"{l}\n" for l in xv_labels)

## Create the augmented auxiliary data by aggregating over units

In [3]:
# Collapse the frame to one row per unit id (`ui`): for each of the selected
# columns keep the maximum value seen within that unit, then turn `ui` back
# into an ordinary column.
df_max = (
    df[['ui', 'Fc', 'unit', 'y', 'dataset']]
    .groupby('ui')
    .max()
    .reset_index()
)
df_max.head()

Unnamed: 0,ui,Fc,unit,y,dataset
0,0,1.0,1.0,71,DS08a-009
1,1,3.0,2.0,50,DS08a-009
2,2,1.0,3.0,74,DS08a-009
3,3,2.0,4.0,70,DS08a-009
4,4,2.0,5.0,55,DS08a-009


### get the labels

In [8]:
# Build the two label lists that are written to disk further down:
#   y_labels - the `T_var` names read in the loading cell
#   t_labels - the `W_var` names followed by the `X_s_var` names
# The name `t_labels` is reused for the second list, so the T_var names must be
# copied into `y_labels` only while `t_labels` still holds them. Without the
# guard, running this cell a second time copies the already-rebuilt list and
# both lists come out identical.
# If that has already happened in the current kernel, re-run the loading cell
# first to restore the T_var names.
input_labels = list(w_labels) + list(x_labels)
if t_labels != input_labels:
    y_labels = list(t_labels)
t_labels = input_labels
print(y_labels)
print(t_labels)

['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']
['alt', 'Mach', 'TRA', 'T2', 'T24', 'T30', 'T48', 'T50', 'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50', 'Nf', 'Nc', 'Wf']


## SAVE the two datasets

In [9]:
# Output folder for everything this notebook saves.
csv_dir = 'data_csv/'
# The two frame dumps are left commented out. The resample cell further down
# reads csv_dir+'df08_all.csv', so the first line has to be run once before it.
# df.to_csv(csv_dir+'df08_all.csv')
# df_max.to_csv(csv_dir+'df08_aux.csv')

# Write each label list to its own text file, one label per line.
for fname, labels in (('y_labels.txt', y_labels), ('t_labels.txt', t_labels)):
    with open(csv_dir+fname, "w") as f:
        f.write("".join(f"{l}\n" for l in labels))

# DO ONCE, then proceed below
## LOAD the dataset and resample it, then SAVE it
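- Resampling the in-memory frame directly kept raising duplicate-index errors, so the frame is first saved to CSV and reloaded; the reloaded frame gets a fresh, unique 0…n-1 index that can be converted to a time index.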

In [6]:
# Reload the full frame from CSV and thin it to one row every 10 seconds.

# Drop any frame still in memory before reading the file. Unlike `del df`,
# this does not raise NameError when `df` does not exist yet (fresh kernel).
df = None
df = pd.read_csv(csv_dir+'df08_all.csv')

# The first CSV column is the index that to_csv wrote. For the un-resampled
# frame it holds integer row numbers; once the resampled frame has been saved
# to the same path it holds timedelta strings such as '0 days 00:00:10'.
# Refuse to continue in that case, otherwise every re-run of this cell would
# silently resample the already-resampled data again.
if not pd.api.types.is_integer_dtype(df.iloc[:, 0]):
    raise ValueError(
        "df08_all.csv does not start with an integer index column; it looks "
        "like it was already resampled. Re-create it from the raw frame first."
    )
df = df.drop(columns=[df.columns[0]])

# Use the row number as elapsed seconds.
# NOTE(review): assumes consecutive rows are 1 s apart and treats all units as
# one continuous timeline. Confirm this is intended.
df.index = pd.to_timedelta(df.index, unit='s')
# NOTE(review): newer pandas releases deprecate the upper-case 'S' alias in
# favour of 's'; kept as-is for compatibility with the version used here.
df = df.resample('10S').interpolate(method='time')
df.head()

Unnamed: 0,unit,cycle,Fc,hs,ui,dataset,y,alt,Mach,TRA,...,fan_eff_mod,fan_flow_mod,LPC_eff_mod,LPC_flow_mod,HPC_eff_mod,HPC_flow_mod,HPT_eff_mod,HPT_flow_mod,LPT_eff_mod,LPT_flow_mod
0 days 00:00:00,1.0,1.0,1.0,1.0,0,DS08a-009,71,3003.0,0.39249,84.286507,...,-0.000873,-5.7e-05,-0.00081,-0.000665,0.000142,-0.000928,-0.000612,-0.000411,-0.001029,-0.000784
0 days 00:00:10,1.0,1.0,1.0,1.0,0,DS08a-009,71,3067.0,0.398034,84.286507,...,-0.000873,-5.7e-05,-0.00081,-0.000665,0.000142,-0.000928,-0.000612,-0.000411,-0.001029,-0.000784
0 days 00:00:20,1.0,1.0,1.0,1.0,0,DS08a-009,71,3155.0,0.401877,84.286507,...,-0.000873,-5.7e-05,-0.00081,-0.000665,0.000142,-0.000928,-0.000612,-0.000411,-0.001029,-0.000784
0 days 00:00:30,1.0,1.0,1.0,1.0,0,DS08a-009,71,3258.0,0.40383,84.286507,...,-0.000873,-5.7e-05,-0.00081,-0.000665,0.000142,-0.000928,-0.000612,-0.000411,-0.001029,-0.000784
0 days 00:00:40,1.0,1.0,1.0,1.0,0,DS08a-009,71,3366.0,0.40635,84.286507,...,-0.000873,-5.7e-05,-0.00081,-0.000665,0.000142,-0.000928,-0.000612,-0.000411,-0.001029,-0.000784


In [7]:
# Save the resampled frame.
# NOTE(review): this is the same path the resample cell reads its input from,
# so the un-resampled file is overwritten and re-running that cell afterwards
# would start from already-resampled data. Consider a separate file name.
df.to_csv(csv_dir+'df08_all.csv')