In [26]:
import os
import sys
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
# The project root is taken to be the parent of the current working directory,
# so the notebook has to be started from a direct sub-folder of the repository.
base_dir = os.path.dirname(os.getcwd())
print(base_dir)
# Put the repository root on the import path so the in-repo `package` can be imported.
sys.path.insert(1, base_dir)
from package.api import DB as api
import package.utils as utils
import package.tuning as tuning
# Project helper; judging by the recorded cell output it reports the visible GPU
# (and appears to enable memory growth — TODO confirm in package.utils).
utils.check_gpu()

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers, optimizers, metrics
#import tensorflow_addons as tfa

import keras_tuner as kt

from kerastuner_tensorboard_logger import (
    TensorBoardLogger,
    setup_tb  # Optional
)


# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Reload edited modules automatically before executing each cell.
# NOTE(review): the recorded output shows "The autoreload extension is already loaded"
# when this cell is re-run; `%reload_ext autoreload` would avoid that message.
%load_ext autoreload
%autoreload 2


# --- Run configuration ---------------------------------------------------------
# Flight class and dataset used to select units and to tag every output of this run.
Fc = 2
dataset = 'DS08'
data_header = f'Fc-{Fc}_dataset-{dataset}'

# Working folders, all located directly under the repository root.
log_location, model_location, data_location = (
    f'{base_dir}/{folder}' for folder in ('logs', 'models', 'data')
)

# paths.csv lists artifacts by `name` with a `path` column; the paths are prefixed
# with the repository root while loading.
paths_df = pd.read_csv(f'{base_dir}/paths.csv').assign(
    path=lambda frame: base_dir + '/' + frame['path']
)



# Database connection settings are read from the environment so that no credential
# is stored in the notebook. Non-secret settings fall back to the lab defaults.
# NOTE(review): a password used to be committed in this cell; it should be rotated
# on the database server.
db_password = os.environ.get('NCMAPSS_DB_PASSWORD')
if not db_password:
    raise RuntimeError(
        'Set the NCMAPSS_DB_PASSWORD environment variable before running this notebook.'
    )

db_params = {
    'datasource.username': os.environ.get('NCMAPSS_DB_USERNAME', 'macslab'),
    'datasource.password': db_password,
    'datasource.database': os.environ.get('NCMAPSS_DB_NAME', 'ncmapss_db'),
    'datasource.url': os.environ.get('NCMAPSS_DB_HOST', '10.2.219.98'),
    'datasource.port': os.environ.get('NCMAPSS_DB_PORT', '5432'),
}
db, cur = api.connect(db_params)
db.set_session(autocommit=True)
# Remove the settings (password included) from the kernel namespace once connected.
del db_params, db_password


# Keep only the units that belong to the configured flight class and dataset.
units_df = api._get_units(db=db)
units = units_df.loc[
    (units_df['Fc'] == Fc) & (units_df['dataset'].str.contains(dataset))
]


# Fetch the summary and telemetry tables for those units and store them as float32.
tables = ['summary_tb', 'telemetry_tb']
downsample = 10
df = api._get_data(
    db=db,
    units=pd.unique(units.id),
    tables=tables,
    downsample=downsample,
)
df = df.astype(np.float32)
# Project helpers whose return values are unused; presumably they add the `time`
# and `rul` columns to `df` in place (both columns are read from `df` later on).
utils.add_time_column(units=pd.unique(units.id), df=df)
utils.add_rul_column(units=pd.unique(units.id), df=df)


# Column groups used throughout the notebook.
W_cols = ['Mach', 'alt', 'TRA', 'T2', 'time']
Xs_cols = [
    'Wf', 'Nf', 'Nc', 'T24', 'T30', 'T48', 'T50',
    'P15', 'P2', 'P21', 'P24', 'Ps30', 'P40', 'P50',
]
aux_cols = ['cycle', 'hs', 'Fc', 'asset_id']

# Load the saved "flight_effects" model and its two scalers; each file location is
# looked up by name in paths_df.
# NOTE: joblib.load unpickles the file, so these artifacts must come from a trusted source.
model = keras.models.load_model(
    paths_df.loc[paths_df['name'] == 'flight_effects', 'path'].values[0]
)
yscaler = joblib.load(
    paths_df.loc[paths_df['name'] == 'flight_effects_yscaler', 'path'].values[0]
)
xscaler = joblib.load(
    paths_df.loc[paths_df['name'] == 'flight_effects_xscaler', 'path'].values[0]
)


# Residuals: scaled Xs_cols readings minus what the flight-effects model predicts
# from the scaled W_cols inputs.
scaled_inputs = xscaler.transform(df[W_cols])  # computed once and reused below
trace = yscaler.transform(df[Xs_cols])
pred = model.predict(scaled_inputs)
res = trace - pred

# Every frame is built on df's own index so that the column-wise concat pairs rows
# correctly even when df does not carry a default 0..n-1 index.
dfx = pd.DataFrame(data=res, columns=Xs_cols, index=df.index)
df_x = pd.DataFrame(data=scaled_inputs, columns=W_cols, index=df.index)
dfx = pd.concat([dfx, df_x, df[aux_cols]], axis=1)
dfx['rul'] = df['rul'].values
# Offset the (scaled) time column by the zero-based cycle number.
dfx['time'] = dfx['time'] + (dfx['cycle'] - 1)
# Keep only the rows whose `hs` value is 0.
dfx0 = dfx[dfx.hs == 0]

/home/macslab/phm2021_data_challenge
2.8.0
[INFO] GPU?: <True> [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
expanding memory growth
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[INFO] connecting to db.
[INFO] connected.


In [18]:
# Display the run tag that is used in model, log and result file names.
data_header

'Fc-1_dataset-DS08'

In [27]:
# --- Windowing -------------------------------------------------------------------
lookback, horizon = 100, 1
n_out = 1
n_features = 19  # equals len(W_cols + Xs_cols)
input_shape = (lookback, n_features)

# --- Early-stopping criterion ------------------------------------------------------
monitor, mode = 'val_root_mean_squared_error', 'min'
min_delta = .1
patience = 10

# --- Fit settings ------------------------------------------------------------------
batch_size = 256
epochs = 120

# Accumulators that the training cell appends to (one entry per trained fold).
traces, preds, test_units = [], [], []

# Candidate hyper-parameter sets; they differ only in the `units` value.
params = [
    tuning.MyParameters(layers=3, units=n_units, dropout_rate=.2, learning_rate=.00075,
                        recurrent_dropout=0.0, l2=.00001, l1=.00001)
    for n_units in (24, 32, 64)
]


def decay_schedule(epoch, lr):
    """Return the learning rate for `epoch`, scaled down from the current rate `lr`.

    The multiplier depends on the epoch index:
        1 < epoch < 10    -> 0.99
        10 <= epoch < 25  -> 0.98
        anything else     -> 0.97  (this includes epochs 0 and 1)

    NOTE(review): epochs 0 and 1 fall through to the strongest decay; confirm that
    this is intended.
    """
    if 1 < epoch < 10:
        factor = .99
    elif 10 <= epoch < 25:
        factor = .98
    else:
        factor = .97
    return lr * factor

# Keras callbacks reused by every fit call in this notebook.

# Applies decay_schedule to obtain the learning rate of each epoch.
lr_scheduler = keras.callbacks.LearningRateScheduler(decay_schedule)

# Stops training when `monitor` has not improved by at least `min_delta` for
# `patience` epochs, and asks Keras to restore the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor=monitor,
    min_delta=min_delta,
    patience=patience,
    mode=mode,
    restore_best_weights=True,
    verbose=1,
)

In [14]:
# Show the architecture of the saved model number 0 of every hyper-parameter set
# stored under the current data_header.
for j, hyper_params in enumerate(params):
    model_name = str(hyper_params)
    model = utils.load_model(model_location, data_header, model_name, model_number=0)
    # Model.summary() prints the table itself and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    model.summary()

In [None]:
# Transfer-learning experiment: for every hyper-parameter set, and for every unit used
# in turn as the held-out test unit, a model saved for flight class 3 is loaded,
# fine-tuned on the remaining units, evaluated, saved, and its results written to JSON.
footer = 'transfer'

# One list of fitted models / score triples per hyper-parameter set.
models = [[] for _ in params]
scores = [[] for _ in params]

# Unit ids that are never drawn as the validation unit.
# NOTE(review): the reason for excluding these ids is not recorded here — confirm.
excluded_val_ids = [71, 77, 78, 79]
feature_cols = W_cols + Xs_cols

K = len(units)
start = 0
for j in range(0, len(params)):
    model_name = str(params[j])
    save_dir = model_location + '/' + data_header + f'/{footer}/' + model_name

    # `results` is rebuilt for each parameter set and dumped to its own JSON file.
    results = {f'model_{j}': {'params': params[j].__dict__}}

    for i in range(start, K):
        test_unit_id = units.iloc[i].id

        # Split by unit: unit i is the test set, one randomly drawn other unit is the
        # validation set, and every remaining unit is used for training.
        # NOTE(review): .sample(1) is unseeded, so the validation unit changes between runs.
        val_candidates = units[(units.id != test_unit_id) & ~units.id.isin(excluded_val_ids)]
        val_unit_id = val_candidates.sample(1).id.values[0]

        test_df = dfx0[dfx0.asset_id == test_unit_id]
        test_y = test_df.pop('rul')

        val_df = dfx0[dfx0.asset_id == val_unit_id]
        val_y = val_df.pop('rul')

        train_df = dfx0[(dfx0.asset_id != test_unit_id) & (dfx0.asset_id != val_unit_id)]
        train_y = train_df.pop('rul')

        print("temporalizing")

        # Project helper that turns the feature matrix and target vector into model
        # inputs and targets for the configured lookback/horizon.
        X_train, y_train = utils.temporalize_data(train_df[feature_cols].values, train_y.values,
                                                  lookback, horizon, n_features, n_out)
        X_test, y_test = utils.temporalize_data(test_df[feature_cols].values, test_y.values,
                                                lookback, horizon, n_features, n_out)
        X_val, y_val = utils.temporalize_data(val_df[feature_cols].values, val_y.values,
                                              lookback, horizon, n_features, n_out)

        X_train = np.array(X_train).astype(np.float32)
        y_train = np.array(y_train).astype(np.float32)

        X_test = np.array(X_test).astype(np.float32)
        y_test = np.array(y_test).astype(np.float32)

        X_val = np.array(X_val).astype(np.float32)
        y_val = np.array(y_val).astype(np.float32)

        # Start every fold from the same model saved for flight class 3.
        # (To train from scratch instead:
        #  tuning.Tuning(input_shape, n_out).build_bilstm_model(params[j]).)
        model = utils.load_model(model_location, f'Fc-3_dataset-{dataset}', model_name, model_number=8)

        tensorboard = keras.callbacks.TensorBoard(
            log_dir=f'{log_location}/kmeans/{data_header}/{footer}/model_{j}/test_unit_{test_unit_id}/val_unit_{val_unit_id}',
            histogram_freq=1,
            write_images=True,
            write_graph=True)

        history = model.fit(X_train,
                            y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            shuffle=False,
                            validation_data=(X_val, y_val),
                            callbacks=[early_stopping, lr_scheduler, tensorboard],
                            verbose=1)

        test_score = model.evaluate(X_test, y_test, batch_size=batch_size)

        # Kept under its own name so the residual array `res` from the data-preparation
        # cell is not overwritten.
        test_pred = model.predict(X_test)
        y_true = y_test.flatten()
        y_pred = test_pred.flatten()

        traces.append(y_true)
        preds.append(y_pred)
        test_units.append(test_unit_id)

        print(f'training on <{[int(x) for x in pd.unique(train_df.asset_id)]}>, validating on <{val_unit_id}>, testing on <{test_unit_id}>')
        plt.figure(figsize=(6, 6))
        plt.plot(y_true, c='r', label='trace')
        plt.plot(y_pred, c='b', label='pred')
        plt.legend()
        plt.show()

        # Save the fitted model in both formats (directory and single .h5 file).
        model.save(f'{save_dir}/model_{i}')
        h5_path = f'{save_dir}/model_{i}.h5'
        model.save(h5_path)

        # The entry is built from this fold's own arrays. Indexing the global
        # traces/preds/test_units lists by `i` would pick up entries of an earlier
        # model (or an earlier run of this cell), because those lists keep growing.
        # Values are converted to plain Python types so json.dump can serialise them.
        variables = {
            "size_kb": os.path.getsize(h5_path) / 1024,
            "val_rmse": history.history['val_root_mean_squared_error'][-1],
            # assumes the loaded model reports [loss, RMSE] from evaluate() — TODO confirm
            "test_rmse": test_score[1],
            "test_unit": int(test_unit_id),
            "val_unit": int(val_unit_id),
            "trace": [float(x) for x in y_true],
            "pred": [float(x) for x in y_pred],
        }

        results[f'model_{j}'][f'data_{i}'] = variables

        models[j].append(model)
        scores[j].append([(variables['val_rmse'] + variables['test_rmse']) / 2,
                          variables['val_rmse'],
                          variables['test_rmse']])

    with open(f'{data_header}_{j}_transfer.json', 'w') as outfile:
        json.dump(results, outfile)

In [15]:
# Display the current run tag (built from Fc and dataset in the configuration cell).
data_header

'Fc-3_dataset-DS08'

In [None]:
# Display the folder path built from the model location, run tag and parameter set.
# NOTE(review): `j` is whatever a previously executed loop left behind, so the result
# depends on cell execution order.
model_location + '/' + data_header + '/' + str(params[j])

In [None]:
# Read kfold_data_0.json .. kfold_data_2.json (one file per model) into a list.
kfold_data = []
for i in range(3):
    with open(f'kfold_data_{i}.json', 'r') as f:
        parsed = json.load(f)
    kfold_data.append(parsed)

In [None]:
# Display the run tag again to check which configuration is active in the kernel.
data_header

In [None]:
# Print the test RMSE of every fold stored in `results`.
# The dictionary is walked directly, so the cell does not depend on a loop variable
# left over from another cell or on a fixed number of folds.
for model_key, model_results in results.items():
    for entry_key, entry in model_results.items():
        # Fold entries are stored under 'data_<i>'; the 'params' entry is skipped.
        if entry_key.startswith('data_'):
            print(entry['test_rmse'])

In [None]:
# Gather validation and test RMSE into arrays with one row per fold and one column
# per model.
n_folds, n_models = 10, 3
val_rmse_vals = np.zeros((n_folds, n_models))
test_rmse_vals = np.zeros((n_folds, n_models))
for i in range(n_models):
    print(f"model_{i}")
    model_results = kfold_data[i][f'model_{i}']
    for j in range(n_folds):
        fold = model_results[f'data_{j}']
        print(fold['val_rmse'])
        val_rmse_vals[j, i] = fold['val_rmse']
        test_rmse_vals[j, i] = fold['test_rmse']

In [None]:
# Test RMSE table: one row per fold, one column per model.
model_columns = ['m1', 'm2', 'm3']
test_rmse_df = pd.DataFrame(data=test_rmse_vals, columns=model_columns)

In [None]:
# Remove the rows of folds 1-4 from the test-score table.
# NOTE(review): why these folds are excluded is not recorded here — confirm.
# errors='ignore' keeps the cell re-runnable: once the rows are gone, a second run
# would otherwise raise KeyError.
test_rmse_df = test_rmse_df.drop([1, 2, 3, 4], errors='ignore')
test_rmse_df

In [None]:
# Validation RMSE table: one row per fold, one column per model.
val_columns = ['m1', 'm2', 'm3']
val_rmse_df = pd.DataFrame(data=val_rmse_vals, columns=val_columns)

In [None]:
# Display the test RMSE table as it currently stands in the kernel.
test_rmse_df

In [None]:
# Mean and standard deviation of the test RMSE per model, plotted against the
# `units` value of each parameter set (24, 32, 64).
model_cols = ['m1', 'm2', 'm3']
sizes = np.array([24, 32, 64])
# Each statistic is read from that model's own column.
scores = np.array([np.mean(test_rmse_df[col].values) for col in model_cols])
stds = np.array([np.std(test_rmse_df[col].values) for col in model_cols])

results_df = pd.DataFrame(np.stack([sizes, scores, stds], axis=1), columns=['size', 'mean', 'std'])
upper = results_df['mean'] + results_df['std']
lower = results_df['mean'] - results_df['std']

plt.figure(figsize=(12, 8))
# Large dot = mean; up/down triangles = mean +/- one standard deviation.
sns.scatterplot(data=results_df, s=250, x='size', y='mean', hue='size', palette='Dark2', legend=False)
sns.scatterplot(x=results_df['size'], s=150, y=upper, marker='^', hue=results_df['size'],
                palette='Dark2', legend=False)
sns.scatterplot(x=results_df['size'], s=150, y=lower, marker='v', hue=results_df['size'],
                palette='Dark2', legend=False)

# Vertical bar joining mean - std and mean + std for each model.
for size, low, high in zip(results_df['size'], lower, upper):
    plt.plot([size, size], [low, high], color='gray')

# To plot validation scores instead, build the statistics from val_rmse_df.
plt.xlabel('size')
plt.ylabel('test RMSE (mean +/- std)')
plt.title('kfold test scores for 3 models')
plt.show()

In [None]:
# Display the ids of the selected units as an array.
units.id.values

In [None]:
# Per-fold RMSE of every model: test scores in red, validation scores in green.
# Dimensions come from the score array itself rather than fixed constants.
n_folds, n_models = test_rmse_vals.shape
x = np.arange(0, n_folds)
x_points = np.repeat(x, n_models)  # one x position per (fold, model) pair, row-major

plt.figure(figsize=(12, 8))
plt.scatter(x_points, test_rmse_vals.ravel(), c='r', label='test')
plt.scatter(x_points, val_rmse_vals.ravel(), c='g', label='validation')

# Label each x position with the id of the unit at the same position in `units`.
for i, unit_id in enumerate(units.id.values[:n_folds]):
    plt.text(i - .5, .5, f'unit {unit_id}')

plt.xlabel('fold index')
plt.ylabel('RMSE')
plt.legend()
plt.title('test unit scores for 3 models')
plt.show()

In [None]:
# One figure per (model, fold): stored prediction in red against the stored trace in blue.
for i in range(3):
    print(f"model_{i}")
    model_results = kfold_data[i][f'model_{i}']
    for j in range(10):
        fold = model_results[f'data_{j}']
        plt.figure(figsize=(6, 6))
        plt.plot(fold['pred'], c='r', label='pred')
        plt.plot(fold['trace'], c='b', label='trace')
        plt.legend()
        plt.show()