In [1]:
import pandas as pd
import numpy as np
import sklearn
import time

# Load the database, shouldn't take that long. This is all data!

t1 = time.time()
print('Loading database ...')
df = pd.read_hdf('database/all_data_comp.h5','table')
print('Time to load database:', time.time()-t1)


Loading database ...
Time to load database: 4.129496812820435


In [3]:
fo_1_3_total = df['FO_day_engine_1_3'].dropna()
fo_2_4_total = df['FO_day_engine_2_4'].dropna()

# this gives the total in tonnes for each day.

# The dataset is not complete overlapping in time with data from both the mass-flow meters and the
# the rest of the data. So we have to manually filter out the time interval which we are interested in.

date_begin = '2014-02-01'
date_end = '2014-12-16'

# Dict of var names we want to use.


var_names = {'ae1_frp':'AE1 FUEL RACK POSIT:1742:mm:Average:900',
             'ae2_frp':'AE2 FUEL RACK POSIT:2742:mm:Average:900',
             'ae3_frp':'AE3 FUEL RACK POSIT:3742:mm:Average:900',
             'ae4_frp':'AE4 FUEL RACK POSIT:4742:mm:Average:900',
             
             'ae1_cac_P':'AE1 CA OTL CAC PRESS:1649:Bar:Average:900',
             'ae2_cac_P':'AE2 CA OTL CAC PRESS:2649:Bar:Average:900',
             'ae3_cac_P':'AE3 CA OTL CAC PRESS:3649:Bar:Average:900',
             'ae4_cac_P':'AE4 CA OTL CAC PRESS:4649:Bar:Average:900',
             
             'ae1_cac_ca':'AE1 EXH CA OUTET 1:1543:  C:Average:900',
             'ae2_cac_ca':'AE2 EXH CA OUTET 1:2543:  C:Average:900',
             'ae3_cac_ca':'AE3 EXH CA OUTET 1:3543:  C:Average:900',
             'ae4_cac_ca':'AE4 EXH CA OUTET 1:4543:  C:Average:900',
             
             'ae1_exh':'AE1 EXH MEAN VALUE:1591:  C:Average:900',
             'ae2_exh':'AE2 EXH MEAN VALUE:2591:  C:Average:900',
             'ae3_exh':'AE3 EXH MEAN VALUE:3591:  C:Average:900',
             'ae4_exh':'AE4 EXH MEAN VALUE:4591:  C:Average:900',
             
             'ae1_fo_P':'AE1 FO INLET PRESS:1603:Bar:Average:900',
             'ae2_fo_P':'AE2 FO INLET PRESS:2603:Bar:Average:900',
             'ae3_fo_P':'AE3 FO INLET PRESS:3603:Bar:Average:900',
             'ae4_fo_P':'AE4 FO INLET PRESS:4603:Bar:Average:900',
             
             'ae1_rpm':'AE1 ENG SPEED:1745:RPM:Average:900',
             'ae2_rpm':'AE2 ENG SPEED:2745:RPM:Average:900',
             'ae3_rpm':'AE3 ENG SPEED:3745:RPM:Average:900',
             'ae4_rpm':'AE4 ENG SPEED:4745:RPM:Average:900',
             
             'me1_frp':'ME1 FUEL RACK POSIT:10005:%:Average:900',
             'me2_frp':'ME2 FUEL RACK POSIT:20005:%:Average:900',
             'me3_frp':'ME3 FUEL RACK POSIT:30005:%:Average:900',
             'me4_frp':'ME4 FUEL RACK POSIT:40005:%:Average:900',
             
             'me1_ca_T':'ME1 CA TEMP COOL OUT:1343:C:Average:900',
             'me2_ca_T':'ME2 CA TEMP COOL OUT:2343:C:Average:900',
             'me3_ca_T':'ME3 CA TEMP COOL OUT:3343:C:Average:900',
             'me4_ca_T':'ME4 CA TEMP COOL OUT:4343:C:Average:900',
             
             'me1_cac_T':'ME1 CHARGE AIR TEMP:1347:C:Average:900',
             'me2_cac_T':'ME2 CHARGE AIR TEMP:2347:C:Average:900',
             'me3_cac_T':'ME3 CHARGE AIR TEMP:3347:C:Average:900',
             'me4_cac_T':'ME4 CHARGE AIR TEMP:4347:C:Average:900',
             
             'me1_exh_T':'ME1 EXH GAS MEAN:1125:C:Average:900',
             'me2_exh_T':'ME2 EXH GAS MEAN:2125:C:Average:900',
             'me3_exh_T':'ME3 EXH GAS MEAN:3125:C:Average:900',
             'me4_exh_T':'ME4 EXH GAS MEAN:4125:C:Average:900',
             
             'me1_rpm':'ME1 ENGINE SPEED:1364:rpm:Average:900',
             'me2_rpm':'ME2 ENGINE SPEED:2364:rpm:Average:900',
             'me3_rpm':'ME3 ENGINE SPEED:3364:rpm:Average:900',
             'me4_rpm':'ME4 ENGINE SPEED:4364:rpm:Average:900',
             
             'fo_booster_13':'FO BOOST 1 CONSUMPT:6165:m3/h:Average:900',
             'fo_booster_24':'FO BOOST 2 CONSUMPT:6166:m3/h:Average:900'}

for names in var_names:
    if var_names[names] in list(df):
        #print(var_names[names])
        pass
    else:
        print('*** VAR MISSING *** ', var_names[names], ' *** VAR MISSING ***')

        
eng_13 = [var_names['ae1_frp'],
          var_names['ae3_frp'],
          var_names['ae1_cac_P'],
          var_names['ae3_cac_P'],
          var_names['ae1_cac_ca'],
          var_names['ae3_cac_ca'],
          var_names['ae1_exh'],
          var_names['ae3_exh'],
          var_names['ae1_fo_P'],
          var_names['ae3_fo_P'],
          var_names['ae1_rpm'],
          var_names['ae3_rpm'],
          var_names['me1_frp'],
          var_names['me3_frp'],
          var_names['me1_ca_T'],
          var_names['me3_ca_T'],
          var_names['me1_cac_T'],
          var_names['me3_cac_T'],
          var_names['me1_exh_T'],
          var_names['me3_exh_T'],
          var_names['me1_rpm'],
          var_names['me3_rpm']]
          
eng_24 = [var_names['ae2_frp'],
          var_names['ae4_frp'],
          var_names['ae2_cac_P'],
          var_names['ae4_cac_P'],
          var_names['ae2_cac_ca'],
          var_names['ae4_cac_ca'],
          var_names['ae2_exh'],
          var_names['ae4_exh'],
          var_names['ae2_fo_P'],
          var_names['ae4_fo_P'],
          var_names['ae2_rpm'],
          var_names['ae4_rpm'],
          var_names['me2_frp'],
          var_names['me4_frp'],
          var_names['me2_ca_T'],
          var_names['me4_ca_T'],
          var_names['me2_cac_T'],
          var_names['me4_cac_T'],
          var_names['me2_exh_T'],
          var_names['me4_exh_T'],
          var_names['me2_rpm'],
          var_names['me4_rpm']]
        

# Create np arrays. X vals are engine inputs, y are fo flows    

X_13 = np.array(df[[
            var_names['ae1_frp'],
            var_names['ae3_frp'],
            var_names['me1_frp'],
            var_names['me3_frp'],
            var_names['ae1_rpm'],
            var_names['ae3_rpm'],
            var_names['me1_rpm'],
            var_names['me3_rpm'],       
            ]][date_begin:date_end])
X_24 = np.array(df[[
            var_names['ae2_frp'],
            var_names['ae4_frp'],
            var_names['me2_frp'],
            var_names['me4_frp'],
            var_names['ae2_rpm'],
            var_names['ae4_rpm'],
            var_names['me2_rpm'],
            var_names['me4_rpm'],
            ]][date_begin:date_end])

#X_13 = np.array(df[eng_13][date_begin:date_end])
#X_24 = np.array(df[eng_24][date_begin:date_end])

y_13 = np.array(df[var_names['fo_booster_13']][date_begin:date_end]).reshape(-1,1)
y_24 = np.array(df[var_names['fo_booster_24']][date_begin:date_end]).reshape(-1,1)


In [49]:
#  modeling with sklearn auto

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

X_train_13, X_test_13, y_train_13, y_test_13 = train_test_split(X_13, y_13, train_size=0.75, test_size=0.25)

nn_13 = MLPRegressor(hidden_layer_sizes=1000,activation='logistic',max_iter=5000)

nn_13.fit(X_train_13, y_train_13)

print(nn_13.score(X_test_13,y_test_13))


  y = column_or_1d(y, warn=True)


0.979470021899


In [62]:
np.random.randint

NameError: name 'randint' is not defined

In [69]:
for i in np.random.randint(1,1000,10):
    print('Prediction:', nn_13.predict(X_13[i].reshape(1,-1)), 'Truth:',y_13[i], 'Percent:', nn_13.predict(X_13[i].reshape(1,-1))/y_13[i] *100 )
print('MSE: ',sklearn.metrics.mean_squared_error(nn_13.predict(X_13),y_13))

Prediction: [ 0.38995344] Truth: [ 0.44060957] Percent: [ 88.5031713]
Prediction: [-0.00994166] Truth: [ 0.00426667] Percent: [-233.0076947]
Prediction: [ 0.58271242] Truth: [ 0.60053339] Percent: [ 97.03247506]
Prediction: [ 0.38855713] Truth: [ 0.39801909] Percent: [ 97.62273781]
Prediction: [ 0.43924144] Truth: [ 0.50156195] Percent: [ 87.57471254]
Prediction: [ 0.8268092] Truth: [ 0.83611436] Percent: [ 98.88709449]
Prediction: [ 0.53894741] Truth: [ 0.56060956] Percent: [ 96.13596375]
Prediction: [ 0.70577583] Truth: [ 0.71428579] Percent: [ 98.80860572]
Prediction: [ 0.57808325] Truth: [ 0.61196196] Percent: [ 94.46391922]
Prediction: [ 0.60331778] Truth: [ 0.62620959] Percent: [ 96.34438491]
MSE:  0.00277486959482


In [55]:
np.random.random_integers(1,1000,10)

  """Entry point for launching an IPython kernel.


array([ 99, 847, 333, 902, 740, 190, 656, 491, 605, 982])

In [51]:
sklearn.metrics.mean_squared_error(nn_13.predict(X_13),y_13)

0.0027748695948221856

In [48]:
nn_13.get_params()

{'activation': 'tanh',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': 1000,
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_iter': 5000,
 'momentum': 0.9,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

In [79]:
import autosklearn.regression
from sklearn.model_selection import train_test_split

X_train_13, X_test_13, y_train_13, y_test_13 = train_test_split(X_13, y_13, train_size=0.75, test_size=0.25)
model = autosklearn.regression.AutoSklearnRegressor()

model.fit(X_train_13,y_train_13)




In [82]:
model.predict(X_13[1].reshape(1,-1))

array([ 0.3622281], dtype=float32)

In [None]:
for i in np.random.randint(1,1000,10):
    print('Prediction:', model.predict(X_13[i].reshape(1,-1)), 'Truth:',y_13[i], 'Percent:', model.predict(X_13[i].reshape(1,-1))/y_13[i] *100 )
print('MSE: ',sklearn.metrics.mean_squared_error(model.predict(X_13),y_13))