In [1]:
import pandas as pd
import torch
import sklearn
from sklearn import preprocessing
import time
import numpy as np

# Load the database, shouldn't take that long. This is all data!

t1 = time.time()
print('Loading database ...')
df = pd.read_hdf('database/all_data_comp.h5','table')
print('Time to load database:', time.time()-t1)


Loading database ...
Time to load database: 4.392061233520508


In [11]:
fo_1_3_total = df['FO_day_engine_1_3'].dropna()
fo_2_4_total = df['FO_day_engine_2_4'].dropna()

# this gives the total in tonnes for each day.

# The dataset is not complete overlapping in time with data from both the mass-flow meters and the
# the rest of the data. So we have to manually filter out the time interval which we are interested in.

date_begin = '2014-02-01'
date_end = '2014-12-16'

# Dict of var names we want to use.


var_names = {'ae1_frp':'AE1 FUEL RACK POSIT:1742:mm:Average:900',
             'ae2_frp':'AE2 FUEL RACK POSIT:2742:mm:Average:900',
             'ae3_frp':'AE3 FUEL RACK POSIT:3742:mm:Average:900',
             'ae4_frp':'AE4 FUEL RACK POSIT:4742:mm:Average:900',
             
             'ae1_cac_P':'AE1 CA OTL CAC PRESS:1649:Bar:Average:900',
             'ae2_cac_P':'AE2 CA OTL CAC PRESS:2649:Bar:Average:900',
             'ae3_cac_P':'AE3 CA OTL CAC PRESS:3649:Bar:Average:900',
             'ae4_cac_P':'AE4 CA OTL CAC PRESS:4649:Bar:Average:900',
             
             'ae1_cac_ca':'AE1 EXH CA OUTET 1:1543:  C:Average:900',
             'ae2_cac_ca':'AE2 EXH CA OUTET 1:2543:  C:Average:900',
             'ae3_cac_ca':'AE3 EXH CA OUTET 1:3543:  C:Average:900',
             'ae4_cac_ca':'AE4 EXH CA OUTET 1:4543:  C:Average:900',
             
             'ae1_exh':'AE1 EXH MEAN VALUE:1591:  C:Average:900',
             'ae2_exh':'AE2 EXH MEAN VALUE:2591:  C:Average:900',
             'ae3_exh':'AE3 EXH MEAN VALUE:3591:  C:Average:900',
             'ae4_exh':'AE4 EXH MEAN VALUE:4591:  C:Average:900',
             
             'ae1_fo_P':'AE1 FO INLET PRESS:1603:Bar:Average:900',
             'ae2_fo_P':'AE2 FO INLET PRESS:2603:Bar:Average:900',
             'ae3_fo_P':'AE3 FO INLET PRESS:3603:Bar:Average:900',
             'ae4_fo_P':'AE4 FO INLET PRESS:4603:Bar:Average:900',
             
             'ae1_rpm':'AE1 ENG SPEED:1745:RPM:Average:900',
             'ae2_rpm':'AE2 ENG SPEED:2745:RPM:Average:900',
             'ae3_rpm':'AE3 ENG SPEED:3745:RPM:Average:900',
             'ae4_rpm':'AE4 ENG SPEED:4745:RPM:Average:900',
             
             'me1_frp':'ME1 FUEL RACK POSIT:10005:%:Average:900',
             'me2_frp':'ME2 FUEL RACK POSIT:20005:%:Average:900',
             'me3_frp':'ME3 FUEL RACK POSIT:30005:%:Average:900',
             'me4_frp':'ME4 FUEL RACK POSIT:40005:%:Average:900',
             
             'me1_ca_T':'ME1 CA TEMP COOL OUT:1343:C:Average:900',
             'me2_ca_T':'ME2 CA TEMP COOL OUT:2343:C:Average:900',
             'me3_ca_T':'ME3 CA TEMP COOL OUT:3343:C:Average:900',
             'me4_ca_T':'ME4 CA TEMP COOL OUT:4343:C:Average:900',
             
             'me1_cac_T':'ME1 CHARGE AIR TEMP:1347:C:Average:900',
             'me2_cac_T':'ME2 CHARGE AIR TEMP:2347:C:Average:900',
             'me3_cac_T':'ME3 CHARGE AIR TEMP:3347:C:Average:900',
             'me4_cac_T':'ME4 CHARGE AIR TEMP:4347:C:Average:900',
             
             'me1_exh_T':'ME1 EXH GAS MEAN:1125:C:Average:900',
             'me2_exh_T':'ME2 EXH GAS MEAN:2125:C:Average:900',
             'me3_exh_T':'ME3 EXH GAS MEAN:3125:C:Average:900',
             'me4_exh_T':'ME4 EXH GAS MEAN:4125:C:Average:900',
             
             'me1_rpm':'ME1 ENGINE SPEED:1364:rpm:Average:900',
             'me2_rpm':'ME2 ENGINE SPEED:2364:rpm:Average:900',
             'me3_rpm':'ME3 ENGINE SPEED:3364:rpm:Average:900',
             'me4_rpm':'ME4 ENGINE SPEED:4364:rpm:Average:900',
             
             'fo_booster_13':'FO BOOST 1 CONSUMPT:6165:m3/h:Average:900',
             'fo_booster_24':'FO BOOST 2 CONSUMPT:6166:m3/h:Average:900'}

for names in var_names:
    if var_names[names] in list(df):
        #print(var_names[names])
        pass
    else:
        print('*** VAR MISSING *** ', var_names[names], ' *** VAR MISSING ***')

        
eng_13 = [var_names['ae1_frp'],
          var_names['ae3_frp'],
          var_names['ae1_cac_P'],
          var_names['ae3_cac_P'],
          var_names['ae1_cac_ca'],
          var_names['ae3_cac_ca'],
          var_names['ae1_exh'],
          var_names['ae3_exh'],
          var_names['ae1_fo_P'],
          var_names['ae3_fo_P'],
          var_names['ae1_rpm'],
          var_names['ae3_rpm'],
          var_names['me1_frp'],
          var_names['me3_frp'],
          var_names['me1_ca_T'],
          var_names['me3_ca_T'],
          var_names['me1_cac_T'],
          var_names['me3_cac_T'],
          var_names['me1_exh_T'],
          var_names['me3_exh_T'],
          var_names['me1_rpm'],
          var_names['me3_rpm']]
          
eng_24 = [var_names['ae2_frp'],
          var_names['ae4_frp'],
          var_names['ae2_cac_P'],
          var_names['ae4_cac_P'],
          var_names['ae2_cac_ca'],
          var_names['ae4_cac_ca'],
          var_names['ae2_exh'],
          var_names['ae4_exh'],
          var_names['ae2_fo_P'],
          var_names['ae4_fo_P'],
          var_names['ae2_rpm'],
          var_names['ae4_rpm'],
          var_names['me2_frp'],
          var_names['me4_frp'],
          var_names['me2_ca_T'],
          var_names['me4_ca_T'],
          var_names['me2_cac_T'],
          var_names['me4_cac_T'],
          var_names['me2_exh_T'],
          var_names['me4_exh_T'],
          var_names['me2_rpm'],
          var_names['me4_rpm']]
        

# Create np arrays. X vals are engine inputs, y are fo flows    
# For the first tries I am only using the variables below...

X_13 = np.array(df[[
            var_names['ae1_frp'],
            var_names['ae3_frp'],
            var_names['me1_frp'],
            var_names['me3_frp'],
            var_names['ae1_rpm'],
            var_names['ae3_rpm'],
            var_names['me1_rpm'],
            var_names['me3_rpm'],       
            ]][date_begin:date_end])
X_24 = np.array(df[[
            var_names['ae2_frp'],
            var_names['ae4_frp'],
            var_names['me2_frp'],
            var_names['me4_frp'],
            var_names['ae2_rpm'],
            var_names['ae4_rpm'],
            var_names['me2_rpm'],
            var_names['me4_rpm'],
            ]][date_begin:date_end])

#X_13 = np.array(df[eng_13][date_begin:date_end])
#X_24 = np.array(df[eng_24][date_begin:date_end])

y_13 = np.array(df[var_names['fo_booster_13']][date_begin:date_end]).reshape(-1,1)
y_24 = np.array(df[var_names['fo_booster_24']][date_begin:date_end]).reshape(-1,1)


In [12]:
#  modeling with tpot

from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

# Number of generations, should be boosted if we want better results. 3 gens take ~ 30min on my macbook...
gen = 3

X_train_13, X_test_13, y_train_13, y_test_13 = train_test_split(X_13, y_13, train_size=0.75, test_size=0.25)
tpot = TPOTRegressor(generations=gen, population_size=50, verbosity=2)
tpot.fit(X_train_13, y_train_13)

print(tpot.score(X_test_13,y_test_13))


  y = column_or_1d(y, warn=True)
Optimization Progress:  50%|█████     | 100/200 [08:21<19:44, 11.85s/pipeline]

Generation 1 - Current best internal CV score: 0.001246231618836063


Optimization Progress:  75%|███████▌  | 150/200 [21:15<14:28, 17.37s/pipeline]  

Generation 2 - Current best internal CV score: 0.0010530140427884927


                                                                              

Generation 3 - Current best internal CV score: 0.0010530140427884927

Best pipeline: ExtraTreesRegressor(input_matrix, bootstrap=False, max_features=0.95, min_samples_leaf=1, min_samples_split=6, n_estimators=100)
0.000912624265516


In [13]:

# Lets just make a quick prediction..

print(tpot.predict(X_13[3].reshape(1,-1)))
print(y_13[5])

# Export the model
tpot.export('eng13.py')

# This is how to import the model into another code:
# exec(open('eng13.py').read(), globals())

[ 0.40552998]
[ 0.4179048]


In [24]:
100-sklearn.metrics.mean_squared_error(tpot.predict(X_test_13),y_test_13)/(max(y_test_13)-min(y_test_13))*100

array([ 99.96609338])

In [19]:
print(tpot.predict(X_13[5].reshape(1,-1)))
print(y_13[5])

[ 0.40552998]
[ 0.4179048]


In [26]:
MSE = sklearn.metrics.mean_squared_error(tpot.predict(X_test_13), y_test_13)
print('Model eng1/3 with TPOT MSE:', MSE)

MSE_perc = 100-sklearn.metrics.mean_squared_error(tpot.predict(X_13),y_13)/(max(y_13)-min(y_13))*100
print('Model eng1/3 with TPOT MSE percent accuracy:', MSE_perc)

Model eng1/3 with TPOT MSE: 0.000912624265516
Model eng1/3 with TPOT MSE percent accuracy: [ 99.98512634]


In [30]:
#  The other engine pair.

# Number of generations, should be boosted if we want better results. 3 gens take ~ 30min on my macbook...
gen = 3

X_train_24, X_test_24, y_train_24, y_test_24 = train_test_split(X_13, y_13, train_size=0.75, test_size=0.25)
tpot_24 = TPOTRegressor(generations=gen, population_size=50, verbosity=2)
tpot_24.fit(X_train_24, y_train_24)

print(tpot_24.score(X_test_24,y_test_24))


  y = column_or_1d(y, warn=True)
Optimization Progress:  50%|█████     | 100/200 [10:15<07:12,  4.33s/pipeline] 

Generation 1 - Current best internal CV score: 0.0010504567993377197


Optimization Progress:  75%|███████▌  | 150/200 [15:51<07:06,  8.52s/pipeline]

Generation 2 - Current best internal CV score: 0.0010504567993377197


                                                                                

Generation 3 - Current best internal CV score: 0.0010504567993377197

Best pipeline: XGBRegressor(LassoLarsCV(input_matrix, normalize=False), learning_rate=0.1, max_depth=8, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.45)
0.00105974029125


In [36]:
# Export the model

tpot_24.export('eng24.py')


True

In [35]:

MSE_13 = sklearn.metrics.mean_squared_error(tpot.predict(X_test_13), y_test_13)
print('Model eng1/3 with TPOT MSE:', MSE_13)

MSE_13_perc = 100-sklearn.metrics.mean_squared_error(tpot.predict(X_13),y_13)/(max(y_13)-min(y_13))*100
print('Model eng1/3 with TPOT MSE percent accuracy:', MSE_13_perc)

MSE_24 = sklearn.metrics.mean_squared_error(tpot.predict(X_test_24), y_test_24)
print('Model eng2/4 with TPOT MSE:', MSE_24)

MSE_24_perc = 100-sklearn.metrics.mean_squared_error(tpot.predict(X_24),y_24)/(max(y_24)-min(y_24))*100
print('Model eng2/4 with TPOT MSE percent accuracy:', MSE_24_perc)


Model eng1/3 with TPOT MSE: 0.000912624265516
Model eng1/3 with TPOT MSE percent accuracy: [ 99.98512634]
Model eng2/4 with TPOT MSE: 0.00042016566897
Model eng2/4 with TPOT MSE percent accuracy: [ 99.19915618]
