In [None]:
'''
Data munging libraries

'''
import pandas as pd
import numpy as np
import statsmodels.api as sm
import random
import joblib
'''
Visualization Libraries

'''
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%matplotlib notebook
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 2) 
from bokeh.plotting import figure, output_notebook, show, gridplot
from bokeh.layouts import row
from bokeh.io import output_notebook

'''
ML libraries

'''

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn import tree
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, RANSACRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score;

# Define file name of model_data

In [None]:
# Relative path to the gzip-compressed CSV that holds the modelling data set.
file_name_model_data = "../model_data/hrsg_model_data.gzip"

In [None]:
# Load the modelling data. Compression is named explicitly because the file
# ends in ".gzip", which pandas does not list among the extensions it infers
# compression from.
model_df = pd.read_csv(
    file_name_model_data,
    encoding="ISO-8859-2",
    compression="gzip",
)

In [None]:
# Summary statistics of every numeric column in the raw data, shown as the cell output.
model_df.describe()

In [None]:
# Short aliases for the model_df column names used by the cells below.

# Generator watts for GT8 and GT9 ({Avg} and {StdDev} variants).
mw1 = "GT8 Generator Watts Max Selected {Avg}"
mw2 = "GT9 Generator Watts Max Selected {Avg}"
std1 = "GT8 Generator Watts Max Selected {StdDev}"
std2 = "GT9 Generator Watts Max Selected {StdDev}"

# Site barometric pressure, GT8 fuel energy flow and GT8 exhaust temperature.
baro = "SITE AMBIENT CONDITIONS BARO PRESS XMTR {Avg}"
fuel = "GT8 Fuel Energy Flow MMBTU"
exh = "GT8 Exhaust Temp Median Corrected By Average {Avg}"

# HRSG 8 HP steam: flow, outlet pressure, outlet temperature.
hpsteam1 = "HRSG 8 HP STEAM FLOW {KPPH}"
hppress1 = "HRSG 8 HP STEAM OUTLET PRESSURE {Avg}"
hptemp1 = "HRSG 8 HP STEAM OUTLET TEMPERATURE {Avg}"

# HRSG 8 LP steam: flow, outlet pressure, outlet temperature.
lpsteam1 = "HRSG 8 LP CORRECTED AND SELECTED STEAM FLOW {KPPH}"
lppress1 = "HRSG 8 LP STEAM OUTLET PRESSURE TRANSMITTER {Avg}"
lptemp1 = "HRSG 8 LP STEAM OUTLET TEMPERATURE {Avg}"

# HRSG 8 DB fuel column.
dbfuel1 = "HRSG 8 DB MMBTU 2"

# Cold reheat (CRH): flow, pressure, temperature.
crhflow1 = "HRSG 8 COLD REHEAT STEAM FLOW DUALSEL {KPPH}"
crhpress1 = "CRH 8001 PRESSURE A {Avg}"
crhtemp1 = "CRH TO HRSG 8001 THERMOCOUPLE {Avg}"

# IP steam flow.
# An earlier binding, left disabled, pointed ipflow1 at
# 'HRSG 8 COLD REHEAT STEAM FLOW DUALSEL {Avg}'.
ipflow1 = "HRSG 8 DUALSEL IP STEAM FLOW {KPPH}"

# Hot reheat (HRH): pressure and temperature.
hrhpress1 = "HRH 8001 PRESSURE A {Avg}"
hrhtemp1 = "HRSG 8001 HRH MAIN LINE THERMOCOUPLE {Avg}"

# HP superheater 1 outlet thermocouples A-D; the doubled braces render as a literal "{Avg}".
sh1a, sh1b, sh1c, sh1d = (
    f"HRSG 8 HP SUPERHEATER 1 OUTLET THERMOCOUPLE {tc} {{Avg}}" for tc in "ABCD"
)



In [None]:
# Modelling idea: HRSG HP/IP/LP (flow, pressure, temperature) as a function of
# (MW, GT fuel, exhaust temperature, duct firing fuel).
# Narrow model_df to the columns the rest of the notebook works with.
hrsg_columns = [
    mw1, mw2, fuel, exh,
    hpsteam1, hppress1, hptemp1,
    lpsteam1, lppress1, lptemp1,
    dbfuel1,
    crhflow1, crhpress1, crhtemp1,
    ipflow1, hrhpress1, hrhtemp1,
    sh1a, sh1b, sh1c, sh1d,
    std1,
]
hrsgdf = model_df[hrsg_columns]

In [None]:
# Keep only the rows that satisfy every threshold below, then renumber the
# index from zero.
keep_rows = (
    hrsgdf[hpsteam1].gt(10)
    & hrsgdf[mw1].gt(80)
    & hrsgdf[mw2].gt(80)
    & hrsgdf[std1].lt(0.5)
    & hrsgdf[dbfuel1].gt(-5)
    & hrsgdf[dbfuel1].lt(800)
)
hrsgdf = hrsgdf.loc[keep_rows].reset_index(drop=True)

In [None]:
# Row-wise maximum of the four superheater 1 outlet thermocouple columns,
# computed on the raw NumPy values.
superheat_readings = hrsgdf[[sh1a, sh1b, sh1c, sh1d]].to_numpy()
hrsgdf["superheat 1 temp max"] = superheat_readings.max(axis=1)


In [None]:
# HRH 8 flow column: element-wise sum of the cold-reheat flow and IP steam flow columns.
hrsgdf['HRH 8 Flow'] = hrsgdf[crhflow1].add(hrsgdf[ipflow1])

In [None]:
# Aliases for the two derived columns added to hrsgdf above.
hrhflow1 = "HRH 8 Flow"
sh1 = "superheat 1 temp max"


In [None]:
# Summary statistics of the filtered frame, including the two derived columns.
hrsgdf.describe()

In [None]:
## Multi-output regression, HRSG 1 HP
# Inputs: MW, exhaust temperature, DB fuel.
# Targets: HP flow, pressure, temperature and the superheat 1 max temperature.
# The estimator is plain linear regression; the "ranf" name is kept from the
# random-forest variant that is left disabled below.
x1 = hrsgdf.loc[:, [mw1, exh, dbfuel1]]
y1 = hrsgdf.loc[:, [hpsteam1, hppress1, hptemp1, sh1]]
#hp_ranf1 = RandomForestRegressor(max_depth=10,n_estimators=150,random_state = 6567)
hp_ranf1 = LinearRegression()
hp_ranf1.fit(x1, y1)
hp_reg1 = hp_ranf1  # fit() hands back the estimator itself, so both names refer to the fitted model

In [None]:
# Linear regression of HP pressure and superheat 1 max temperature on
# MW, exhaust temperature and DB fuel.
x = hrsgdf.loc[:, [mw1, exh, dbfuel1]]
y = hrsgdf.loc[:, [hppress1, sh1]]
hppress1_lm = LinearRegression()
hppress1_lm.fit(x, y)
hrsg1_lm = hppress1_lm  # same fitted estimator under the name used when pickling



In [None]:
# In-sample residuals (prediction minus actual) of the HP model for its first
# three targets, shown as three side-by-side scatter plots against row position.
pred = hp_reg1.predict(x1)

# Column k of `pred` lines up with column k of `y1`.
flow_err, press_err, temp_err = (pred[:, k] - y1.iloc[:, k] for k in range(3))

output_notebook()

residual_figs = []
for resid in (flow_err, press_err, temp_err):
    fig = figure(plot_width=350, plot_height=350, title="Residual Plot",
                 x_axis_label='Index', y_axis_label='Error')
    fig.scatter(list(range(len(resid))), resid)
    residual_figs.append(fig)

t1, t2, t3 = residual_figs
show(gridplot([[t1, t2, t3]]))

In [None]:
# Predicted-vs-actual scatter plots for the first three HP targets.
pred = hp_reg1.predict(x1)

# Predictions get their own names: binding them to flow_err/press_err/temp_err
# would replace the residuals held under those names with raw predictions.
flow_pred = pred[:, 0]
press_pred = pred[:, 1]
temp_pred = pred[:, 2]

output_notebook()
t1 = figure(plot_width=350, plot_height=350, title="Flow", x_axis_label='actual',
            y_axis_label='predicted')
t1.scatter(y1.iloc[:, 0], flow_pred)

t2 = figure(plot_width=350, plot_height=350, title="Press", x_axis_label='actual',
            y_axis_label='predicted')
t2.scatter(y1.iloc[:, 1], press_pred)

t3 = figure(plot_width=350, plot_height=350, title="Temp", x_axis_label='actual',
            y_axis_label='predicted')
t3.scatter(y1.iloc[:, 2], temp_pred)

show(gridplot([[t1, t2, t3]]))

In [None]:
# In-sample error summary for the first three HP targets (flow, pressure,
# temperature, in that order). Two numbers are printed per target:
#   1. the standard deviation of the residuals (prediction minus actual)
#   2. the mean absolute residual relative to the actual value
# Residuals are rebuilt here from `pred` and `y1` so the figures do not depend
# on what any plotting cell last stored in flow_err/press_err/temp_err.
for k in range(3):
    actual = y1.iloc[:, k]
    resid = pred[:, k] - actual
    print(np.sqrt(np.var(resid)))
    print(np.mean(np.abs(resid / actual)))

In [None]:
## Multi-output regression, HRSG 1 LP
# Inputs: MW, exhaust temperature, DB fuel. Targets: LP flow, pressure, temperature.
# The estimator is plain linear regression; the "ranf" name is kept from the
# random-forest variant that is left disabled below.
x1 = hrsgdf.loc[:, [mw1, exh, dbfuel1]]
y1 = hrsgdf.loc[:, [lpsteam1, lppress1, lptemp1]]
#lp_ranf1 = RandomForestRegressor(max_depth=30,n_estimators=300,random_state = 6567)
lp_ranf1 = LinearRegression()
lp_ranf1.fit(x1, y1)
lp_reg1 = lp_ranf1  # fit() hands back the estimator itself, so both names refer to the fitted model

In [None]:
# In-sample residuals (prediction minus actual) of the LP model, shown as
# three side-by-side scatter plots against row position.
pred = lp_reg1.predict(x1)

# Column k of `pred` lines up with column k of `y1`.
flow_err, press_err, temp_err = (pred[:, k] - y1.iloc[:, k] for k in range(3))

output_notebook()

residual_figs = []
for resid in (flow_err, press_err, temp_err):
    fig = figure(plot_width=350, plot_height=350, title="Residual Plot",
                 x_axis_label='Index', y_axis_label='Error')
    fig.scatter(list(range(len(resid))), resid)
    residual_figs.append(fig)

t1, t2, t3 = residual_figs
show(gridplot([[t1, t2, t3]]))

In [None]:
# Predicted-vs-actual scatter plots for the three LP targets, using the
# predictions already held in `pred`.
# Predictions get their own names: binding them to flow_err/press_err/temp_err
# would replace the residuals held under those names with raw predictions.
flow_pred = pred[:, 0]
press_pred = pred[:, 1]
temp_pred = pred[:, 2]

output_notebook()
t1 = figure(plot_width=350, plot_height=350, title="Flow", x_axis_label='actual',
            y_axis_label='predicted')
t1.scatter(y1.iloc[:, 0], flow_pred)

t2 = figure(plot_width=350, plot_height=350, title="Press", x_axis_label='actual',
            y_axis_label='predicted')
t2.scatter(y1.iloc[:, 1], press_pred)

t3 = figure(plot_width=350, plot_height=350, title="Temp", x_axis_label='actual',
            y_axis_label='predicted')
t3.scatter(y1.iloc[:, 2], temp_pred)

show(gridplot([[t1, t2, t3]]))

In [None]:
# In-sample error summary for the three LP targets (flow, pressure,
# temperature, in that order). Two numbers are printed per target:
#   1. the standard deviation of the residuals (prediction minus actual)
#   2. the mean absolute residual relative to the actual value
# Residuals are rebuilt here from `pred` and `y1` so the figures do not depend
# on what any plotting cell last stored in flow_err/press_err/temp_err.
for k in range(3):
    actual = y1.iloc[:, k]
    resid = pred[:, k] - actual
    print(np.sqrt(np.var(resid)))
    print(np.mean(np.abs(resid / actual)))

In [None]:
# Scatter of HRH 8 flow against MW. fit_reg=False turns the regression fit
# off, so only the points are drawn. The trailing ';' suppresses the cell's text output.
sns.lmplot(
    data=hrsgdf,
    x=mw1,
    y=hrhflow1,
    height=6,
    fit_reg=False,
    ci=None,
    scatter_kws={"s": 25},
    line_kws={"color": "black", "linewidth": 4},
);

In [None]:
# Scatter of cold-reheat flow against MW. fit_reg=False turns the regression
# fit off, so only the points are drawn. The trailing ';' suppresses the cell's text output.
sns.lmplot(
    data=hrsgdf,
    x=mw1,
    y=crhflow1,
    height=6,
    fit_reg=False,
    ci=None,
    scatter_kws={"s": 25},
    line_kws={"color": "black", "linewidth": 4},
);

In [None]:
## Multi-output regression, HRSG 1 HRH
# Inputs: MW, exhaust temperature, DB fuel. Targets: HRH flow, pressure, temperature.
# The estimator is plain linear regression; the "ranf" name is kept from the
# random-forest variant that is left disabled below.
x1 = hrsgdf.loc[:, [mw1, exh, dbfuel1]]
y1 = hrsgdf.loc[:, [hrhflow1, hrhpress1, hrhtemp1]]
#hrh_ranf1 = RandomForestRegressor(max_depth=50,n_estimators=100,random_state = 6567)
hrh_ranf1 = LinearRegression()
hrh_ranf1.fit(x1, y1)
hrh_reg1 = hrh_ranf1  # fit() hands back the estimator itself, so both names refer to the fitted model

In [None]:
# In-sample residuals (prediction minus actual) of the HRH model, shown as
# three side-by-side scatter plots against row position.
pred = hrh_reg1.predict(x1)

# Column k of `pred` lines up with column k of `y1`.
flow_err, press_err, temp_err = (pred[:, k] - y1.iloc[:, k] for k in range(3))

output_notebook()

residual_figs = []
for resid in (flow_err, press_err, temp_err):
    fig = figure(plot_width=350, plot_height=350, title="Residual Plot",
                 x_axis_label='Index', y_axis_label='Error')
    fig.scatter(list(range(len(resid))), resid)
    residual_figs.append(fig)

t1, t2, t3 = residual_figs
show(gridplot([[t1, t2, t3]]))

In [None]:
# Predicted-vs-actual scatter plots for the three HRH targets, using the
# predictions already held in `pred`.
# Predictions get their own names: binding them to flow_err/press_err/temp_err
# would replace the residuals held under those names with raw predictions.
flow_pred = pred[:, 0]
press_pred = pred[:, 1]
temp_pred = pred[:, 2]

output_notebook()
t1 = figure(plot_width=350, plot_height=350, title="Flow", x_axis_label='actual',
            y_axis_label='predicted')
t1.scatter(y1.iloc[:, 0], flow_pred)

t2 = figure(plot_width=350, plot_height=350, title="Press", x_axis_label='actual',
            y_axis_label='predicted')
t2.scatter(y1.iloc[:, 1], press_pred)

t3 = figure(plot_width=350, plot_height=350, title="Temp", x_axis_label='actual',
            y_axis_label='predicted')
t3.scatter(y1.iloc[:, 2], temp_pred)

show(gridplot([[t1, t2, t3]]))

In [None]:
# In-sample error summary for the three HRH targets (flow, pressure,
# temperature, in that order). Two numbers are printed per target:
#   1. the standard deviation of the residuals (prediction minus actual)
#   2. the mean absolute residual relative to the actual value
# Residuals are rebuilt here from `pred` and `y1` so the figures do not depend
# on what any plotting cell last stored in flow_err/press_err/temp_err.
for k in range(3):
    actual = y1.iloc[:, k]
    resid = pred[:, k] - actual
    print(np.sqrt(np.var(resid)))
    print(np.mean(np.abs(resid / actual)))

In [None]:
## Multi-output random forest regression, HRSG 1 CRH
# Inputs: MW, exhaust temperature, DB fuel. Targets: CRH flow, pressure, temperature.
# MultiOutputRegressor wraps the seeded random forest passed to it.
x1 = hrsgdf.loc[:, [mw1, exh, dbfuel1]]
y1 = hrsgdf.loc[:, [crhflow1, crhpress1, crhtemp1]]
crh_ranf1 = RandomForestRegressor(n_estimators=100, max_depth=50, random_state=6567)
crh_reg1 = MultiOutputRegressor(crh_ranf1)
crh_reg1.fit(x1, y1)

In [None]:
# In-sample residuals (prediction minus actual) of the CRH model, shown as
# three side-by-side scatter plots against row position.
pred = crh_reg1.predict(x1)

# Column k of `pred` lines up with column k of `y1`.
flow_err, press_err, temp_err = (pred[:, k] - y1.iloc[:, k] for k in range(3))

output_notebook()

residual_figs = []
for resid in (flow_err, press_err, temp_err):
    fig = figure(plot_width=350, plot_height=350, title="Residual Plot",
                 x_axis_label='Index', y_axis_label='Error')
    fig.scatter(list(range(len(resid))), resid)
    residual_figs.append(fig)

t1, t2, t3 = residual_figs
show(gridplot([[t1, t2, t3]]))

In [None]:
# Predicted-vs-actual scatter plots for the three CRH targets, using the
# predictions already held in `pred`.
# Predictions get their own names: binding them to flow_err/press_err/temp_err
# would replace the residuals held under those names with raw predictions.
flow_pred = pred[:, 0]
press_pred = pred[:, 1]
temp_pred = pred[:, 2]

output_notebook()
t1 = figure(plot_width=350, plot_height=350, title="Flow", x_axis_label='actual',
            y_axis_label='predicted')
t1.scatter(y1.iloc[:, 0], flow_pred)

t2 = figure(plot_width=350, plot_height=350, title="Press", x_axis_label='actual',
            y_axis_label='predicted')
t2.scatter(y1.iloc[:, 1], press_pred)

t3 = figure(plot_width=350, plot_height=350, title="Temp", x_axis_label='actual',
            y_axis_label='predicted')
t3.scatter(y1.iloc[:, 2], temp_pred)

show(gridplot([[t1, t2, t3]]))

In [None]:
# In-sample error summary for the three CRH targets (flow, pressure,
# temperature, in that order). Two numbers are printed per target:
#   1. the standard deviation of the residuals (prediction minus actual)
#   2. the mean absolute residual relative to the actual value
# Residuals are rebuilt here from `pred` and `y1` so the figures do not depend
# on what any plotting cell last stored in flow_err/press_err/temp_err.
for k in range(3):
    actual = y1.iloc[:, k]
    resid = pred[:, k] - actual
    print(np.sqrt(np.var(resid)))
    print(np.mean(np.abs(resid / actual)))

# 5-fold grid search over tree depth and forest size for a multi-output random
# forest, fitted on whatever `x1` / `y1` currently hold.
# NOTE(review): in top-to-bottom run order those are the CRH inputs/targets,
# although the estimator is named `hp_ranf` — confirm which data set is intended.
# The forest is seeded (same seed as the other forests in this notebook) so
# repeated runs select the same best estimator.
hp_ranf = RandomForestRegressor(random_state=6567)

# The 'estimator__' prefix routes each parameter to the forest wrapped by MultiOutputRegressor.
param_grid = {'estimator__max_depth': [10, 20, 30, 50],
              'estimator__n_estimators': [100, 200, 300, 500]}

gs = GridSearchCV(MultiOutputRegressor(hp_ranf), param_grid=param_grid, cv=5)
gs.fit(x1, y1)
gs.best_estimator_

In [None]:
# Bundle the fitted models into one dict and write it to disk with joblib.
# Each key encodes <targets><inputs> for the model stored under it.
# NOTE(review): crh_reg1 is fitted above but is not part of this bundle — confirm that is intended.
pkl = "../../../pickles/hrsg8.pkl"

models = {}
models['hp<flow|press|temp|super_heat_temp><mw|exh|db_fuel>'] = hp_reg1
models['hrh<flow|press|temp><mw|exh|db_fuel>'] = hrh_reg1
models['lp<flow|press|temp><mw|exh|db_fuel>'] = lp_reg1
models['lm<hp_press|super_heat><mw|exh|db_fuel>'] = hrsg1_lm

with open(pkl, mode="wb") as pickle_file:
    joblib.dump(models, pickle_file)
    # Echo the path that was written.
    print(pickle_file.name)