# Setup

## Library import

In [1]:
# Data manipulation
# ==============================================================================
import pandas as pd
# import matplotlib.pyplot as plt

# Visualization
# ==============================================================================
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

# Statistical tests
# ==============================================================================
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold

from statsmodels.graphics.tsaplots import plot_pacf

# GreyBox [own package]
# ==============================================================================
from greyboxmodel.model import TiTe
from greyboxmodel.fit import train_models

# Warnings
# ==============================================================================
import warnings
# NOTE(review): with no category or module filter, this hides every warning for
# the rest of the session, including deprecation notices from the libraries
# imported above.
warnings.filterwarnings('ignore')

# Autoreload extension
# ==============================================================================
# Load (or reload) IPython's autoreload extension and switch it to mode 2, which
# re-imports modules before each cell executes, so edits to imported packages
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2


# Data import
We retrieve all the required data for the analysis.

In [2]:
# Record duration
# ==============================================================================
# Length of a single record (one row of the input data), expressed in hours.
rec_duration = 1

# Input data
# ==============================================================================
# The first CSV column is used as the index and parsed as datetimes.
input_df = pd.read_csv(
    '../data/demo_data.csv',
    parse_dates=True,
    index_col=0,
)
input_df.head()

Unnamed: 0,Ph,Ti,Ta,Th
2019-12-23 00:00:00+00:00,0.0,18.1375,5.4,4.6
2019-12-23 01:00:00+00:00,0.0,18.1,5.1,4.45
2019-12-23 02:00:00+00:00,0.0,18.0125,5.0,4.15
2019-12-23 03:00:00+00:00,97.223561,17.9625,4.8,4.1
2019-12-23 04:00:00+00:00,76.20225,17.9875,5.5,68.15


We add two custom fields to the input data: `Ti0` and `Te0`. These fields define the initial conditions for `Ti` and `Te`, respectively, at each record of the input data. E.g. if a sub-model is trained on records 10–20 of the training data, the initial conditions for these params are taken from the 10th record of the input data.

In [5]:
# Initial conditions and model inputs
# ==============================================================================
# Ti0 copies the measured Ti of every record; Te0 is that same value minus 2.
input_df['Ti0'] = input_df['Ti']
input_df['Te0'] = input_df['Ti0'] - 2

# Feature matrix (inputs plus initial conditions) and target series.
feature_columns = ['Ph', 'Ta', 'Ti0', 'Te0']
input_X = input_df.loc[:, feature_columns]
input_y = input_df.loc[:, 'Ti']

print(f'Input X shape: {input_X.shape}, input y shape: {input_y.shape}')

Input X shape: (792, 4), input y shape: (792,)


In [6]:
# Chronological train/test split
# ==============================================================================
# shuffle=False keeps the records in their original order, so the test set is
# the final 5/33 of the data.
X_train, X_test, y_train, y_test = train_test_split(
    input_X,
    input_y,
    shuffle=False,
    test_size=5 / 33,
)

print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (672, 4), y_train shape: (672,)
X_test shape: (120, 4), y_test shape: (120,)


## Visualization

In [7]:
# Two stacked panels sharing the x axis: Ta, Ti and Th on the top row, Ph on the
# bottom row.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

for column in ("Ta", "Ti", "Th"):
    column_trace = go.Scatter(x=input_df.index, y=input_df[column], name=column)
    fig.add_trace(column_trace, row=1, col=1)

fig.add_trace(
    go.Scatter(x=input_df.index, y=input_df["Ph"], name="Ph"),
    row=2,
    col=1,
)

fig.update_layout(
    title_text="Input data",
    height=600,
    width=1000,
    showlegend=False,
    yaxis_title_text="Temperature [°C]",
    yaxis2_title_text="Power [kW]",
)
fig.show()

# Data processing

Use the `sklearn.model_selection.train_test_split` function to split the input data into train and test data. (Input data is 33 days long and 5 days of test data is specified)

In [5]:
# Hold out the last 5/33 of the records as the test set, without shuffling.
# NOTE(review): this repeats the split already performed in the "Data import"
# section, and the stored output of this cell reports 3 feature columns while
# input_X is built with 4 — re-run the notebook top to bottom to confirm.
X_train, X_test, y_train, y_test = train_test_split(
    input_X, input_y, shuffle=False, test_size=5 / 33
)

print(f'Train: X shape: {X_train.shape}, y shape: {y_train.shape}')
print(f'Test: X shape: {X_test.shape}, y shape: {y_test.shape}')

Train: X shape: (672, 3), y shape: (672,)
Test: X shape: (120, 3), y shape: (120,)


In [7]:
# Inspect the training features as a plain NumPy array (index and column labels are dropped).
X_train.to_numpy()

array([[ 0.  ,  5.4 ,  4.6 ],
       [ 0.  ,  5.1 ,  4.45],
       [ 0.  ,  5.  ,  4.15],
       ...,
       [ 0.  ,  0.8 , 11.3 ],
       [ 0.  ,  0.6 , 11.15],
       [ 0.  ,  1.4 , 11.1 ]])

In [12]:
# Display the training target series together with its index.
y_train

2019-12-23 00:00:00+00:00    18.1375
2019-12-23 01:00:00+00:00    18.1000
2019-12-23 02:00:00+00:00    18.0125
2019-12-23 03:00:00+00:00    17.9625
2019-12-23 04:00:00+00:00    17.9875
                              ...   
2020-01-19 19:00:00+00:00    18.3875
2020-01-19 20:00:00+00:00    18.2750
2020-01-19 21:00:00+00:00    18.1750
2020-01-19 22:00:00+00:00    18.0875
2020-01-19 23:00:00+00:00    17.9875
Name: Ti, Length: 672, dtype: float64

In [11]:
# Same training target values as a NumPy array, without the index.
y_train.to_numpy()

array([18.1375, 18.1   , 18.0125, 17.9625, 17.9875, 18.2375, 18.5125,
       18.8   , 19.325 , 19.7375, 20.1625, 20.5125, 20.6125, 20.65  ,
       20.625 , 20.6125, 20.575 , 20.475 , 20.225 , 20.025 , 19.8625,
       19.725 , 19.575 , 19.4625, 19.3625, 19.2375, 19.15  , 19.0625,
       19.075 , 19.175 , 19.2375, 19.375 , 19.675 , 20.0625, 20.4875,
       20.625 , 20.7   , 20.7625, 20.775 , 20.75  , 20.7375, 20.7   ,
       20.55  , 20.4125, 20.2625, 20.125 , 20.0125, 19.8875, 19.7875,
       19.6875, 19.5875, 19.4875, 19.4   , 19.3125, 19.2125, 19.1375,
       19.0375, 19.05  , 19.125 , 19.2125, 19.2625, 19.475 , 19.325 ,
       19.2375, 19.0375, 18.925 , 18.8625, 18.8125, 18.7125, 18.6125,
       18.5125, 18.4125, 18.3125, 18.225 , 18.15  , 18.0875, 17.9875,
       17.925 , 17.9   , 17.8125, 17.75  , 17.7125, 17.625 , 17.575 ,
       17.575 , 17.5375, 17.5125, 17.4875, 17.4625, 17.4125, 17.375 ,
       17.325 , 17.325 , 17.2375, 17.2375, 17.175 , 17.1125, 17.05  ,
       17.0125, 16.9

In [15]:
import numpy as np
from typing import cast


In [16]:
# typing.cast only informs static type checkers; at runtime it returns
# y_train.values unchanged, so this displays the underlying array.
cast(np.ndarray, y_train.values)

array([18.1375, 18.1   , 18.0125, 17.9625, 17.9875, 18.2375, 18.5125,
       18.8   , 19.325 , 19.7375, 20.1625, 20.5125, 20.6125, 20.65  ,
       20.625 , 20.6125, 20.575 , 20.475 , 20.225 , 20.025 , 19.8625,
       19.725 , 19.575 , 19.4625, 19.3625, 19.2375, 19.15  , 19.0625,
       19.075 , 19.175 , 19.2375, 19.375 , 19.675 , 20.0625, 20.4875,
       20.625 , 20.7   , 20.7625, 20.775 , 20.75  , 20.7375, 20.7   ,
       20.55  , 20.4125, 20.2625, 20.125 , 20.0125, 19.8875, 19.7875,
       19.6875, 19.5875, 19.4875, 19.4   , 19.3125, 19.2125, 19.1375,
       19.0375, 19.05  , 19.125 , 19.2125, 19.2625, 19.475 , 19.325 ,
       19.2375, 19.0375, 18.925 , 18.8625, 18.8125, 18.7125, 18.6125,
       18.5125, 18.4125, 18.3125, 18.225 , 18.15  , 18.0875, 17.9875,
       17.925 , 17.9   , 17.8125, 17.75  , 17.7125, 17.625 , 17.575 ,
       17.575 , 17.5375, 17.5125, 17.4875, 17.4625, 17.4125, 17.375 ,
       17.325 , 17.325 , 17.2375, 17.2375, 17.175 , 17.1125, 17.05  ,
       17.0125, 16.9

## Set up the model training parameters.  

The `Ti0` param is the initial condition for the internal temperature at t=0 - this is set to the first value in `y_train` and is fixed, hence `vary: False`.  

The `Te0` param is the initial condition for the building envelope temperature at t=0 - this is set to the first value in `y_train` (as an initial estimate, assuming that `Te0` will be quite close to `Ti0`) and is NOT fixed, hence `vary: True`.  

The `Ci`, `Ce`, `Rie` and `Rea` params are the initial values for the corresponding thermal parameters. As these will be fit by the model training, their default is `vary: True`. Their initial values are set arbitrarily to `1`, as it is assumed that no estimates have been calculated for them (e.g. based on the building's physical properties).


In [10]:
# Initial parameter set for the fit:
#   - Ti0 is pinned to the first training target value (vary=False);
#   - Te0 starts from the same value but is left free (vary=True);
#   - the four thermal parameters start at 1 with no explicit `vary` flag.
train_params = dict(
    Ti0=dict(value=y_train.iloc[0], vary=False),
    Te0=dict(value=y_train.iloc[0], vary=True),
    **{thermal_name: dict(value=1) for thermal_name in ('Ci', 'Ce', 'Rie', 'Rea')},
)

Instantiate the `TiTe` model and fit to the training data using the least-squares method.

In [12]:
# Build the TiTe model from the initial parameters and fit it to the training
# data with the 'leastsq' method; `model` holds whatever `fit` returns.
model = (
    TiTe(params=train_params, rec_duration=rec_duration)
    .fit(input=X_train, yhat=y_train, method='leastsq')
)

Display the result of the fit: the `Te0`, `Ci`, `Ce`, `Rie` and `Rea` params have been fit, and the table below lists the standard error estimated for each of them.

In [13]:
# Show the fitted parameter table (value, standard error, initial value, bounds, vary flag).
model.result.params

name,value,standard error,relative error,initial value,min,max,vary
Ti0,18.1375,0.0,(0.00%),18.1375,-inf,inf,False
Te0,19.6339055,0.04521507,(0.23%),18.1375,-inf,inf,True
Ci,0.97097694,0.61236448,(63.07%),1.0,-inf,inf,True
Ce,0.97985442,0.0536611,(5.48%),1.0,-inf,inf,True
Rie,1.02014937,0.07505803,(7.36%),1.0,-inf,inf,True
Rea,1.02910871,0.0879257,(8.54%),1.0,-inf,inf,True


Get the training results by calling `predict` for the training data

In [14]:
# Run the fitted model over the training inputs (keyword form, matching the
# `predict(input=...)` call used for the test data).
train_results = model.predict(input=X_train)

Plot the modelled and measured data for the train set.  

There is something wrong with the model and we get HUGE oscillations.  

In [20]:
# List the names of the variables stored on the prediction result.
train_results.var.keys()

dict_keys(['Ti', 'Te'])

In [25]:
# Train-set diagnostics
# ==============================================================================
# Left panel: measured Ti, predicted Ti and the modelled Te.
# Right panel: residuals (measured Ti minus predicted Ti).
fig = make_subplots(rows=1, cols=2)

left_panel_traces = (
    go.Scatter(x=X_train.index, y=y_train, name="Ti Measured"),
    go.Scatter(x=train_results.input.index, y=train_results.yhat, name="Ti Predicted"),
    go.Scatter(x=train_results.input.index, y=train_results.var["Te"], name="Te Modeled"),
)
for trace in left_panel_traces:
    fig.add_trace(trace, row=1, col=1)

residual_marker = dict(opacity=0.5, color="black")
fig.add_trace(
    go.Scatter(
        x=X_train.index,
        y=(y_train - train_results.yhat),
        mode="markers",
        marker=residual_marker,
        name="Residual",
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_layout(title_text="Train results", yaxis_title_text="Temperature [°C]")

fig.show()

As the models get more complex and the training data gets larger, the chances of the fitting process experiencing some sort of breakdown get higher. If we had a set of better initial conditions (than the arbitrary 1s used here) for the model parameters, the fitting process would find the global minimum with a higher probability (and quicker).  

One option would be to try to calculate these parameters from the physical properties of the building. These data are not always available though, and each model type has a different set of parameters, making this process really onerous.

The other option is to throw in an extra layer inspired by _Genetic Algorithms_ and _Machine Learning_.

In [26]:
# Predict over the test inputs, starting from the first measured test Ti and
# the last Te value modelled on the training data.
# NOTE(review): `[-1]` is positional only for array-likes; if var['Te'] is a
# label-indexed pandas Series, `.iloc[-1]` would be the explicit form — confirm.
test_results = model.predict(
    input=X_test,
    ic_params={
        'Ti0': y_test.iloc[0],
        'Te0': train_results.var['Te'][-1],
    },
)

In [28]:
# Plotting results
# ==============================================================================
# Left panel: measured Ti, predicted Ti and the modelled Te over the test set.
# Right panel: test residuals (measured Ti minus predicted Ti).
fig = make_subplots(rows=1, cols=2)

fig.add_trace(go.Scatter(x=X_test.index, y=y_test, name="Ti Measured"), row=1, col=1)
fig.add_trace(
    go.Scatter(x=test_results.input.index, y=test_results.yhat, name="Ti Predicted"),
    row=1,
    col=1,
)
fig.add_trace(
    go.Scatter(
        x=test_results.input.index,
        y=test_results.var["Te"],
        name="Te Modeled",
    ),
    row=1,
    col=1,
)

fig.add_trace(
    go.Scatter(
        # The residuals are computed from y_test, so they must be plotted against
        # the test timestamps (the original used X_train.index here, which paired
        # test residuals with training timestamps).
        x=X_test.index,
        y=(y_test - test_results.yhat),
        name="Residual",
        mode="markers",
        marker=dict(opacity=0.5, color="black"),
        showlegend=False,
    ),
    row=1,
    col=2,
)

fig.update_layout(
    title_text="Test results",
    yaxis_title_text="Temperature [°C]",
)

fig.show()

# References
We report here relevant references:
1. author1, article1, journal1, year1, url1
2. author2, article2, journal2, year2, url2