# Exploratory Data Analysis (EDA)

In [None]:
# import libraries
import sys
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sqlalchemy import create_engine
import seaborn as sns
import colorlover as cl

# config graphs
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# apply seaborn's defaults and override the default figure size to (26, 10)
sns.set(rc={"figure.figsize": (26, 10)})

In [None]:
# load custom functions
# extend the module search path with the parent directory so that the
# `cnrdlib` package can be imported (presumably it lives one level up - confirm)
sys.path.append("../")
# EDA plotting helpers, used below under the alias `cle`
from cnrdlib import cl_eda as cle

In [None]:
# load 5-min data from data-etl notebook
# SQLite database addressed relative to this notebook; a plain string is used
# because the URL has no placeholders to interpolate
engine = create_engine("sqlite:///../data/RawData.db")
# read the whole table, using its "timestamp" column as the index
df = pd.read_sql_table("SensorData_5min", con=engine, index_col="timestamp")
df.head()

In [None]:
# hourly data from the data-etl notebook, indexed by its "timestamp" column
dfe = pd.read_sql_table(
    table_name="SensorData_1hour",
    con=engine,
    index_col="timestamp",
)
dfe.head()

In [None]:
# examine trends: WIP cold room temperature against WIP energy (hourly data)
cle.plot_timeseries_static(
    dfe,
    "WIP_temp",
    "WIP_energy",
    "WIP cold room temperature vs energy consumption",
)

In [None]:
# external temperature against WIP energy (hourly data)
cle.plot_timeseries_static(
    dfe,
    "Ext_temp",
    "WIP_energy",
    "External temperature vs energy consumption",
)

In [None]:
# interactive view of WIP energy together with both temperature series
cle.plot_timeseries_plotly(
    dfe,
    ["WIP_energy", "WIP_temp", "Ext_temp"],
    "WIP cold room energy vs cold room temperature and external temperature",
)

- There appear to be some peaks in energy that match daily temperatures during weekdays, but not much variation on weekends. The day of the week may be a good feature to use.
- There are some periods where no clear pattern is seen - these may be specific plant conditions or maintenance.

In [None]:
# interactive view of dispatch (DP) energy together with both temperature series
cle.plot_timeseries_plotly(
    dfe,
    ["DP_energy", "DP_temp", "Ext_temp"],
    "Dispatch cold room energy vs cold room temperature and external temperature",
)

In [None]:
# histograms of external temperature and the WIP cold room columns
cle.plot_hist(
    ["Ext_temp", "WIP_temp", "WIP_energy"],
    dfe,
)

In [None]:
# histograms of external temperature and the dispatch cold room columns
cle.plot_hist(
    ["Ext_temp", "DP_temp", "DP_energy"],
    dfe,
)

- External temperature is normally distributed, which makes sense
- The `WIP_temp` looks as expected with a specific set-point. The `DP_temp` seems to have two different set-points.
- On the energy, we can see periods where the equipment was off (bin at 0).

In [None]:
# weekday feature: name of the day taken from the timestamp index
# (assumes the index is a DatetimeIndex - confirm against the loaded table)
weekday_names = dfe.index.day_name()
dfe["weekday"] = weekday_names

In [None]:
# density of WIP energy, one curve per weekday
sns.kdeplot(
    x="WIP_energy",
    hue="weekday",
    data=dfe,
    palette="tab10",
);

In [None]:
# density of dispatch energy, one curve per weekday
sns.kdeplot(
    x="DP_energy",
    hue="weekday",
    data=dfe,
    palette="tab10",
);

In [None]:
# WIP energy against WIP temperature with a LOWESS trend line drawn in red
sns.lmplot(
    data=dfe,
    x="WIP_temp",
    y="WIP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

- The WIP temperature is more of an operating parameter than a variable that affects energy consumption.
- The temperature is concentrated around the 4dC mark (the setpoint) and variation in energy consumption can be seen.

In [None]:
# dispatch energy against dispatch temperature with a red LOWESS trend line
sns.lmplot(
    data=dfe,
    x="DP_temp",
    y="DP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# WIP energy against external temperature with a red LOWESS trend line
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="WIP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

- There seems to be a correlation between energy and external temperature, which makes sense. The hotter it is outside, the more cooling energy is required. This also depends on the quality of the insulation.
- Other factors that are not measured are the number of times the fridge door is opened and whether hot cheese is placed in the cold room.
- There appear to be some bad data points that seem stuck at 2 for `WIP_energy`

In [None]:
# dispatch energy against external temperature with a red LOWESS trend line
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="DP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# WIP energy against external temperature, one LOWESS fit per weekday
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="WIP_energy",
    hue="weekday",
    palette="tab10",
    lowess=True,
    aspect=2,
);

- As thought earlier, energy consumption on Saturdays and Sundays is different from other days during the week. No work takes place on weekends.
- During the week, there is constant movement in and out of the cold room and adding new product.

In [None]:
# dispatch energy against external temperature, one LOWESS fit per weekday
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="DP_energy",
    hue="weekday",
    palette="tab10",
    lowess=True,
    aspect=2,
);

In [None]:
# preview the first five rows, now including the weekday column
dfe.head(n=5)

## Calculate Specific Energy Consumptions (SEC)

In [None]:
# SEC is computed here as energy divided by external temperature, row by row
# NOTE(review): Ext_temp values at or near 0 give inf or very large ratios,
# and negative values flip the sign - confirm the temperature range in the data
dfe["SEC_WIP"] = dfe["WIP_energy"].div(dfe["Ext_temp"])
dfe["SEC_DP"] = dfe["DP_energy"].div(dfe["Ext_temp"])

In [None]:
# WIP specific energy consumption against external temperature
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="SEC_WIP",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# dispatch specific energy consumption against external temperature
sns.lmplot(
    data=dfe,
    x="Ext_temp",
    y="SEC_DP",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

## Calculate cooling degree days as a potential feature

In [None]:
# use setpoint as base temperature
# subtracted from Ext_temp below to obtain the cooling degree values
# (presumably in the same unit as Ext_temp, i.e. degrees Celsius - confirm)
base_temperature = 4

In [None]:
# signed difference between external temperature and the base temperature;
# negative values are zeroed out in the following cells
dfe["CDD"] = dfe["Ext_temp"].sub(base_temperature)
dfe.head(10)

In [None]:
# flag rows where the external temperature exceeds the base temperature:
# 1 when CDD > 0, otherwise 0 (rows with a missing CDD compare False -> 0).
# Built in a single vectorised assignment instead of chained indexing
# (dfe[col][mask] = 1), which raises SettingWithCopyWarning and silently
# leaves the frame unchanged when pandas copy-on-write is enabled.
dfe["CDD_check"] = (dfe["CDD"] > 0).astype("int64")
dfe.head(10)

In [None]:
# list the rows whose flag is 0, i.e. CDD was not positive
dfe.loc[dfe["CDD_check"] == 0, "CDD_check"]

In [None]:
# keep only the positive part of CDD: multiplying by the 0/1 flag zeroes
# the rows where the external temperature is at or below the base temperature
dfe["CDD_calc"] = dfe["CDD"].mul(dfe["CDD_check"])

In [None]:
# aggregate the hourly frame to calendar days:
# degree values and energies are summed, temperatures are averaged
dfe_daily = dfe.resample("D").agg(
    {
        "CDD_calc": "sum",
        "WIP_energy": "sum",
        "DP_energy": "sum",
        "WIP_temp": "mean",
        "DP_temp": "mean",
        "Ext_temp": "mean",
    }
)
dfe_daily.head()

In [None]:
# the daily value is a sum of hourly degree values; divide by 24 to
# express it as degree days
dfe_daily["CDD_calc"] = dfe_daily["CDD_calc"].div(24)
dfe_daily.head()

In [None]:
# daily WIP energy against cooling degree days with a red LOWESS trend line
sns.lmplot(
    data=dfe_daily,
    x="CDD_calc",
    y="WIP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# daily dispatch energy against cooling degree days with a red LOWESS trend line
sns.lmplot(
    data=dfe_daily,
    x="CDD_calc",
    y="DP_energy",
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# daily dispatch energy on the primary y-axis, with the cold room and
# external temperatures sharing the secondary y-axis
# (make_subplots already returns the figure, so no separate go.Figure() is needed)
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=dfe_daily.index, y=dfe_daily.DP_energy, mode='lines', name='energy'), secondary_y=False)
fig.add_trace(go.Scatter(x=dfe_daily.index, y=dfe_daily.DP_temp, mode='lines', name='cold room temp'), secondary_y=True)
fig.add_trace(go.Scatter(x=dfe_daily.index, y=dfe_daily.Ext_temp, mode='lines', name='external temp'), secondary_y=True)
fig.show()

In [None]:
# daily dispatch energy against mean external temperature, red LOWESS trend line
sns.lmplot(
    x="Ext_temp",
    y="DP_energy",
    data=dfe_daily,
    lowess=True,
    aspect=2,
    line_kws=dict(color="red"),
);

In [None]:
# distribution of the daily mean dispatch cold room temperature
sns.histplot(
    x="DP_temp",
    data=dfe_daily,
);

In [None]:
# use 3.5 dC as point to split the setpoint:
# days with a mean DP_temp above 3.5 are labelled "high", all others "low".
# The label is written with a single .loc assignment instead of chained
# indexing (df[col][mask] = value), which raises SettingWithCopyWarning and
# silently leaves the frame unchanged when pandas copy-on-write is enabled.
dfe_daily["DP_setpoint"] = "low"
dfe_daily.loc[dfe_daily["DP_temp"] > 3.5, "DP_setpoint"] = "high"

In [None]:
# daily dispatch energy against external temperature, one LOWESS fit per setpoint label
sns.lmplot(
    x="Ext_temp",
    y="DP_energy",
    hue="DP_setpoint",
    data=dfe_daily,
    lowess=True,
    aspect=2,
);

- As expected, the lower the setpoint, the more energy is being used. Noting that energy is highly influenced by the external temperature.

In [None]:
# scatter plot of daily dispatch energy vs external temperature, labelled with
# the date of each point, over a 2D histogram contour of the same data
scl = cl.scales["9"]["seq"]["Blues"]
# plotly colorscale: [position, colour] pairs with positions evenly spaced in [0, 1]
colorscale = [[idx / (len(scl) - 1), colour] for idx, colour in enumerate(scl)]

trace = go.Scatter(
    x=dfe_daily.Ext_temp,
    y=dfe_daily.DP_energy,
    text=dfe_daily.index,  # the index (date) is used as both label and hover text
    mode="markers+text",
    textposition="top center",
    hoverinfo="text",
    marker=dict(opacity=0.5, sizemin=5, sizemode="area"),
)
trace_c = go.Histogram2dContour(
    x=dfe_daily.Ext_temp,
    y=dfe_daily.DP_energy,
    ncontours=5,
    colorscale=colorscale,
    showscale=False,
    opacity=0.3,
)
data = [trace, trace_c]
layout = go.Layout(title="Scatter plot")
fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# remove two days treated as outliers
# (presumably picked from the labelled scatter plot - confirm)
dfe_daily.drop(
    labels=["2021-05-02", "2021-05-04"],
    inplace=True,
)

In [None]:
# keep only the days with strictly positive dispatch energy
dfe_daily = dfe_daily.loc[dfe_daily["DP_energy"] > 0]

In [None]:
# same per-setpoint plot, now on the filtered daily data
sns.lmplot(
    x="Ext_temp",
    y="DP_energy",
    hue="DP_setpoint",
    data=dfe_daily,
    lowess=True,
    aspect=2,
);

In [None]:
# spread of daily dispatch energy for each setpoint label
sns.boxplot(
    data=dfe_daily,
    x="DP_setpoint",
    y="DP_energy",
);

## Estimate potential savings
- This method estimates energy savings assuming that the setpoint at 4 dC is within spec for the product. Anything lower than that is assumed to be cooling more than needed and thus consuming more energy than needed.
- An energy model will be built to compare this basic analysis with a more robust and accurate method.

In [None]:
# mean daily dispatch energy for each setpoint group and the gap between them
low_sp_mean = dfe_daily.loc[dfe_daily["DP_setpoint"] == "low", "DP_energy"].mean()
high_sp_mean = dfe_daily.loc[dfe_daily["DP_setpoint"] == "high", "DP_energy"].mean()
mean_difference = low_sp_mean - high_sp_mean

# scale the daily gap to a 365-day year; 1.04 and 1.80 are the factors used
# for kg CO2e and Rand per kWh respectively
annual_difference = mean_difference * 365
print(f"Mean energy difference is {mean_difference:0.2f} kWh per day")
print(f"Over a year, that equates to {annual_difference:0.0f} kWh per annum")
print(f"That is approximately a reduction of {annual_difference*1.04:0.0f} kg CO2e and saving R {annual_difference*1.80:0.0f} per annum")

In [None]:
# daily dispatch energy split by setpoint group
energy_low = dfe_daily.loc[dfe_daily["DP_setpoint"] == "low", "DP_energy"]
energy_high = dfe_daily.loc[dfe_daily["DP_setpoint"] == "high", "DP_energy"]

total_energy_low = energy_low.sum()
total_energy_high = energy_high.sum()

# average per observed day in each group, extrapolated to 365 days
estimated_annual_energy_low = total_energy_low / energy_low.count() * 365
estimated_annual_energy_high = total_energy_high / energy_high.count() * 365
total_difference = estimated_annual_energy_low - estimated_annual_energy_high
# NOTE(review): the percentage is taken relative to the high set-point estimate;
# confirm whether the low set-point estimate was intended as the baseline
percentage_saving = total_difference / estimated_annual_energy_high * 100

print(f"Estimated annual energy consumption at low set-point: {estimated_annual_energy_low:0.0f} kWh")
print(f"Estimated annual energy consumption at high set-point: {estimated_annual_energy_high:0.0f} kWh")
print(f"Total estimated annual energy consumption difference: {total_difference:0.0f} kWh")
print(f"Percentage potential saving: {percentage_saving:0.2f}%")

In [None]:
# persist the cleaned daily frame for the modelling step,
# overwriting the table if it already exists
dfe_daily.to_sql(
    name="DailyDataEnergy",
    con=engine,
    if_exists="replace",
)