In [1]:
import os
import pandas as pd
import numpy as np
import os
import psycopg2 # PostgreSQL database adapter for Python
from dotenv import load_dotenv # Reads the key-value pair from .env file and adds them to environment variable
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load environment variables from .env file
load_dotenv()

# Accessing credentials
db_host = os.getenv("DB_HOST")
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_port = os.getenv("DB_PORT")

In [3]:
# Connect to the database
conn = psycopg2.connect(
    host=db_host,
    dbname=db_name,
    user=db_user,
    password=db_password,
    port=db_port
)

In [4]:
query_main=query="""
    select * 
    from agg.tidy_data_final
    where site = 17
"""
df_main = pd.read_sql_query(query_main, conn)

In [5]:
pd.set_option('display.max_columns', None)
df_main = df_main[['timestamp','net_load','month', 'day', 'hour', 'day_of_week', 'season', 'avg_net_load','weekend_or_bank_holiday','solar_radiation','sunshine_duration']]
#df_main

In [6]:
#cheking the range of df_ma'timestamp']
print(df_main['timestamp'].max())

2020-03-29 22:30:00+00:00


In [7]:
#Set view format to scrollable for better visualization
pd.set_option('display.max_rows', 3000)

In [8]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(1400)
last_25_timestamps

265    2020-02-29 17:00:00+00:00
266    2020-02-29 17:30:00+00:00
268    2020-02-29 18:00:00+00:00
267    2020-02-29 18:30:00+00:00
269    2020-02-29 19:00:00+00:00
270    2020-02-29 19:30:00+00:00
272    2020-02-29 20:00:00+00:00
271    2020-02-29 20:30:00+00:00
273    2020-02-29 21:00:00+00:00
274    2020-02-29 21:30:00+00:00
275    2020-02-29 22:00:00+00:00
276    2020-02-29 22:30:00+00:00
278    2020-02-29 23:00:00+00:00
277    2020-02-29 23:30:00+00:00
280    2020-03-01 00:00:00+00:00
279    2020-03-01 00:30:00+00:00
281    2020-03-01 01:00:00+00:00
282    2020-03-01 01:30:00+00:00
284    2020-03-01 02:00:00+00:00
283    2020-03-01 02:30:00+00:00
286    2020-03-01 03:00:00+00:00
285    2020-03-01 03:30:00+00:00
288    2020-03-01 04:00:00+00:00
287    2020-03-01 04:30:00+00:00
289    2020-03-01 05:00:00+00:00
290    2020-03-01 05:30:00+00:00
291    2020-03-01 06:00:00+00:00
292    2020-03-01 06:30:00+00:00
294    2020-03-01 07:00:00+00:00
293    2020-03-01 07:30:00+00:00
295    202

In [9]:
# Create a new dataframe with the missing timestamps
missing_timestamps = pd.date_range(start='2020-03-29 00:00:00+00:00', end='2020-03-29 01:30:00+00:00', freq='30min')
missing_timestamps = pd.DataFrame(missing_timestamps, columns=['timestamp'])

# Insert the missing timestamps into the df_main
df_main = pd.concat([df_main, missing_timestamps], axis=0)

# Sort the dataframe by timestamp
df_main = df_main.sort_values('timestamp')

# Reset the index of the dataframe
df_main = df_main.reset_index(drop=True)

# Fill the other cells of the inserted rows with NaN
df_main = df_main.fillna(np.nan)
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,16.34,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,20.03,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,76.43,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,182.94,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,171.07,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17457,2020-03-29 20:30:00+00:00,287.34,3.0,29.0,21.0,6.0,2.0,326.19,1.0,0.0,0.0
17458,2020-03-29 21:00:00+00:00,236.86,3.0,29.0,22.0,6.0,2.0,291.50,1.0,0.0,0.0
17459,2020-03-29 21:30:00+00:00,207.46,3.0,29.0,22.0,6.0,2.0,267.92,1.0,0.0,0.0
17460,2020-03-29 22:00:00+00:00,193.51,3.0,29.0,23.0,6.0,2.0,220.97,1.0,0.0,0.0


In [10]:
missing_timestamps

Unnamed: 0,timestamp
0,2020-03-29 00:00:00+00:00
1,2020-03-29 00:30:00+00:00
2,2020-03-29 01:00:00+00:00
3,2020-03-29 01:30:00+00:00


In [11]:
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,16.34,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,20.03,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,76.43,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,182.94,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,171.07,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17457,2020-03-29 20:30:00+00:00,287.34,3.0,29.0,21.0,6.0,2.0,326.19,1.0,0.0,0.0
17458,2020-03-29 21:00:00+00:00,236.86,3.0,29.0,22.0,6.0,2.0,291.50,1.0,0.0,0.0
17459,2020-03-29 21:30:00+00:00,207.46,3.0,29.0,22.0,6.0,2.0,267.92,1.0,0.0,0.0
17460,2020-03-29 22:00:00+00:00,193.51,3.0,29.0,23.0,6.0,2.0,220.97,1.0,0.0,0.0


In [12]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(1700)
last_25_timestamps

15762   2020-02-23 13:00:00+00:00
15763   2020-02-23 13:30:00+00:00
15764   2020-02-23 14:00:00+00:00
15765   2020-02-23 14:30:00+00:00
15766   2020-02-23 15:00:00+00:00
15767   2020-02-23 15:30:00+00:00
15768   2020-02-23 16:00:00+00:00
15769   2020-02-23 16:30:00+00:00
15770   2020-02-23 17:00:00+00:00
15771   2020-02-23 17:30:00+00:00
15772   2020-02-23 18:00:00+00:00
15773   2020-02-23 18:30:00+00:00
15774   2020-02-23 19:00:00+00:00
15775   2020-02-23 19:30:00+00:00
15776   2020-02-23 20:00:00+00:00
15777   2020-02-23 20:30:00+00:00
15778   2020-02-23 21:00:00+00:00
15779   2020-02-23 21:30:00+00:00
15780   2020-02-23 22:00:00+00:00
15781   2020-02-23 22:30:00+00:00
15782   2020-02-23 23:00:00+00:00
15783   2020-02-23 23:30:00+00:00
15784   2020-02-24 00:00:00+00:00
15785   2020-02-24 00:30:00+00:00
15786   2020-02-24 01:00:00+00:00
15787   2020-02-24 01:30:00+00:00
15788   2020-02-24 02:00:00+00:00
15789   2020-02-24 02:30:00+00:00
15790   2020-02-24 03:00:00+00:00
15791   2020-0

In [13]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(180)
last_25_timestamps

17282   2020-03-26 05:00:00+00:00
17283   2020-03-26 05:30:00+00:00
17284   2020-03-26 06:00:00+00:00
17285   2020-03-26 06:30:00+00:00
17286   2020-03-26 07:00:00+00:00
17287   2020-03-26 07:30:00+00:00
17288   2020-03-26 08:00:00+00:00
17289   2020-03-26 08:30:00+00:00
17290   2020-03-26 09:00:00+00:00
17291   2020-03-26 09:30:00+00:00
17292   2020-03-26 10:00:00+00:00
17293   2020-03-26 10:30:00+00:00
17294   2020-03-26 11:00:00+00:00
17295   2020-03-26 11:30:00+00:00
17296   2020-03-26 12:00:00+00:00
17297   2020-03-26 12:30:00+00:00
17298   2020-03-26 13:00:00+00:00
17299   2020-03-26 13:30:00+00:00
17300   2020-03-26 14:00:00+00:00
17301   2020-03-26 14:30:00+00:00
17302   2020-03-26 15:00:00+00:00
17303   2020-03-26 15:30:00+00:00
17304   2020-03-26 16:00:00+00:00
17305   2020-03-26 16:30:00+00:00
17306   2020-03-26 17:00:00+00:00
17307   2020-03-26 17:30:00+00:00
17308   2020-03-26 18:00:00+00:00
17309   2020-03-26 18:30:00+00:00
17310   2020-03-26 19:00:00+00:00
17311   2020-0

In [14]:
# Create and insert rows to account for missing values of half hourly data betwen 2020-03-28 23:30:00+00:00 and 2020-03-29 02:00:00+00:00
# Create a new dataframe with the missing timestamps
missing_timestamps = pd.date_range(start='2020-03-29 00:00:00+00:00', end='2020-03-29 01:30:00+00:00', freq='30min')
missing_timestamps = pd.DataFrame(missing_timestamps, columns=['timestamp'])
missing_timestamps

#insert the missing timestamps into the df_main
df_main = pd.concat([df_main, missing_timestamps], axis=0)
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,16.34,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,20.03,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,76.43,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,182.94,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,171.07,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17461,2020-03-29 22:30:00+00:00,185.34,3.0,29.0,23.0,6.0,2.0,179.15,1.0,0.0,0.0
0,2020-03-29 00:00:00+00:00,,,,,,,,,,
1,2020-03-29 00:30:00+00:00,,,,,,,,,,
2,2020-03-29 01:00:00+00:00,,,,,,,,,,


## Create raws for missing data 
### (between 2020-03-28 23:30:00+00:00 and 2020-03-29 02:00:00+00:00, exclusive)

In [15]:
#min_date = df_main['timestamp'].min()
#max_date = df_main['timestamp'].max()

#date_range = pd.date_range(start=min_date, end=max_date, freq='30min')

# new df main inserting all the timestamps in date range
#df_main = df_main.set_index('timestamp').reindex(date_range).rename_axis('timestamp').reset_index()