In [16]:
import os
import pandas as pd
import numpy as np
import os
import psycopg2 # PostgreSQL database adapter for Python
from dotenv import load_dotenv # Reads the key-value pair from .env file and adds them to environment variable
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.losses import MeanAbsoluteError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import load_model
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import warnings
warnings.filterwarnings("ignore")

In [17]:
# Load environment variables from .env file
load_dotenv()

# Accessing credentials
db_host = os.getenv("DB_HOST")
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_port = os.getenv("DB_PORT")

In [18]:
# Connect to the database
conn = psycopg2.connect(
    host=db_host,
    dbname=db_name,
    user=db_user,
    password=db_password,
    port=db_port
)

In [19]:
query_main=query="""
    select * 
    from agg.tidy_data_final
    where site = 25
"""
df_main = pd.read_sql_query(query_main, conn)

In [20]:
pd.set_option('display.max_columns', None)
df_main = df_main[['timestamp','net_load','month', 'day', 'hour', 'day_of_week', 'season', 'avg_net_load','weekend_or_bank_holiday','solar_radiation','sunshine_duration']]
#df_main

In [21]:
#cheking the range of df_ma'timestamp']
print(df_main['timestamp'].max())

2020-03-29 22:30:00+00:00


In [22]:
#Set view format to scrollable for better visualization
pd.set_option('display.max_rows', 3000)

In [23]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(1400)
last_25_timestamps

6115   2020-02-29 16:30:00+00:00
6118   2020-02-29 17:00:00+00:00
6117   2020-02-29 17:30:00+00:00
6120   2020-02-29 18:00:00+00:00
6119   2020-02-29 18:30:00+00:00
6121   2020-02-29 19:00:00+00:00
6122   2020-02-29 19:30:00+00:00
6123   2020-02-29 20:00:00+00:00
6124   2020-02-29 20:30:00+00:00
6126   2020-02-29 21:00:00+00:00
6125   2020-02-29 21:30:00+00:00
6128   2020-02-29 22:00:00+00:00
6127   2020-02-29 22:30:00+00:00
6129   2020-02-29 23:00:00+00:00
6130   2020-02-29 23:30:00+00:00
6131   2020-03-01 00:00:00+00:00
6132   2020-03-01 00:30:00+00:00
6133   2020-03-01 01:00:00+00:00
6134   2020-03-01 01:30:00+00:00
6135   2020-03-01 02:00:00+00:00
6136   2020-03-01 02:30:00+00:00
6137   2020-03-01 03:00:00+00:00
6138   2020-03-01 03:30:00+00:00
6139   2020-03-01 04:00:00+00:00
6140   2020-03-01 04:30:00+00:00
6142   2020-03-01 05:00:00+00:00
6141   2020-03-01 05:30:00+00:00
6143   2020-03-01 06:00:00+00:00
6144   2020-03-01 06:30:00+00:00
6145   2020-03-01 07:00:00+00:00
6146   202

In [24]:
# Create a new dataframe with the missing timestamps
missing_timestamps = pd.date_range(start='2020-03-29 00:00:00+00:00', end='2020-03-29 01:30:00+00:00', freq='30min')
missing_timestamps = pd.DataFrame(missing_timestamps, columns=['timestamp'])

# Insert the missing timestamps into the df_main
df_main = pd.concat([df_main, missing_timestamps], axis=0)

# Sort the dataframe by timestamp
df_main = df_main.sort_values('timestamp')

# Reset the index of the dataframe
df_main = df_main.reset_index(drop=True)

# Fill the other cells of the inserted rows with NaN
df_main = df_main.fillna(np.nan)
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,9.11000,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,12.11000,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,10.06000,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,12.49000,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,10.31000,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17435,2020-03-29 20:30:00+00:00,3.12000,3.0,29.0,21.0,6.0,2.0,326.19,1.0,0.0,0.0
17436,2020-03-29 21:00:00+00:00,2.47000,3.0,29.0,22.0,6.0,2.0,291.50,1.0,0.0,0.0
17437,2020-03-29 21:30:00+00:00,3.45000,3.0,29.0,22.0,6.0,2.0,267.92,1.0,0.0,0.0
17438,2020-03-29 22:00:00+00:00,5.13000,3.0,29.0,23.0,6.0,2.0,220.97,1.0,0.0,0.0


In [25]:
missing_timestamps

Unnamed: 0,timestamp
0,2020-03-29 00:00:00+00:00
1,2020-03-29 00:30:00+00:00
2,2020-03-29 01:00:00+00:00
3,2020-03-29 01:30:00+00:00


In [26]:
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,9.11000,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,12.11000,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,10.06000,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,12.49000,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,10.31000,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17435,2020-03-29 20:30:00+00:00,3.12000,3.0,29.0,21.0,6.0,2.0,326.19,1.0,0.0,0.0
17436,2020-03-29 21:00:00+00:00,2.47000,3.0,29.0,22.0,6.0,2.0,291.50,1.0,0.0,0.0
17437,2020-03-29 21:30:00+00:00,3.45000,3.0,29.0,22.0,6.0,2.0,267.92,1.0,0.0,0.0
17438,2020-03-29 22:00:00+00:00,5.13000,3.0,29.0,23.0,6.0,2.0,220.97,1.0,0.0,0.0


In [27]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(1700)
last_25_timestamps

15740   2020-02-23 12:30:00+00:00
15741   2020-02-23 13:00:00+00:00
15742   2020-02-23 13:30:00+00:00
15743   2020-02-23 14:00:00+00:00
15744   2020-02-23 14:30:00+00:00
15745   2020-02-23 15:00:00+00:00
15746   2020-02-23 15:30:00+00:00
15747   2020-02-23 16:00:00+00:00
15748   2020-02-23 16:30:00+00:00
15749   2020-02-23 17:00:00+00:00
15750   2020-02-23 17:30:00+00:00
15751   2020-02-23 18:00:00+00:00
15752   2020-02-23 18:30:00+00:00
15753   2020-02-23 19:00:00+00:00
15754   2020-02-23 19:30:00+00:00
15755   2020-02-23 20:00:00+00:00
15756   2020-02-23 20:30:00+00:00
15757   2020-02-23 21:00:00+00:00
15758   2020-02-23 21:30:00+00:00
15759   2020-02-23 22:00:00+00:00
15760   2020-02-23 22:30:00+00:00
15761   2020-02-23 23:00:00+00:00
15762   2020-02-23 23:30:00+00:00
15763   2020-02-24 00:00:00+00:00
15764   2020-02-24 00:30:00+00:00
15765   2020-02-24 01:00:00+00:00
15766   2020-02-24 01:30:00+00:00
15767   2020-02-24 02:00:00+00:00
15768   2020-02-24 02:30:00+00:00
15769   2020-0

In [28]:
# Check the range of the timestamp column
min_timestamp = df_main['timestamp'].min()
max_timestamp = df_main['timestamp'].max()

# Get the last 25 values of the timestamp range
last_25_timestamps = df_main['timestamp'].sort_values().tail(180)
last_25_timestamps

17260   2020-03-26 05:00:00+00:00
17261   2020-03-26 05:30:00+00:00
17262   2020-03-26 06:00:00+00:00
17263   2020-03-26 06:30:00+00:00
17264   2020-03-26 07:00:00+00:00
17265   2020-03-26 07:30:00+00:00
17266   2020-03-26 08:00:00+00:00
17267   2020-03-26 08:30:00+00:00
17268   2020-03-26 09:00:00+00:00
17269   2020-03-26 09:30:00+00:00
17270   2020-03-26 10:00:00+00:00
17271   2020-03-26 10:30:00+00:00
17272   2020-03-26 11:00:00+00:00
17273   2020-03-26 11:30:00+00:00
17274   2020-03-26 12:00:00+00:00
17275   2020-03-26 12:30:00+00:00
17276   2020-03-26 13:00:00+00:00
17277   2020-03-26 13:30:00+00:00
17278   2020-03-26 14:00:00+00:00
17279   2020-03-26 14:30:00+00:00
17280   2020-03-26 15:00:00+00:00
17281   2020-03-26 15:30:00+00:00
17282   2020-03-26 16:00:00+00:00
17283   2020-03-26 16:30:00+00:00
17284   2020-03-26 17:00:00+00:00
17285   2020-03-26 17:30:00+00:00
17286   2020-03-26 18:00:00+00:00
17287   2020-03-26 18:30:00+00:00
17288   2020-03-26 19:00:00+00:00
17289   2020-0

In [29]:
# Create and insert rows to account for missing values of half hourly data betwen 2020-03-28 23:30:00+00:00 and 2020-03-29 02:00:00+00:00
# Create a new dataframe with the missing timestamps
missing_timestamps = pd.date_range(start='2020-03-29 00:00:00+00:00', end='2020-03-29 01:30:00+00:00', freq='30min')
missing_timestamps = pd.DataFrame(missing_timestamps, columns=['timestamp'])
missing_timestamps

#insert the missing timestamps into the df_main
df_main = pd.concat([df_main, missing_timestamps], axis=0)
df_main

Unnamed: 0,timestamp,net_load,month,day,hour,day_of_week,season,avg_net_load,weekend_or_bank_holiday,solar_radiation,sunshine_duration
0,2019-04-01 00:00:00+00:00,9.11000,4.0,1.0,1.0,0.0,2.0,156.18,0.0,0.0,0.0
1,2019-04-01 00:30:00+00:00,12.11000,4.0,1.0,1.0,0.0,2.0,209.12,0.0,0.0,0.0
2,2019-04-01 01:00:00+00:00,10.06000,4.0,1.0,2.0,0.0,2.0,219.51,0.0,0.0,0.0
3,2019-04-01 01:30:00+00:00,12.49000,4.0,1.0,2.0,0.0,2.0,214.43,0.0,0.0,0.0
4,2019-04-01 02:00:00+00:00,10.31000,4.0,1.0,3.0,0.0,2.0,235.79,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
17439,2020-03-29 22:30:00+00:00,5.43077,3.0,29.0,23.0,6.0,2.0,179.15,1.0,0.0,0.0
0,2020-03-29 00:00:00+00:00,,,,,,,,,,
1,2020-03-29 00:30:00+00:00,,,,,,,,,,
2,2020-03-29 01:00:00+00:00,,,,,,,,,,


## Create raws for missing data 
### (between 2020-03-28 23:30:00+00:00 and 2020-03-29 02:00:00+00:00, exclusive)

In [30]:
#min_date = df_main['timestamp'].min()
#max_date = df_main['timestamp'].max()

#date_range = pd.date_range(start=min_date, end=max_date, freq='30min')

# new df main inserting all the timestamps in date range
#df_main = df_main.set_index('timestamp').reindex(date_range).rename_axis('timestamp').reset_index()