# Prepare data and use deep learning model for PV production forecast
### - Fetch historical PV production data from an API endpoint
### - Fetch historical weather data and the weather forecast from the Open-Meteo API
### - Make predictions for a future time interval
### - Evaluate model accuracy
-----------------------------------



#### 1. Import or Install the needed libraries


In [75]:
import os
import logging
from datetime import datetime, timedelta
import pandas as pd
import pandas as pd_f

import matplotlib.pyplot as plt

import numpy as np
import requests
import requests_cache
from retry_requests import retry
import plotly.graph_objects as go

import torch
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import openmeteo_requests

import plotly.offline as pyo

import plotly.io as pio

# Build the Open-Meteo client on top of an HTTP session that caches responses
# (cache name '.cache', entries expire after 3600 seconds) and retries failed
# requests up to 5 times with a backoff factor of 0.2.
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


#### 2. Fetch the PV Production historical data from the API endpoint - http://209.38.208.230:8000/api/pvmeasurementdata/


In [76]:
# Load the data
start_date = '2024-12-01'
# Default end date: the day before the notebook is run
today = datetime.now().date() - timedelta(days=1)
end_date = today.strftime('%Y-%m-%d')
# custom end date (overrides the default computed above)
end_date = '2025-03-10'

ppe = '590310600031289575'

url = f'http://209.38.208.230:8000/api/pvmeasurementdata/?start_date={start_date}&end_date={end_date}&ppe={ppe}'

# Get the data from the API.
# The timeout keeps the notebook from hanging forever on an unresponsive
# server, and raise_for_status() turns an HTTP error into an exception here
# instead of a confusing failure further down while parsing the payload.
response = requests.get(url=url, timeout=60)
response.raise_for_status()


# Create a dataframe from the response
df_dam = pd.DataFrame(response.json())
if df_dam.empty:
    raise ValueError(
        f"The PV measurement API returned no rows for ppe={ppe} "
        f"between {start_date} and {end_date}."
    )


# Clean the DataFrame
# Placeholder strings used for missing readings become NaN so that the column
# can be cast to float.
df_dam['production'] = df_dam['production'].replace(['-', 'n/e', 'N/A', 'NaN'], np.nan)
df_dam['production'] = df_dam['production'].astype(float)
# Parse as UTC, convert to Europe/Warsaw, then drop the zone so the timestamps
# are naive local times (unparseable values become NaT via errors='coerce').
df_dam['timestamp'] = pd.to_datetime(df_dam['timestamp'], errors='coerce', utc=True)
df_dam['timestamp'] = df_dam['timestamp'].dt.tz_convert('Europe/Warsaw')
df_dam['timestamp'] = df_dam['timestamp'].dt.tz_localize(None)


# check for nans in the dataframe (all columns returned by the API)
if df_dam.isnull().values.any():
    print("Warning: NaNs detected in the DataFrame. Please fill or drop them.")

# keep only the timestamp, the production and the site coordinates
# (latitude/longitude are needed later to request the weather data)
df_dam = df_dam[['timestamp', 'production', 'latitude', 'longitude']]





In [77]:
# Sanity check: show the last five rows of the cleaned production data
print(df_dam.tail(5))

               timestamp  production latitude longitude
9500 2025-03-09 23:00:00         0.0  52.8961   18.3745
9501 2025-03-09 23:15:00         0.0  52.8961   18.3745
9502 2025-03-09 23:30:00         0.0  52.8961   18.3745
9503 2025-03-09 23:45:00         0.0  52.8961   18.3745
9504 2025-03-10 00:00:00         NaN   0.0000    0.0000


#### 3. Make some visualisations of the historical data

In [78]:
# # Some initial data visualisations

# start_week_number_analyzer = 1
# end_week_number_analyzer = 52

# df_dam_initial_chart = df_dam.copy()

# df_dam_initial_chart['WeekNumber'] = df_dam_initial_chart['timestamp'].dt.isocalendar().week

# # # Filter the data for the specified week range
# df_weeks = df_dam_initial_chart[(df_dam_initial_chart['WeekNumber'] >= start_week_number_analyzer) & (df_dam_initial_chart['WeekNumber'] <= end_week_number_analyzer)]

# # # Extract the date and hour for heatmap plotting
# df_weeks['Date'] = df_weeks['timestamp'].dt.date
# df_weeks['Hour'] = df_weeks['timestamp'].dt.hour

# # Aggregate the prices by taking the average for each combination of Hour and Date
# aggregated_data = df_weeks.groupby(['Hour', 'Date', 'WeekNumber'])['production'].mean().reset_index()

# # Pivot the data to create a matrix suitable for heatmap
# heatmap_data = aggregated_data.pivot(index='Hour', columns='Date', values='production')


# # Create a heatmap
# fig = go.Figure(data=go.Heatmap(
#     z=heatmap_data.values,
#     x=heatmap_data.columns,
#     y=heatmap_data.index,
#     colorscale='Viridis'
# ))

# # Update layout for better readability
# fig.update_layout(
#     title='DAM Market Price Heatmap',
#     xaxis_title='Date',
#     yaxis_title='Hour of Day',
#     yaxis_nticks=24,
#     xaxis_nticks=len(heatmap_data.columns),
#     height=600
# )

# # Show the heatmap plot
# fig.show()

# # Create a line plot of hourly price over time
# plt.figure(figsize=(10, 6))
# plt.plot(df_weeks['timestamp'], df_weeks['production'], linestyle=':', linewidth=1)
# plt.xlabel('Date and Hour')
# plt.ylabel('production')
# plt.title('Hourly production Over Time')
# plt.xticks(rotation=45)
# plt.grid(True)
# plt.tight_layout()
# plt.show()

#### 4. Fetch the historical weather data from the Open-Meteo API - https://archive-api.open-meteo.com/v1/archive
#### The weather data includes: "temperature_2m", "cloud_cover", "cloud_cover_low", "wind_speed_10m", "direct_radiation", "diffuse_radiation", "global_tilted_irradiance", "is_day"
#### The data is also cleaned and resampled to 15-minute intervals

In [79]:
# Fetch the weather data
# # INCLUDE THE WEATHER FORECAST

def fetch_weather_data(start, end, url_weather="https://archive-api.open-meteo.com/v1/archive"):
    """Fetch hourly weather variables and return them on a 15-minute grid.

    The coordinates are read from the first row of the module-level
    ``df_dam`` DataFrame and the request is sent through the module-level
    ``openmeteo`` client.

    Args:
        start: First day of the requested period, passed as ``start_date``
            to the API (the callers in this notebook use 'YYYY-MM-DD').
        end: Last day of the requested period, passed as ``end_date``.
        url_weather: API endpoint to query. Defaults to the archive
            endpoint; the forecast endpoint can be passed instead.

    Returns:
        A DataFrame with a timezone-naive ``date`` column and one column per
        requested weather variable. Each hourly value is forward-filled onto
        the 15-minute steps that follow it.
    """
    # Single source of truth for the requested variables: the response
    # exposes them by position, in the order they were requested, so the
    # same list drives both the request and the parsing.
    hourly_variables = [
        "temperature_2m",
        "cloud_cover",
        "cloud_cover_low",
        "wind_speed_10m",
        "direct_radiation",
        "diffuse_radiation",
        "global_tilted_irradiance",
        "is_day",
    ]

    lat = float(df_dam['latitude'].iloc[0])
    long = float(df_dam['longitude'].iloc[0])

    params = {
        "latitude": lat,
        "longitude": long,
        "start_date": start,
        "end_date": end,
        "hourly": hourly_variables,
        "tilt": 30,
    }

    responses = openmeteo.weather_api(url_weather, params=params)
    response_weather = responses[0]
    hourly = response_weather.Hourly()

    # Rebuild the hourly time axis from the start/end/interval reported by
    # the response; the end bound is excluded.
    hourly_data = {"date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )}
    for position, variable_name in enumerate(hourly_variables):
        hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data=hourly_data)

    # Set index to datetime
    hourly_dataframe["date"] = pd.to_datetime(hourly_dataframe["date"])
    hourly_dataframe.set_index("date", inplace=True)

    # Upsample to 15-minute steps by repeating the last hourly value
    # (forward fill, not interpolation). "15min" replaces the deprecated
    # "15T" alias, which triggered a deprecation warning.
    resampled_df = hourly_dataframe.resample("15min").ffill()

    # Reset index to have 'date' as a column again
    resampled_df.reset_index(inplace=True)

    # NOTE(review): the time axis is built as UTC and the zone is dropped
    # here without conversion, whereas df_dam timestamps are converted to
    # Europe/Warsaw before their zone is dropped. Confirm that merging the
    # two on naive timestamps is aligned as intended.
    resampled_df["date"] = resampled_df["date"].dt.tz_localize(None)

    return resampled_df



#### 5. Merge and preprocess the historical PV measurement data as well as the Weather data 

In [80]:
# Join the production measurements with the historical weather data

print(start_date, end_date)

resampled_df = fetch_weather_data(start_date, end_date)

# Inner-join on the 15-minute timestamps, then in one pass:
#   - drop the duplicated join key and the coordinate columns,
#   - drop every row that still contains a missing value,
#   - discard the last four remaining rows.
combined_weather_and_df_dam = (
    pd.merge(df_dam, resampled_df, how='inner', left_on='timestamp', right_on='date')
    .drop(columns=['date', 'latitude', 'longitude'])
    .dropna()
    .iloc[:-4]
)

print(combined_weather_and_df_dam.tail())

2024-12-01 2025-03-10
               timestamp  production  temperature_2m  cloud_cover  \
9403 2025-03-08 22:45:00         0.0          5.4285          0.0   
9404 2025-03-08 23:00:00         0.0          4.7285          0.0   
9405 2025-03-08 23:15:00         0.0          4.7285          0.0   
9406 2025-03-08 23:30:00         0.0          4.7285          0.0   
9407 2025-03-08 23:45:00         0.0          4.7285          0.0   

      cloud_cover_low  wind_speed_10m  direct_radiation  diffuse_radiation  \
9403              0.0       10.837435               0.0                0.0   
9404              0.0        9.743305               0.0                0.0   
9405              0.0        9.743305               0.0                0.0   
9406              0.0        9.743305               0.0                0.0   
9407              0.0        9.743305               0.0                0.0   

      global_tilted_irradiance  is_day  
9403                       0.0     0.0  
9404        


'T' is deprecated and will be removed in a future version, please use 'min' instead.



#### 6. Prepare the future covariates for the length of the prediction interval - this includes the weather forecast with all weather parameters for the length of the future prediction 

In [81]:
# Build the known covariates for the forecast horizon from the weather forecast
# Fetch the /historical/ forecast data

# The covariate window opens one day before end_date and closes two days
# after it (i.e. three days after the window opens).
start_date_val = (datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
end_date_val = (datetime.strptime(end_date, '%Y-%m-%d') + timedelta(days=2)).strftime('%Y-%m-%d')
print(start_date_val, end_date_val)

# end_date_val = end_date_val.strftime('%Y-%m-%d')

# start_date_val = (datetime.strptime(end_date, '%Y-%m-%d') + timedelta(days=5)).strftime('%Y-%m-%d')
# end_date_val = (datetime.strptime(start_date_val, '%Y-%m-%d') + timedelta(days=1)).strftime('%Y-%m-%d')


# Query the forecast endpoint, rename the time column to the name used for
# the timestamp index, tag the single series and keep the first 288 rows.
forecast_df = (
    fetch_weather_data(start_date_val, end_date_val, url_weather="https://api.open-meteo.com/v1/forecast")
    .rename(columns={'date': 'timestamp'})
    .assign(item_id="series_1")
    .iloc[:288]
)

future_covariates = TimeSeriesDataFrame.from_data_frame(
    forecast_df,
    id_column="item_id",
    timestamp_column="timestamp",
)

known_covariates = [
    "temperature_2m",
    "cloud_cover",
    "cloud_cover_low",
    "wind_speed_10m",
    "direct_radiation",
    "diffuse_radiation",
    "global_tilted_irradiance",
    "is_day",
]


print(len(future_covariates))

print(future_covariates.head())

2025-03-09 2025-03-12
288
                              temperature_2m  cloud_cover  cloud_cover_low  \
item_id  timestamp                                                           
series_1 2025-03-09 00:00:00          3.4675          0.0              0.0   
         2025-03-09 00:15:00          3.4675          0.0              0.0   
         2025-03-09 00:30:00          3.4675          0.0              0.0   
         2025-03-09 00:45:00          3.4675          0.0              0.0   
         2025-03-09 01:00:00          2.9175          0.0              0.0   

                              wind_speed_10m  direct_radiation  \
item_id  timestamp                                               
series_1 2025-03-09 00:00:00        6.162207               0.0   
         2025-03-09 00:15:00        6.162207               0.0   
         2025-03-09 00:30:00        6.162207               0.0   
         2025-03-09 00:45:00        6.162207               0.0   
         2025-03-09 01:00:00   


'T' is deprecated and will be removed in a future version, please use 'min' instead.



#### 7. Prepare the data needed for the AutoGluon ML framework. Set the model - "DeepAR" as well as some hyperparameters and prediction length

In [82]:
# Prepare the training frame for AutoGluon and load the saved predictor
from lightning.pytorch.callbacks import EarlyStopping

target_column = 'production'

# Load a predictor that was saved earlier instead of fitting a new one
model_path = "AutogluonModels/ag-20250310_110235/"
predictor = TimeSeriesPredictor.load(model_path)

# All rows belong to one series, so they share a single item id
combined_weather_and_df_dam["item_id"] = "series_1"

# Convert DataFrame to TimeSeriesDataFrame
train_data = TimeSeriesDataFrame.from_data_frame(
    combined_weather_and_df_dam,
    id_column="item_id",
    timestamp_column="timestamp",
)


# #Initialize the predictor
# predictor = TimeSeriesPredictor(
#     target=target_column,    
#     prediction_length=288,
#     freq='15min',
#     known_covariates_names=known_covariates,
#     #path=model_save_path  # Set the path here
# )

# #Fit the predictor with cross-validation
# results = predictor.fit(
#     train_data=train_data,    
#     time_limit=1200,  # 20 min
#     #presets='fast_training',    
#     hyperparameters={
#                             "DeepAR": {
#                                 # You can specify DeepAR-specific hyperparameters here
#                                 # For example:
#                                 "context_length": 396,
#                                 "num_layers": 3,
#                                 "hidden_size": 288,
#                                 "dropout_rate": 0.2,
#                                 "learning_rate": 1e-3,
#                                 "epochs": 100,  # Added epochs parameter
#                                 "callbacks": [EarlyStopping(monitor="val_loss", patience=20, mode="min")]
#                             }
#     }                 
# )

Loading predictor from path c:\Users\Georgi\test_gluon\AutogluonModels\ag-20250310_110235


In [83]:
# Prepare the new covariates

# start_date_val = '2025-02-07'
# end_date_val = '2025-02-08'



# forecast_df = fetch_weather_data(start_date_val, end_date_val, url_weather = "https://api.open-meteo.com/v1/forecast")

# # Rename Date to timestamp
# forecast_df.rename(columns={'date': 'timestamp'}, inplace=True)

# forecast_df["item_id"] = "series_1"

# forecast_df = forecast_df.iloc[:96]

# updated_train = combined_weather_and_df_dam.tail(96)


# # offset the timestamp of updaited_train with + timedelta=(days=1)

# updated_train['timestamp'] = updated_train['timestamp'] + timedelta(days=1)

# forecast_df['timestamp'] = updated_train['timestamp'].values



# future_covariates = TimeSeriesDataFrame.from_data_frame(
#     forecast_df,
#     id_column="item_id",
#     timestamp_column="timestamp"
# )

# known_covariates = ["temperature_2m", "cloud_cover", "cloud_cover_low", "wind_speed_10m", "direct_radiation", "diffuse_radiation", "global_tilted_irradiance", "is_day"]

# print(len(future_covariates))


In [84]:
# Run the forecast using the training history and the future covariates
print(len(future_covariates))
predictions = predictor.predict(data=train_data, known_covariates=future_covariates)
# Replace every negative forecast value (in any column) with zero, in place
predictions.mask(predictions < 0, 0, inplace=True)
print(predictions)

288


Model not specified in predict, will default to the model with the best validation score: DeepAR


                                  mean  0.1  0.2  0.3  0.4       0.5  \
item_id  timestamp                                                     
series_1 2025-03-09 00:00:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-09 00:15:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-09 00:30:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-09 00:45:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-09 01:00:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
...                                ...  ...  ...  ...  ...       ...   
         2025-03-11 22:45:00  0.117117  0.0  0.0  0.0  0.0  0.028025   
         2025-03-11 23:00:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-11 23:15:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-11 23:30:00  0.000000  0.0  0.0  0.0  0.0  0.000000   
         2025-03-11 23:45:00  0.000000  0.0  0.0  0.0  0.0  0.000000   

                                   0.6       0.7       0.8     

In [85]:
# Evaluate the model with known data of production for the predicted period:

# start_date_val = end_date
# end_date_val = datetime.strptime(end_date, '%Y-%m-%d') + timedelta(days=5)
# end_date_val = end_date_val.strftime('%Y-%m-%d') 


# url_val = f'http://209.38.208.230:8000/api/pvmeasurementdata/?start_date={start_date_val}&end_date={end_date_val}&ppe={ppe}'
# response_val = requests.get(url=url_val)
# # Create a dataframe from the response
# df_validation = pd.DataFrame(response_val.json())

# df_validation['timestamp'] = pd.to_datetime(df_validation['timestamp'], errors='coerce', utc=True)

# df_validation['timestamp'] = df_validation['timestamp'].dt.tz_convert('Europe/Warsaw')

# df_validation['timestamp'] = df_validation['timestamp'].dt.tz_localize(None)  # Remove timezone

# # prepare the prediction df
# df_pred = predictions.reset_index()


# plt.figure(figsize=(12, 6))

# # Plot predicted mean
# plt.plot(df_pred["timestamp"], df_pred["mean"], label="Predicted", linestyle='--', color="red", alpha=0.7)

# plt.plot(df_validation["timestamp"], df_validation["production"], 
#              label="Actual", linestyle='-', color="green", alpha=0.7)

# # Optionally plot prediction intervals
# plt.xlabel("Timestamp")
# plt.ylabel("Production")
# plt.title("Actual vs. Predicted")
# plt.legend()
# plt.grid()
# plt.show()


In [None]:
# Generate ZUSE File
#
# Writes one row of forecast values into a copy of the ZUSE Excel template:
# the ppe in column 1, the date in column 3 and the quarter-hour mean
# forecasts from column 4 onwards, all on row 11.

import pandas as pd
from openpyxl import load_workbook


# Load your DataFrame (assuming you already have it)
# forecast_data = pd.read_csv('your_data.csv')  # Uncomment if you need to load the data

# Load the existing Excel workbook
workbook = load_workbook('ZUSE_template.xlsx')

# Select the active sheet
sheet = workbook.active

df_pred = predictions.reset_index()

# Keep rows 96..191 of the forecast (96 quarter-hour steps). The timestamp
# column is kept alongside 'mean' so the night mask below can be computed
# from the time of day; selecting only 'mean' first left an integer index
# with no time information and made the masking step fail.
forecast_data = df_pred.iloc[96:192].copy()

# mean is always zero between timestamp 18:30 and 07:30 (both inclusive)
minute_of_day = forecast_data['timestamp'].dt.hour * 60 + forecast_data['timestamp'].dt.minute
is_night = (minute_of_day >= 18 * 60 + 30) | (minute_of_day <= 7 * 60 + 30)
forecast_data.loc[is_night, 'mean'] = 0

data_to_write = forecast_data['mean'].tolist()

# NOTE(review): the date written to the file is end_date_val minus one day,
# while the values come from rows 96..191 of the forecast. In the outputs
# captured in this notebook (first forecast timestamp 2025-03-09,
# end_date_val 2025-03-12) those are 2025-03-11 and 2025-03-10 respectively.
# Confirm which day the file is meant to describe.
date_to_zuse = datetime.strptime(end_date_val, '%Y-%m-%d') - timedelta(days=1)

# write the date to the excel file on row 11, column 3 and the ppe on column 1
sheet.cell(row=11, column=3, value=date_to_zuse)
sheet.cell(row=11, column=1, value=ppe)
# Write the data horizontally starting from row 11, column 4
for col, value in enumerate(data_to_write, start=4):
    sheet.cell(row=11, column=col, value=value)

# Save the workbook
workbook.save(f'ZUSE_{ppe}.xlsx')

In [87]:
# Show the forecast timestamps next to the predicted mean production
print(df_pred.loc[:, ['timestamp', 'mean']])


              timestamp      mean
0   2025-03-09 00:00:00  0.000000
1   2025-03-09 00:15:00  0.000000
2   2025-03-09 00:30:00  0.000000
3   2025-03-09 00:45:00  0.000000
4   2025-03-09 01:00:00  0.000000
..                  ...       ...
283 2025-03-11 22:45:00  0.117117
284 2025-03-11 23:00:00  0.000000
285 2025-03-11 23:15:00  0.000000
286 2025-03-11 23:30:00  0.000000
287 2025-03-11 23:45:00  0.000000

[288 rows x 2 columns]
