In [1]:
import pandas as pd
import numpy as np

# Load historical generation data (training target)
gen_df = pd.read_csv(r"C:\Users\tomas\Downloads\repsol_dataset\Generacion_fotovoltaica.csv", parse_dates=['FECHA'])
gen_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'GEN_KWH'}, inplace=True)

# Load actual solar energy used (for underutilization)
actual_used_df = pd.read_csv(r"C:\Users\tomas\Downloads\repsol_dataset\Consumo_fotovoltaica.csv", parse_dates=['FECHA'])
actual_used_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'ACTUAL_USED_KWH'}, inplace=True)

# Load meteorological data
df_raw = pd.read_excel(r"C:\Users\tomas\Downloads\repsol_dataset\Meteorologia.xlsx", header=None)

# Extract column names from the first row
columns = df_raw.iloc[0, 0].split(',')

# Split the rest of the data (starting from second row) by comma
meteo_df = df_raw.iloc[1:, 0].str.split(',', expand=True)

# Assign column names
meteo_df.columns = columns

meteo_df['FORECAST_TIMESTAMP'] = pd.to_datetime(meteo_df['FORECAST_TIMESTAMP'])

meteo_df.rename(columns={'FORECAST_TIMESTAMP': 'Datetime'}, inplace=True)

# Convert weather timestamps from UTC to Europe/Madrid
meteo_df['Datetime'] = meteo_df['Datetime'].dt.tz_convert('Europe/Madrid').dt.tz_localize(None)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\tomas\\Downloads\\repsol_dataset\\Generacion_fotovoltaica.csv'

In [49]:
# Select key features
weather_features = [
    'dswrfsurface_0',   # Downward shortwave radiation
    'SUNSDsurface_0',   # Sunshine duration
    'tccatmosphere_0',  # Total cloud cover
    '2theightAboveGround_2',  # Air temp at 2m
]

# Merge generation with weather on timestamp
df = pd.merge(gen_df, meteo_df[['Datetime'] + weather_features], on='Datetime', how='inner')

# Add hour and day features
df['hour'] = df['Datetime'].dt.hour
df['dayofweek'] = df['Datetime'].dt.dayofweek

In [69]:
df = df.fillna(0)

In [71]:
for col in ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [73]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

X = df[weather_features + ['hour', 'dayofweek']]
y = df['GEN_KWH']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(n_estimators=200, learning_rate=0.05)
model.fit(X_train, y_train)

# Evaluate
val_preds = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000974 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 6
[LightGBM] [Info] Start training from score 22.413871
Validation MAE: 8.658541151123117


In [77]:
# Filter weather data for September 2024
sept_2024 = meteo_df[(meteo_df['Datetime'] >= '2024-09-01') & (meteo_df['Datetime'] < '2024-10-01')].copy()
sept_2024['hour'] = sept_2024['Datetime'].dt.hour
sept_2024['dayofweek'] = sept_2024['Datetime'].dt.dayofweek

bad_cols = ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2']
for col in bad_cols:
    sept_2024[col] = pd.to_numeric(sept_2024[col], errors='coerce')

# Predict
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
sept_2024['KWH_ENERGIA'] = model.predict(X_pred)

In [82]:
# Keep only the first occurrence of each datetime
sept_2024 = sept_2024.drop_duplicates(subset='Datetime', keep='first')

In [84]:
sept_2024

Unnamed: 0,Datetime,LATITUDE,LONGITUDE,10uheightAboveGround_10,10vheightAboveGround_10,2rheightAboveGround_2,2shheightAboveGround_2,2theightAboveGround_2,SUNSDsurface_0,aptmpheightAboveGround_2,...,tmaxheightAboveGround_2,tminheightAboveGround_2,tozneatmosphereSingleLayer_0,tpsurface_0,tsurface_0,uheightAboveGround_80,vheightAboveGround_80,hour,dayofweek,KWH_ENERGIA
9708,2024-09-01 00:00:00,40.0,-4.0,-1.0089697265625,1.171298828125,52.1,0.011780882792663577,299.468311,2700.0,299.4450927734375,...,301.890185546875,299.43662109375003,300.92003173828124,0.0,297.92784423828124,-1.18162841796875,1.659375,0,6,-0.244288
9709,2024-09-01 01:00:00,40.0,-4.0,-1.816669921875,2.3122412109375,52.7,0.011742399094238283,299.168311,2700.0,299.1817260742188,...,301.84506835937503,299.15354003906253,300.1502197265625,0.0,297.53529052734376,-2.3431689453125,3.52731201171875,1,6,-0.263050
9710,2024-09-01 02:00:00,40.0,-4.0,-2.87714599609375,0.2129248046875,57.300000000000004,0.011819082577209475,297.886133,2700.0,297.9121826171875,...,301.890185546875,297.86337890625003,298.9931274414063,0.1875,296.33662109375,-4.1669970703125,1.9096948242187501,2,6,-0.263050
9711,2024-09-01 03:00:00,40.0,-4.0,-1.166337890625,-1.19864990234375,68.9,0.012476466995849611,295.812207,0.0,295.85858154296875,...,296.548828125,295.7705322265625,299.73134765625,0.375,295.90000000000003,-2.3645703125,-1.1485986328125,3,6,-0.241299
9712,2024-09-01 04:00:00,40.0,-4.0,-1.01611572265625,-1.3288940429687501,73.10000000000001,0.012696543623352053,295.068311,0.0,295.07318115234375,...,296.580517578125,295.1021240234375,298.6601196289063,0.8125,295.15097656250003,-2.289638671875,-1.4480957031250001,4,6,-0.241299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10423,2024-09-30 19:00:00,40.0,-4.0,-0.5104150390625,-0.12672607421875,18.8,0.00436906375946045,299.956201,18000.0,299.07314453125,...,300.3341552734375,298.156201171875,294.69312744140626,0.0,299.80258789062503,-0.505068359375,-0.03859375,19,0,36.533308
10424,2024-09-30 20:00:00,40.0,-4.0,-0.9595703125,-0.594892578125,23.0,0.004384414951171876,296.645093,21300.0,296.658447265625,...,300.3341552734375,296.6450927734375,294.35650634765625,0.0,293.5,-0.96138671875,-0.5853564453125,20,0,13.310006
10425,2024-09-30 21:00:00,40.0,-4.0,-0.7605859375,-0.28977294921875,25.200000000000003,0.004515163479156495,295.768311,0.0,295.7519897460938,...,296.6450927734375,295.768310546875,294.21693115234376,0.0,292.4490234375,-0.7918408203125,-0.34413818359375,21,0,0.142421
10426,2024-09-30 22:00:00,40.0,-4.0,-0.642294921875,0.638994140625,26.0,0.004515556107788087,295.199976,0.0,295.24302978515624,...,296.6450927734375,295.1999755859375,295.337451171875,0.0,291.80751953125,-0.659873046875,0.7074365234375,22,0,-0.134348


In [98]:
# Save prediction file
sept_2024[['Datetime', 'KWH_ENERGIA']].to_excel("Objective1_Solar_Prediction.xlsx", index=False)