In [1]:
import pandas as pd
import numpy as np

# Load historical generation data (training target)
gen_df = pd.read_csv(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Generacion_fotovoltaica.csv", parse_dates=['FECHA'])
gen_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'GEN_KWH'}, inplace=True)

# Load actual solar energy used (for underutilization)
actual_used_df = pd.read_csv(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Consumo_fotovoltaica.csv", parse_dates=['FECHA'])
actual_used_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'ACTUAL_USED_KWH'}, inplace=True)

# Load meteorological data
df_raw = pd.read_excel(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Meteorologia.xlsx", header=None)

# Extract column names from the first row
columns = df_raw.iloc[0, 0].split(',')

# Split the rest of the data (starting from second row) by comma
meteo_df = df_raw.iloc[1:, 0].str.split(',', expand=True)

# Assign column names
meteo_df.columns = columns

meteo_df['FORECAST_TIMESTAMP'] = pd.to_datetime(meteo_df['FORECAST_TIMESTAMP'])

meteo_df.rename(columns={'FORECAST_TIMESTAMP': 'Datetime'}, inplace=True)

# Convert weather timestamps from UTC to Europe/Madrid
meteo_df['Datetime'] = meteo_df['Datetime'].dt.tz_convert('Europe/Madrid').dt.tz_localize(None)

In [2]:
# Select key features
weather_features = [
    'dswrfsurface_0',   # Downward shortwave radiation
    'SUNSDsurface_0',   # Sunshine duration
    'tccatmosphere_0',  # Total cloud cover
    '2theightAboveGround_2',  # Air temp at 2m
]

# Merge generation with weather on timestamp
df = pd.merge(gen_df, meteo_df[['Datetime'] + weather_features], on='Datetime', how='inner')

# Add hour and day features
df['hour'] = df['Datetime'].dt.hour
df['dayofweek'] = df['Datetime'].dt.dayofweek

In [3]:
df = df.fillna(0)

In [4]:
for col in ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [5]:
! pip install lightgbm 
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

X = df[weather_features + ['hour', 'dayofweek']]
y = df['GEN_KWH']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(n_estimators=200, learning_rate=0.05)
model.fit(X_train, y_train)

# Evaluate
val_preds = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 6
[LightGBM] [Info] Start training from score 22.413871
Validation MAE: 8.658541151123117


In [6]:
# Filter weather data for September 2024
sept_2024 = meteo_df[(meteo_df['Datetime'] >= '2024-09-01') & (meteo_df['Datetime'] < '2024-10-01')].copy()
sept_2024['hour'] = sept_2024['Datetime'].dt.hour
sept_2024['dayofweek'] = sept_2024['Datetime'].dt.dayofweek

bad_cols = ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2']
for col in bad_cols:
    sept_2024[col] = pd.to_numeric(sept_2024[col], errors='coerce')

# Predict
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
sept_2024['KWH_ENERGIA'] = model.predict(X_pred)

In [7]:
# Keep only the first occurrence of each datetime
sept_2024 = sept_2024.drop_duplicates(subset='Datetime', keep='first')

In [8]:
sept_2024

Unnamed: 0,Datetime,LATITUDE,LONGITUDE,10uheightAboveGround_10,10vheightAboveGround_10,2rheightAboveGround_2,2shheightAboveGround_2,2theightAboveGround_2,SUNSDsurface_0,aptmpheightAboveGround_2,...,tmaxheightAboveGround_2,tminheightAboveGround_2,tozneatmosphereSingleLayer_0,tpsurface_0,tsurface_0,uheightAboveGround_80,vheightAboveGround_80,hour,dayofweek,KWH_ENERGIA
9708,2024-09-01 00:00:00,40.0,-4.0,-1.0089697265625,1.171298828125,52.1,0.011780882792663577,299.468311,2700.0,299.4450927734375,...,301.890185546875,299.43662109375003,300.92003173828124,0.0,297.92784423828124,-1.18162841796875,1.659375,0,6,-0.244288
9709,2024-09-01 01:00:00,40.0,-4.0,-1.816669921875,2.3122412109375,52.7,0.011742399094238283,299.168311,2700.0,299.1817260742188,...,301.84506835937503,299.15354003906253,300.1502197265625,0.0,297.53529052734376,-2.3431689453125,3.52731201171875,1,6,-0.263050
9710,2024-09-01 02:00:00,40.0,-4.0,-2.87714599609375,0.2129248046875,57.300000000000004,0.011819082577209475,297.886133,2700.0,297.9121826171875,...,301.890185546875,297.86337890625003,298.9931274414063,0.1875,296.33662109375,-4.1669970703125,1.9096948242187501,2,6,-0.263050
9711,2024-09-01 03:00:00,40.0,-4.0,-1.166337890625,-1.19864990234375,68.9,0.012476466995849611,295.812207,0.0,295.85858154296875,...,296.548828125,295.7705322265625,299.73134765625,0.375,295.90000000000003,-2.3645703125,-1.1485986328125,3,6,-0.241299
9712,2024-09-01 04:00:00,40.0,-4.0,-1.01611572265625,-1.3288940429687501,73.10000000000001,0.012696543623352053,295.068311,0.0,295.07318115234375,...,296.580517578125,295.1021240234375,298.6601196289063,0.8125,295.15097656250003,-2.289638671875,-1.4480957031250001,4,6,-0.241299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10423,2024-09-30 19:00:00,40.0,-4.0,-0.5104150390625,-0.12672607421875,18.8,0.00436906375946045,299.956201,18000.0,299.07314453125,...,300.3341552734375,298.156201171875,294.69312744140626,0.0,299.80258789062503,-0.505068359375,-0.03859375,19,0,36.533308
10424,2024-09-30 20:00:00,40.0,-4.0,-0.9595703125,-0.594892578125,23.0,0.004384414951171876,296.645093,21300.0,296.658447265625,...,300.3341552734375,296.6450927734375,294.35650634765625,0.0,293.5,-0.96138671875,-0.5853564453125,20,0,13.310006
10425,2024-09-30 21:00:00,40.0,-4.0,-0.7605859375,-0.28977294921875,25.200000000000003,0.004515163479156495,295.768311,0.0,295.7519897460938,...,296.6450927734375,295.768310546875,294.21693115234376,0.0,292.4490234375,-0.7918408203125,-0.34413818359375,21,0,0.142421
10426,2024-09-30 22:00:00,40.0,-4.0,-0.642294921875,0.638994140625,26.0,0.004515556107788087,295.199976,0.0,295.24302978515624,...,296.6450927734375,295.1999755859375,295.337451171875,0.0,291.80751953125,-0.659873046875,0.7074365234375,22,0,-0.134348


In [None]:
# Save prediction file
sept_2024[['Datetime', 'KWH_ENERGIA']].to_excel("Objective1_Solar_Prediction.xlsx", index=False)

Grid search 

In [9]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer, mean_absolute_error


In [10]:
model = LGBMRegressor(n_estimators=200, learning_rate=0.05)
model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 6
[LightGBM] [Info] Start training from score 22.413871


In [11]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20],
}

# Define model and scoring
lgbm = LGBMRegressor(random_state=42)
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Grid search with 3-fold cross-validation
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Fit grid search
grid_search.fit(X_train, y_train)

# Print best result
print("Best parameters:", grid_search.best_params_)
print("Best CV MAE (negative):", grid_search.best_score_)

# Use best estimator for final fit
model = grid_search.best_estimator_


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 6
[LightGBM] [Info] Start training from score 22.413871
Best parameters: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 100}
Best CV MAE (negative): -6.447795005699529


In [12]:
# Evaluate
val_preds = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))


Validation MAE: 5.854544342852751


In [15]:
sept_2024.to_csv('Predicted_Generation_Sept2024.csv', index=False)


Second Gridsearch  (not good)

In [16]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# Define custom MAE scorer (positive values for easier interpretation)
def positive_mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

mae_scorer = make_scorer(positive_mae, greater_is_better=False)

# Expanded and refined parameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.03, 0.05],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [10, 20],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.7, 0.9],
}

# Set up grid search
model = LGBMRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=mae_scorer,
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit
grid_search.fit(X_train, y_train)

# Evaluate best model
print("Best Parameters:", grid_search.best_params_)
print("Best CV MAE:", abs(grid_search.best_score_))

# Use best model on validation set
best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))


Fitting 5 folds for each of 648 candidates, totalling 3240 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1040
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 6
[LightGBM] [Info] Start training from score 22.413871
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 10, 'n_estimators': 500, 'num_leaves': 70, 'subsample': 0.7}
Best CV MAE: 6.841222407636141
Validation MAE: 6.638008077770587


In [19]:
# Filter weather data for September 2024
sept_2024 = meteo_df[(meteo_df['Datetime'] >= '2024-09-01') & (meteo_df['Datetime'] < '2024-10-01')].copy()

# Add time-based features
sept_2024['hour'] = sept_2024['Datetime'].dt.hour
sept_2024['dayofweek'] = sept_2024['Datetime'].dt.dayofweek

# Convert weather columns to numeric (if not done already)
for col in weather_features:
    sept_2024[col] = pd.to_numeric(sept_2024[col], errors='coerce')


In [20]:
# Ensure unique timestamps before prediction
sept_2024 = sept_2024.drop_duplicates(subset='Datetime', keep='first')


In [21]:
# Build prediction input
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]

# Predict with best model
sept_2024['KWH_ENERGIA'] = best_model.predict(X_pred)


In [22]:
# Save full DataFrame with predictions
sept_2024.to_csv('Predicted_Generation_Sept2024_final.csv', index=False)


Stacked Ensemble

In [23]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor


In [24]:
base_models = [
    ('lgbm', LGBMRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42, verbosity=0)),
    ('rf', RandomForestRegressor(random_state=42))
]


In [25]:
meta_model = Ridge()


In [26]:
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)


In [27]:
stacked_model.fit(X_train, y_train)
val_preds = stacked_model.predict(X_val)

from sklearn.metrics import mean_absolute_error
print("Stacked Model MAE:", mean_absolute_error(y_val, val_preds))


Stacked Model MAE: 4.424385024267067


In [28]:
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]  # Add more features if used
sept_2024['KWH_ENERGIA'] = stacked_model.predict(X_pred)


In [29]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Sept2024.csv', index=False)


Neural networks

In [30]:
! pip install tensorflow






In [31]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler

# Normalize inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)  # Regression output
])

# Compile model
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Train
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    verbose=1,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 16.1373 - mae: 16.1373 - val_loss: 11.6772 - val_mae: 11.6772
Epoch 2/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 12.3403 - mae: 12.3403 - val_loss: 10.8955 - val_mae: 10.8955
Epoch 3/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 11.6124 - mae: 11.6124 - val_loss: 10.3202 - val_mae: 10.3202
Epoch 4/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 11.1427 - mae: 11.1427 - val_loss: 9.7531 - val_mae: 9.7531
Epoch 5/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 10.7253 - mae: 10.7253 - val_loss: 9.4592 - val_mae: 9.4592
Epoch 6/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 10.4894 - mae: 10.4894 - val_loss: 9.1709 - val_mae: 9.1709
Epoch 7/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━

In [32]:
val_preds = model.predict(X_val_scaled).flatten()
from sklearn.metrics import mean_absolute_error
print("Neural Network MAE:", mean_absolute_error(y_val, val_preds))


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Neural Network MAE: 8.75945765583381


In [33]:
# Normalize prediction data
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
X_pred_scaled = scaler.transform(X_pred)

# Predict
sept_2024['KWH_ENERGIA'] = model.predict(X_pred_scaled).flatten()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


In [35]:
#sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Generation_NN_Sept2024.csv', index=False)



More complex neural

In [38]:
# Check for any NaNs post-scaling
import numpy as np
print("NaNs in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaNs in X_val_scaled:", np.isnan(X_val_scaled).sum())
print("NaNs in y_train:", np.isnan(y_train).sum())
print("NaNs in y_val:", np.isnan(y_val).sum())


NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 1
NaNs in y_train: 0
NaNs in y_val: 0


In [39]:
# Before scaling
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

# Then re-scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [40]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, LeakyReLU, Add
from sklearn.preprocessing import StandardScaler

# Normalize inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Input layer
input_layer = Input(shape=(X_train.shape[1],))

# Block 1
x = Dense(256)(input_layer)
x = BatchNormalization()(x)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)

# Block 2
x_res = Dense(256)(x)
x_res = BatchNormalization()(x_res)
x_res = LeakyReLU()(x_res)
x_res = Dropout(0.3)(x_res)

# Skip connection
x = Add()([x, x_res])  # residual link

# Block 3
x = Dense(128)(x)
x = BatchNormalization()(x)
x = LeakyReLU()(x)
x = Dropout(0.3)(x)

# Output
output = Dense(1)(x)

# Define model
model = Model(inputs=input_layer, outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mae', metrics=['mae'])

# Train
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=200,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)],
    verbose=1
)


Epoch 1/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - loss: 17.5367 - mae: 17.5367 - val_loss: 11.4332 - val_mae: 11.4332
Epoch 2/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 12.9080 - mae: 12.9080 - val_loss: 10.8993 - val_mae: 10.8993
Epoch 3/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 12.5130 - mae: 12.5130 - val_loss: 10.7037 - val_mae: 10.7037
Epoch 4/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 12.2576 - mae: 12.2576 - val_loss: 10.7633 - val_mae: 10.7633
Epoch 5/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 12.2200 - mae: 12.2200 - val_loss: 10.3198 - val_mae: 10.3198
Epoch 6/200
[1m1093/1093[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 11.9117 - mae: 11.9117 - val_loss: 10.0325 - val_mae: 10.0325
Epoch 7/200
[1m1093/1093[0m [32m━━━━━━━━━━━

In [41]:
val_preds = model.predict(X_val_scaled).flatten()
print("Advanced NN MAE:", mean_absolute_error(y_val, val_preds))


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Advanced NN MAE: 9.057459408646396


In [42]:
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
X_pred_scaled = scaler.transform(X_pred)
sept_2024['KWH_ENERGIA'] = model.predict(X_pred_scaled).flatten()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [43]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Generation_NN_Advanced_last_Sept2024.csv', index=False)


