In [77]:
import pandas as pd
import numpy as np

# Load historical generation data (training target)
gen_df = pd.read_csv(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Generacion_fotovoltaica.csv", parse_dates=['FECHA'])
gen_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'GEN_KWH'}, inplace=True)

# Load actual solar energy used (for underutilization)
actual_used_df = pd.read_csv(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Consumo_fotovoltaica.csv", parse_dates=['FECHA'])
actual_used_df.rename(columns={'FECHA': 'Datetime', 'TOTAL_KWH_ENERGIA': 'ACTUAL_USED_KWH'}, inplace=True)

# Load meteorological data
df_raw = pd.read_excel(r"C:\Users\waldo\OneDrive\Documentos\GitHub\IE_Datathon\Meteorologia.xlsx", header=None)

# Extract column names from the first row
columns = df_raw.iloc[0, 0].split(',')

# Split the rest of the data (starting from second row) by comma
meteo_df = df_raw.iloc[1:, 0].str.split(',', expand=True)

# Assign column names
meteo_df.columns = columns

meteo_df['FORECAST_TIMESTAMP'] = pd.to_datetime(meteo_df['FORECAST_TIMESTAMP'])

meteo_df.rename(columns={'FORECAST_TIMESTAMP': 'Datetime'}, inplace=True)

# Convert weather timestamps from UTC to Europe/Madrid
meteo_df['Datetime'] = meteo_df['Datetime'].dt.tz_convert('Europe/Madrid').dt.tz_localize(None)

In [78]:
# Select key features
weather_features = [
    'dswrfsurface_0',   # Downward shortwave radiation
    'SUNSDsurface_0',   # Sunshine duration
    'tccatmosphere_0',  # Total cloud cover
    '2theightAboveGround_2',
    '2rheightAboveGround_2',
    'dlwrfsurface_0',    # Air temp at 2m
]

# Merge generation with weather on timestamp
df = pd.merge(gen_df, meteo_df[['Datetime'] + weather_features], on='Datetime', how='inner')

# Add hour and day features
df['hour'] = df['Datetime'].dt.hour
df['dayofweek'] = df['Datetime'].dt.dayofweek

In [79]:
df.isna().sum()

Datetime                     0
GEN_KWH                  10467
dswrfsurface_0               0
SUNSDsurface_0               0
tccatmosphere_0              0
2theightAboveGround_2        0
2rheightAboveGround_2        0
dlwrfsurface_0               0
hour                         0
dayofweek                    0
dtype: int64

In [80]:
df[df['GEN_KWH'].isna()]['hour'].value_counts().sort_index()


0     414
1     414
2     414
3     414
4     414
5     414
6     414
7     441
8     468
9     504
10    450
11    450
12    459
13    459
14    459
15    459
16    450
17    432
18    423
19    423
20    423
21    423
22    423
23    423
Name: hour, dtype: int64

In [81]:
import numpy as np

# Step 1: Fill night-time NaNs (hours 0-5 and 19-23) with 0
df.loc[df['hour'].isin([0, 1, 2, 3, 4, 5, 19, 20, 21, 22, 23]) & df['GEN_KWH'].isna(), 'GEN_KWH'] = 0

# Step 2: Fill day-time NaNs (hours 6-18) with that hour's mean
for h in range(6, 19):  # 6 to 18 inclusive
    hour_mask = (df['hour'] == h)
    hour_mean = df.loc[hour_mask & df['GEN_KWH'].notna(), 'GEN_KWH'].mean()
    df.loc[hour_mask & df['GEN_KWH'].isna(), 'GEN_KWH'] = hour_mean


In [82]:
print("Remaining NaNs:", df['GEN_KWH'].isna().sum())


Remaining NaNs: 0


In [83]:
for col in ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2', '2rheightAboveGround_2','dlwrfsurface_0']:
    df[col] = pd.to_numeric(df[col], errors='coerce')



In [84]:
! pip install lightgbm 
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

X = df[weather_features + ['hour', 'dayofweek']]
y = df['GEN_KWH']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(n_estimators=200, learning_rate=0.05)
model.fit(X_train, y_train)

# Evaluate
val_preds = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_preds))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 8
[LightGBM] [Info] Start training from score 25.358757




Validation MAE: 6.743245735395628


In [85]:
# Filter weather data for September 2024
sept_2024 = meteo_df[(meteo_df['Datetime'] >= '2024-09-01') & (meteo_df['Datetime'] < '2024-10-01')].copy()
sept_2024['hour'] = sept_2024['Datetime'].dt.hour
sept_2024['dayofweek'] = sept_2024['Datetime'].dt.dayofweek

bad_cols = ['dswrfsurface_0', 'SUNSDsurface_0', 'tccatmosphere_0', '2theightAboveGround_2', '2rheightAboveGround_2','dlwrfsurface_0']
for col in bad_cols:
    sept_2024[col] = pd.to_numeric(sept_2024[col], errors='coerce')

# Predict
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
sept_2024['KWH_ENERGIA'] = model.predict(X_pred)

In [86]:
# Keep only the first occurrence of each datetime
sept_2024 = sept_2024.drop_duplicates(subset='Datetime', keep='first')

BEST ONE SO FAR 

In [89]:
# Save prediction file

sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv("ETL_Solar_Prediction.csv", index=False)

Stacked ensemble

In [15]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [16]:
base_models = [
    ('lgbm', LGBMRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42, verbosity=0)),
    ('rf', RandomForestRegressor(random_state=42))
]

In [17]:
meta_model = Ridge()

In [18]:
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1
)

In [19]:
stacked_model.fit(X_train, y_train)
val_preds = stacked_model.predict(X_val)

from sklearn.metrics import mean_absolute_error
print("Stacked Model MAE:", mean_absolute_error(y_val, val_preds))

Stacked Model MAE: 3.6316174871149958


In [20]:
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]  # Add more features if used
sept_2024['KWH_ENERGIA'] = stacked_model.predict(X_pred)

In [21]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('ETL2_Predicted_Sept2024.csv', index=False)

Neural Networks

In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler

# Normalize inputs
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Define model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1)  # Regression output
])

# Compile model
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Train
history = model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    verbose=1,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)]
)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 15.6154 - mae: 15.6154 - val_loss: 9.7567 - val_mae: 9.7567
Epoch 2/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 10.6303 - mae: 10.6303 - val_loss: 8.9386 - val_mae: 8.9386
Epoch 3/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 10.0532 - mae: 10.0532 - val_loss: 8.4487 - val_mae: 8.4487
Epoch 4/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 9.5064 - mae: 9.5064 - val_loss: 8.5029 - val_mae: 8.5029
Epoch 5/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 9.3254 - mae: 9.3254 - val_loss: 8.2649 - val_mae: 8.2649
Epoch 6/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 9.0156 - mae: 9.0156 - val_loss: 7.8407 - val_mae: 7.8407
Epoch 7/100
[1m2185/2185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s

In [23]:
val_preds = model.predict(X_val_scaled).flatten()
from sklearn.metrics import mean_absolute_error
print("Neural Network MAE:", mean_absolute_error(y_val, val_preds))


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 925us/step
Neural Network MAE: 7.164529995835415


In [24]:
# Normalize prediction data
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
X_pred_scaled = scaler.transform(X_pred)

# Predict
sept_2024['KWH_ENERGIA'] = model.predict(X_pred_scaled).flatten()

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [25]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('ETL3_Generation_NN_Sept2024.csv', index=False)

Grid search neural network   11.0232

In [38]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from itertools import product
import pandas as pd
import numpy as np

In [39]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [40]:
param_grid = {
    'units_1': [64, 128],
    'units_2': [32, 64],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.01],
    'batch_size': [32],
    'epochs': [50]
}

In [41]:
results = []

for u1, u2, dr, lr, bs, ep in product(*param_grid.values()):
    print(f"Training: u1={u1}, u2={u2}, dropout={dr}, lr={lr}, bs={bs}")

    # Build model
    model = Sequential([
        Dense(u1, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(dr),
        Dense(u2, activation='relu'),
        Dropout(dr),
        Dense(1)
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss='mae', metrics=['mae'])

    # Train
    model.fit(X_train_scaled, y_train,
              validation_data=(X_val_scaled, y_val),
              epochs=ep,
              batch_size=bs,
              verbose=0)

    # Evaluate
    val_preds = model.predict(X_val_scaled).flatten()
    mae = mean_absolute_error(y_val, val_preds)

    print(f"MAE: {mae:.3f}")

    # Store result
    results.append({
        'units_1': u1,
        'units_2': u2,
        'dropout': dr,
        'learning_rate': lr,
        'batch_size': bs,
        'epochs': ep,
        'val_mae': mae,
        'model': model
    })

Training: u1=64, u2=32, dropout=0.2, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.457
Training: u1=64, u2=32, dropout=0.2, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.937
Training: u1=64, u2=32, dropout=0.3, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.445
Training: u1=64, u2=32, dropout=0.3, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
MAE: 7.159
Training: u1=64, u2=64, dropout=0.2, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.993
Training: u1=64, u2=64, dropout=0.2, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.810
Training: u1=64, u2=64, dropout=0.3, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.289
Training: u1=64, u2=64, dropout=0.3, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.741
Training: u1=128, u2=32, dropout=0.2, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
MAE: 6.857
Training: u1=128, u2=32, dropout=0.2, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.656
Training: u1=128, u2=32, dropout=0.3, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.162
Training: u1=128, u2=32, dropout=0.3, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 7.172
Training: u1=128, u2=64, dropout=0.2, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.592
Training: u1=128, u2=64, dropout=0.2, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.795
Training: u1=128, u2=64, dropout=0.3, lr=0.001, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step  
MAE: 6.876
Training: u1=128, u2=64, dropout=0.3, lr=0.01, bs=32


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
MAE: 6.928


In [42]:
results_df = pd.DataFrame(results).sort_values('val_mae')
best_model = results_df.iloc[0]['model']
print("✅ Best hyperparameters:")
print(results_df.iloc[0])

✅ Best hyperparameters:
units_1                                                  128
units_2                                                   64
dropout                                                  0.2
learning_rate                                          0.001
batch_size                                                32
epochs                                                    50
val_mae                                             6.592138
model            <Sequential name=sequential_12, built=True>
Name: 12, dtype: object


In [43]:
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]
X_pred_scaled = scaler.transform(X_pred)
sept_2024['KWH_ENERGIA'] = best_model.predict(X_pred_scaled).flatten()


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [44]:
# Save predictions to CSV
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Generation_grid_NN_Sept2024.csv', index=False)


KNN shit 

In [53]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Select features
features = weather_features + ['hour', 'dayofweek']
target = 'GEN_KWH'

# Scale data (important for KNN!)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[features])
X_val_scaled = scaler.transform(X_val[features])


In [54]:
# Check for any NaNs post-scaling
import numpy as np
print("NaNs in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaNs in X_val_scaled:", np.isnan(X_val_scaled).sum())
print("NaNs in y_train:", np.isnan(y_train).sum())
print("NaNs in y_val:", np.isnan(y_val).sum())


NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 0
NaNs in y_train: 0
NaNs in y_val: 0


In [55]:
# Before scaling
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

# Then re-scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [56]:
# Try different values of k (neighbors)
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train_scaled, y_train)

# Predict on validation set
val_preds = knn.predict(X_val_scaled)

# Evaluate
print("KNN Validation MAE:", mean_absolute_error(y_val, val_preds))


KNN Validation MAE: 2.7949449067027636


In [57]:
X_pred = sept_2024[features]
X_pred_scaled = scaler.transform(X_pred)

sept_2024['KWH_ENERGIA'] = knn.predict(X_pred_scaled)


In [58]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Generation_KNN2_Sept2024.csv', index=False)


In [52]:
for k in [3, 5, 7, 10]:
    model = KNeighborsRegressor(n_neighbors=k)
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_val_scaled)
    print(f"k={k}, MAE={mean_absolute_error(y_val, preds):.3f}")


k=3, MAE=2.795
k=5, MAE=3.588
k=7, MAE=4.381
k=10, MAE=5.273


hyper parameters lightbm 

In [59]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor

param_dist = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.03, 0.05],
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.9, 1.0],
}

model = LGBMRegressor(random_state=42)

search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    scoring='neg_mean_absolute_error',
    n_iter=20,  # fewer = faster
    cv=3,
    verbose=2,
    n_jobs=-1
)

search.fit(X_train, y_train)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 69890, number of used features: 8
[LightGBM] [Info] Start training from score 25.358757


In [60]:
print("Best Parameters:", search.best_params_)
print("Best Cross-Validated MAE:", -search.best_score_)


Best Parameters: {'subsample': 1.0, 'num_leaves': 70, 'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Best Cross-Validated MAE: 5.091122228132636


In [61]:
# Use the best model found
best_model = search.best_estimator_

# Predict on validation set
val_preds = best_model.predict(X_val)
from sklearn.metrics import mean_absolute_error
print("Validation MAE:", mean_absolute_error(y_val, val_preds))


Validation MAE: 4.845127283976257


In [62]:
X_pred = sept_2024[weather_features + ['hour', 'dayofweek']]  # Make sure same features!
sept_2024['KWH_ENERGIA'] = best_model.predict(X_pred)


In [63]:
sept_2024[['Datetime', 'KWH_ENERGIA']].to_csv('Predicted_Generation_LGBM_Tuned_Sept2024.csv', index=False)




LAST TRY 