In [2]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn.model_selection import cross_validate, learning_curve, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


In [3]:
import sys
import os

# Get the current working directory
current_dir = os.getcwd()

# Assuming your notebook is in the 'feature_engin' directory,
# and you want to add 'main' to the path
main_dir = os.path.dirname(current_dir)
sys.path.append(main_dir)

from feature_engin.main_feature_engin import merge_all_datasets

raw_data = merge_all_datasets()


Loaded germany_electricity_generation_2018-2023.csv successfully.
Loaded weather_north_hourly.csv successfully.
Loaded weather_south_hourly.csv successfully.
Loaded weather_brocken_hourly.csv successfully.
Loaded holidays.csv successfully.
Loaded PMI_germany.csv successfully.
Loaded coal_price.csv successfully.
Loaded ttf_price.csv successfully.
Loaded oil_price.csv successfully.
The DataFrame has 207264 rows before deleting NaN rows.
The DataFrame has 207164 rows AFTER deleting NaN rows.
Dataset saved to: fianal_ML_data.csv
Loaded germany_electricity_generation_2018-2023.csv successfully.
Loaded weather_north_hourly.csv successfully.
Loaded weather_south_hourly.csv successfully.
Loaded weather_brocken_hourly.csv successfully.
Loaded holidays.csv successfully.
Loaded PMI_germany.csv successfully.
Loaded coal_price.csv successfully.
Loaded ttf_price.csv successfully.
Loaded oil_price.csv successfully.
The DataFrame has 207264 rows before deleting NaN rows.
The DataFrame has 207164 rows 

In [4]:
data = raw_data.copy()


In [5]:
# Calculate the correlation matrix
corr_matrix = data.corr()

# check cross-correlation with target
corr_matrix["day_ahead_price"].abs().sort_values()


geothermal                0.009678
solar                     0.023719
solarenergy_brocken       0.037248
solarradiation_brocken    0.037518
holiday                   0.039644
temp_brocken              0.040015
fractional_hour           0.041269
others                    0.042110
temp_north                0.044154
temp_south                0.047608
solarradiation_south      0.049336
solarenergy_south         0.049683
actual_pmi                0.053675
load                      0.067852
waste                     0.086675
hydro_reservoir           0.103315
day_of_week               0.106187
windspeed_south           0.115083
oil                       0.126654
windspeed_north           0.130555
week_of_year              0.133946
wind_offshore             0.138390
month                     0.141184
ttf_volume                0.151949
hydro                     0.152398
lignite                   0.173607
hydro_storage_out         0.180897
windspeed_brocken         0.181456
nat_gas             

In [6]:
# Calculate the absolute correlation with the target variable
target_corr = corr_matrix["day_ahead_price"].abs()

# Filter out features with correlation less than 0.1
selected_features = target_corr[target_corr >= 0.1].index.tolist()


# Create a new dataset with the selected features
filtered_data = data[selected_features]


In [7]:
filtered_data.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207164 entries, 2018-01-02 00:00:00+00:00 to 2023-11-29 10:45:00+00:00
Data columns (total 30 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   day_of_week           207164 non-null  float64
 1   week_of_year          207164 non-null  float64
 2   month                 207164 non-null  float64
 3   year                  207164 non-null  float64
 4   hydro_storage_in      207164 non-null  float64
 5   cross_border          207164 non-null  float64
 6   nuclear               207164 non-null  float64
 7   hydro                 207164 non-null  float64
 8   biomass               207164 non-null  float64
 9   lignite               207164 non-null  float64
 10  hard_coal             207164 non-null  float64
 11  oil                   207164 non-null  float64
 12  coal_gas              207164 non-null  float64
 13  nat_gas               207164 non-null  float64
 14  hydro_

In [8]:
# Select 50% of the data randomly
subset = filtered_data.sample(frac=1)

# Prepare your feature set and target variable
X = subset.drop(columns=['day_ahead_price'])
y = subset['day_ahead_price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Instantiate model
model = LinearRegression()

# 5-fold cross-validate model
cv_results = cross_validate(model, X, y, cv=10)

# Scores
cv_results['test_score']

# Mean of scores
cv_score = cv_results['test_score'].mean()

cv_score


0.757863727886234

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Assuming X and y are your features and target variable

# Define a neural network regressor
nn_regressor = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)

# Create a pipeline with a scaler and the neural network
pipeline = Pipeline([
    ('regressor', nn_regressor)
])

# Train the neural network using the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
y_train_pred = pipeline.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"Training Mean Squared Error: {mse_train}")
print(f"Training R-squared: {r2_train}")

# Save the model to a file
joblib.dump(pipeline, 'nn_model.pkl')

# Learning curve
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='r2'
)

# Plot the learning curve
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
validation_scores_std = np.std(validation_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")

plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std, validation_scores_mean + validation_scores_std, alpha=0.1, color="g")

plt.title("Learning Curve (Neural Network)")
plt.xlabel("Training Set Size")
plt.ylabel("R-squared Score")
plt.legend(loc="best")
plt.grid()
plt.show()


Training Mean Squared Error: 110.95249898995584
Training R-squared: 0.9887592463369509




In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Assuming X and y are your scaled features and target variable

# Define a support vector regressor
svr_regressor = SVR(kernel='rbf', C=1.0, epsilon=0.1)

# Create a pipeline with a scaler and the support vector regressor
pipeline = Pipeline([
    ('regressor', svr_regressor)
])

# Train the support vector regressor using the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model
y_train_pred = pipeline.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

print(f"Training Mean Squared Error: {mse_train}")
print(f"Training R-squared: {r2_train}")

# Save the model to a file
joblib.dump(pipeline, 'svr_model.pkl')

# Learning curve
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='r2'
)

# Plot the learning curve
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
validation_scores_std = np.std(validation_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g", label="Cross-validation score")

plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, validation_scores_mean - validation_scores_std, validation_scores_mean + validation_scores_std, alpha=0.1, color="g")

plt.title("Learning Curve (SVR)")
plt.xlabel("Training Set Size")
plt.ylabel("R-squared Score")
plt.legend(loc="best")
plt.grid()
plt.show()


Training Mean Squared Error: 1863.9637548026353
Training R-squared: 0.8099512245751916


KeyboardInterrupt: 