In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [24]:
file_path = 'Data assignment 1/Feature data.csv'
data = pd.read_csv(file_path)
data2=pd.read_csv("/workspaces/46765-ml-for-energy-systems/assignment1/Data process/Forcasted Weather data new.csv")

In [25]:
#Dropping the features that are not required for the analysis
data2=data2.drop('Accumulated percipitation', axis=1)
data2=data2.drop('Mean humidity', axis=1)
data2=data2.drop('Wind Speed Y Direction', axis=1)
data2=data2.drop('Wind Speed X Direction', axis=1)
data2=data2.drop('Mean temperature', axis=1)
data2=data2.drop('Minimum temperature', axis=1)
data2=data2.drop('Solar Shortwave Flux', axis=1)
data2=data2.drop('Unnamed: 0',axis=1)
data2.head()

Unnamed: 0,ts,Maximum temperature,Mean wind direction,Mean wind speed
0,2022-01-01 00:00:00,279.84314,235.57857,5.813893
1,2022-01-01 01:00:00,279.84314,235.57857,5.813893
2,2022-01-01 02:00:00,279.84314,235.57857,5.813893
3,2022-01-01 03:00:00,279.84314,235.57857,5.813893
4,2022-01-01 04:00:00,279.84314,235.57857,5.813893


## Step 2
### Feature scaling

In this step the different features are scaled, in order to make sure the models can interpret the features on a similar scale. The wind speed and the maximum temperature undergo the standardscaler, while wind direction is converted to sinus and cosinus components. The power production is normalized using the nominal capacity of 30 MW (https://stateofgreen.com/en/solutions/kalby-wind-turbines/). 
 

In [26]:
# Import required scalers
scaler_standard = StandardScaler()

### 1. Standard Scaling for wind speed and temperature
data['Mean wind speed'] = scaler_standard.fit_transform(data[['Mean wind speed']])
data['Maximum temperature'] = scaler_standard.fit_transform(data[['Maximum temperature']])

### 2. Wind Direction (convert to sin and cos components)
data['Wind direction sin'] = np.sin(np.deg2rad(data['Mean wind direction']))
data['Wind direction cos'] = np.cos(np.deg2rad(data['Mean wind direction']))

### 3. Normalize Power Production 
nominal_capacity = 30000 # production capacity is 30 MW, unit of power production is kW so nominal capacity is 30000 (kW)
data['AKI Kalby Active Power'] = data['AKI Kalby Active Power'] / nominal_capacity

# Dropping the original wind direction after scaling
data = data.drop('Mean wind direction', axis=1)

In [27]:
### 1. Standard Scaling for wind speed and temperature for the Forecasted Weather Data
data2['Mean wind speed'] = scaler_standard.fit_transform(data2[['Mean wind speed']])
data2['Maximum temperature'] = scaler_standard.fit_transform(data2[['Maximum temperature']])

### 2. Wind Direction (convert to sin and cos components) for the Forecasted Weather Data
data2['Wind direction sin'] = np.sin(np.deg2rad(data2['Mean wind direction']))
data2['Wind direction cos'] = np.cos(np.deg2rad(data2['Mean wind direction']))


# Dropping the original wind direction after transformation for the Forecasted Weather Data
data2 = data2.drop('Mean wind direction', axis=1)

In [28]:
# Make sure datetime is set as the index
data['datetime'] = pd.to_datetime(data['datetime'])
data.set_index('datetime', inplace=True)

In [29]:
# datetime is set as the index for the Forecasted Weather Data
data2['ts'] = pd.to_datetime(data2['ts'])
data2.set_index('ts', inplace=True)

In [30]:
# Set target and features, and remove non-numeric columns
target_column = 'AKI Kalby Active Power'
features = data.select_dtypes(include=[np.number]).drop(columns=[target_column])

In [31]:
# Load the dataframe to check if scaling worked
data.head()

Unnamed: 0_level_0,AKI Kalby Active Power,Maximum temperature,Mean wind speed,Wind direction sin,Wind direction cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01 00:00:00,-0.063118,-0.457945,0.868655,-0.99863,-0.05233596
2022-01-01 01:00:00,-0.055728,-0.457945,0.382418,-0.956305,-0.2923717
2022-01-01 02:00:00,-0.095724,-0.503187,0.756447,-0.994522,-0.1045285
2022-01-01 03:00:00,-0.063726,-0.518268,0.494627,-1.0,-1.83697e-16
2022-01-01 04:00:00,-0.029392,-0.473025,0.307612,-0.951057,0.309017


### Constructing testing and training 
Using train_test_split the data is split into testing and training data. The choice was made to not use TimeSeriesSplit, because although the data is time based, the values are not dependent on the time of day in the sense that there is no strong temporal relationship that affects the observations. The power production is more weather dependent than anything else. 
 The data points can be treated independently of their time indices, allowing for a standard random sampling approach. By maintaining a randomized split, we also prevent potential biases that could arise from time-based sequences, ensuring that both the training and testing set represent the overall distribution of the dataset.

In [32]:
# Split the data
X = data.drop(columns=[target_column])
y = data[target_column]

# Select a 100 datapoints as a start with a low number of samples
X_sample = features[:100]
y_sample = data[target_column][:100]
X_forecast_sample = data2[:20]

# Sequential split (shuffle=False)
# Give the random_state a set seed of 42, to ensure that the split will be the same everytime in order to reproduce results
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(
    X_sample, y_sample, test_size=0.2, shuffle=False, random_state=42)

# Adding a column of ones to X_sample for the bias term and converting to NumPy array
X_sample_train_with_bias = np.c_[np.ones(X_sample_train.shape[0]), X_sample_train].astype(float)
X_sample_test_with_bias = np.c_[np.ones(X_sample_test.shape[0]), X_sample_test].astype(float)
X_forecast_sample_with_bias=np.c_[np.ones(X_forecast_sample.shape[0]), X_forecast_sample].astype(float)

# Ensure y_sample_train is also a NumPy array
y_sample_train = np.array(y_sample_train).astype(float)

In [33]:
data2_with_bias=np.c_[np.ones(data2.shape[0]), data2].astype(float)

### Step 3 Linear regression

In [34]:
# Gradient Descent function
def gradient_descent(X, y, learning_rate=0.01, epochs=100000):
    m, n = X.shape
    theta = np.zeros(n)
    for _ in range(epochs):
        y_pred = X @ theta
        gradients = (1/m) * X.T @ (y_pred - y)
        theta -= learning_rate * gradients
    return theta

theta_gd = gradient_descent(X_sample_train_with_bias, y_sample_train)

# Predictions using Gradient Descent
#y_pred_gd = X_sample_test_with_bias @ theta_gd

In [35]:
# Predictions for Forecasted Data using Gradient Descent
y_pred_gd = X_forecast_sample_with_bias @ theta_gd
y_pred_gd_forecast=data2_with_bias @ theta_gd

In [36]:
# Closed-form solution 
theta_closed_form = np.linalg.inv(X_sample_train_with_bias.T @ X_sample_train_with_bias) @ X_sample_train_with_bias.T @ y_sample_train

# Predictions using closed-form solution
y_pred_closed_form = X_sample_test_with_bias @ theta_closed_form

In [37]:
# mse calculation
mse_gd = mean_squared_error(y_sample_test, y_pred_gd)
mse_closed_form = mean_squared_error(y_sample_test, y_pred_closed_form)

print(f"Gradient Descent θ: {[f'{x:.5f}' for x in theta_gd]}")
print(f"Closed-Form θ: {[f'{x:.5f}' for x in theta_closed_form]}")
print(f"Gradient Descent MSE: {mse_gd:.5f}")
print(f"Closed-Form MSE: {mse_closed_form:.5f}")

Gradient Descent θ: ['-0.03789', '0.02592', '-0.04119', '0.01152', '0.01116']
Closed-Form θ: ['-0.03789', '0.02592', '-0.04119', '0.01152', '0.01116']
Gradient Descent MSE: 0.00206
Closed-Form MSE: 0.00099


In [38]:
# Step 3.2: Use the full dataset and closed form solution
X_large_sample, X_large_test_sample, y_large_sample, y_large_test_sample = train_test_split(features, data[target_column], test_size=0.2, random_state=42)
X_large_sample_forecast=data2[:1563]
                              
# Adding a column of ones for the bias term in the large sample
X_large_sample_with_bias = np.c_[np.ones(X_large_sample.shape[0]), X_large_sample]
X_large_test_sample_with_bias = np.c_[np.ones(X_large_test_sample.shape[0]), X_large_test_sample]
X_large_sample_forecast_with_bias=np.c_[np.ones(X_large_sample_forecast.shape[0]), X_large_sample_forecast]

# Upgrade the normal equation
theta_large_sample = np.linalg.inv(X_large_sample_with_bias.T @ X_large_sample_with_bias) @ X_large_sample_with_bias.T @ y_large_sample
theta_large_sample_rounded = np.round(theta_large_sample, 5)

print(f"Step 3.2: Closed-form solution training complete on the larger sample.")
print(f"Coefficients: {[f'{x:.5f}' for x in theta_large_sample_rounded]}")

Step 3.2: Closed-form solution training complete on the larger sample.
Coefficients: ['-0.04394', '0.00016', '-0.03909', '0.00591', '0.00366']


In [39]:
#Resetting the index, to obtain the timestamp as a seperate column and renaming it to Datetime
X_large_test_sample.reset_index(inplace=True)
data2_reset=data2.reset_index()
data2_reset['datetime']=data2_reset['ts']

In [40]:
#Merging the old dataset (Weather data) and the new dataset (Forecasted Data)
X_large_test_sample = X_large_test_sample.set_index('datetime')
data2_reset = data2_reset.set_index('datetime')
merged_ds = X_large_test_sample.join(data2_reset, how='left', lsuffix='_left', rsuffix='_right').reset_index()
merged_ds

Unnamed: 0,datetime,Maximum temperature_left,Mean wind speed_left,Wind direction sin_left,Wind direction cos_left,ts,Maximum temperature_right,Mean wind speed_right,Wind direction sin_right,Wind direction cos_right
0,2022-05-20 06:00:00,1.427152,0.569432,-0.909961,-0.414693,2022-05-20 06:00:00,0.963624,-0.130353,-0.890382,0.455215
1,2022-11-15 09:00:00,-0.156329,-0.515250,0.898794,-0.438371,2022-11-15 09:00:00,-0.258735,-0.626248,0.964713,-0.263303
2,2022-05-20 01:00:00,1.819252,0.457224,-0.544639,-0.838671,2022-05-20 01:00:00,1.027204,-0.659087,-0.994775,0.102093
3,2022-12-22 23:00:00,-0.653995,0.906058,-0.891007,-0.453990,2022-12-22 23:00:00,-0.744850,1.032841,-0.929767,-0.368149
4,2022-08-20 20:00:00,1.442233,-1.038890,-0.913545,0.406737,2022-08-20 20:00:00,1.300281,-0.941475,-0.572338,0.820018
...,...,...,...,...,...,...,...,...,...,...
1558,2022-09-03 12:00:00,1.246183,0.307612,0.961262,-0.275637,2022-09-03 12:00:00,1.147728,-0.344476,0.987257,-0.159132
1559,2022-09-12 16:00:00,1.065214,-0.552653,-0.707107,-0.707107,2022-09-12 16:00:00,0.913578,-0.812598,0.249059,-0.968488
1560,2022-05-05 23:00:00,-0.473025,-0.477847,-0.974370,0.224951,2022-05-05 23:00:00,-0.339004,-1.200294,-0.866022,0.500006
1561,2022-04-01 16:00:00,-1.106418,1.654115,0.681998,0.731354,2022-04-01 16:00:00,-1.095397,1.063045,0.743966,0.668218


In [41]:
#Dropping the columns from the old dataset
merged_ds=merged_ds.drop('Maximum temperature_left', axis=1)
merged_ds=merged_ds.drop('Mean wind speed_left', axis=1)
merged_ds=merged_ds.drop('Wind direction sin_left', axis=1)
merged_ds=merged_ds.drop('Wind direction cos_left', axis=1)
merged_ds=merged_ds.drop('datetime', axis=1)
merged_ds=merged_ds.drop('ts', axis=1)
merged_ds

Unnamed: 0,Maximum temperature_right,Mean wind speed_right,Wind direction sin_right,Wind direction cos_right
0,0.963624,-0.130353,-0.890382,0.455215
1,-0.258735,-0.626248,0.964713,-0.263303
2,1.027204,-0.659087,-0.994775,0.102093
3,-0.744850,1.032841,-0.929767,-0.368149
4,1.300281,-0.941475,-0.572338,0.820018
...,...,...,...,...
1558,1.147728,-0.344476,0.987257,-0.159132
1559,0.913578,-0.812598,0.249059,-0.968488
1560,-0.339004,-1.200294,-0.866022,0.500006
1561,-1.095397,1.063045,0.743966,0.668218


In [42]:
merged_ds_with_bias=np.c_[np.ones(merged_ds.shape[0]), merged_ds]

In [43]:
# Step 3.3: Verify your model using the testing dataset and appropriate evaluation metrics
y_large_pred_closed_form = X_large_test_sample_with_bias @ theta_large_sample

mse = mean_squared_error(y_large_test_sample, y_large_pred_closed_form)
mae = mean_absolute_error(y_large_test_sample, y_large_pred_closed_form)
r2 = r2_score(y_large_test_sample, y_large_pred_closed_form)
rmse = np.sqrt(mse)

print(f"Step 3.3: Model evaluation on the testing dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse:.5f}")
print(f"Mean Squared Error (MSE): {mse:.5f}")
print(f"Mean Absolute Error (MAE): {mae:.5f}")
print(f"R-squared: {r2:.5f}")

Step 3.3: Model evaluation on the testing dataset:
Root Mean Squared Error (RMSE): 0.03016
Mean Squared Error (MSE): 0.00091
Mean Absolute Error (MAE): 0.02294
R-squared: 0.64752


In [44]:
# Step 3.3: Verify your model using the testing dataset and appropriate evaluation metrics with the new merged dataset (Forecasted Data)
y_large_pred_closed_form = merged_ds_with_bias @ theta_large_sample
print(y_large_pred_closed_form)

mse = mean_squared_error(y_large_test_sample, y_large_pred_closed_form)
mae = mean_absolute_error(y_large_test_sample, y_large_pred_closed_form)
r2 = r2_score(y_large_test_sample, y_large_pred_closed_form)
rmse = np.sqrt(mse)

print(f"Step 3.3: Model evaluation on the testing dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse:.5f}")
print(f"Mean Squared Error (MSE): {mse:.5f}")
print(f"Mean Absolute Error (MAE): {mae:.5f}")
print(f"R-squared: {r2:.5f}")

[-0.04228653 -0.01476373 -0.02351511 ... -0.00035681 -0.07882923
 -0.02277414]
Step 3.3: Model evaluation on the testing dataset:
Root Mean Squared Error (RMSE): 0.02865
Mean Squared Error (MSE): 0.00082
Mean Absolute Error (MAE): 0.02156
R-squared: 0.68197


### Step 4 Non-linear Regression

In Step 1's formulation, if the price $\lambda$ is treated as a constant and the actual value p is known, the entire formula simplifies into a function dependent on the predicted value $\hat{p}_t$. This implies that the problem can be reframed as an optimization task concerning the prediction of $\hat{p}_t$. Given this perspective, extending the linear regression model from Step 3 by incorporating nonlinear features to predict $\hat{p}_t$ effectively transforms the problem into a nonlinear regression for Step 1's objective. Therefore, performing nonlinear regression on the prediction model of $\hat{p}_t$ inherently satisfies the requirements of the nonlinear extension outlined in Step 4.

In [45]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [46]:
# Define cross-validation function, which returns RMSE
def perform_cross_validation(X, y, degree, n_splits=10):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    
    # Initialize linear regression model
    linear_model = LinearRegression()
    
    # Use KFold for cross-validation, n_splits set to 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Compute RMSE via cross-validation (use negative MSE and then take the square root)
    neg_mse_scores = cross_val_score(linear_model, X_poly, y, cv=kf, scoring='neg_mean_squared_error')
    
    # Convert negative MSE to RMSE
    rmse_scores = np.sqrt(-neg_mse_scores)
    
    # Return the mean RMSE
    return rmse_scores.mean()

In [47]:
# Polynomial degrees to evaluate: 2, 3, and 4
degrees = [2, 3, 4]
best_degree = None
best_rmse = float('inf')

# Perform cross-validation for each polynomial degree
for degree in degrees:
    print(f"Evaluating degree {degree} polynomial: ")
    rmse = perform_cross_validation(features, data[target_column], degree)
    print(f"Mean RMSE for degree {degree}: {rmse:.5f}")
    
    # Update the best model based on RMSE
    if rmse < best_rmse:
        best_rmse = rmse
        best_degree = degree

Evaluating degree 2 polynomial: 
Mean RMSE for degree 2: 0.02963
Evaluating degree 3 polynomial: 
Mean RMSE for degree 3: 0.02762
Evaluating degree 4 polynomial: 
Mean RMSE for degree 4: 0.02725


In [48]:
print(f"Best degree is {best_degree} with RMSE: {best_rmse:.5f}")

Best degree is 4 with RMSE: 0.02725


In [49]:
# Generate the best polynomial features
poly_best = PolynomialFeatures(degree=best_degree, include_bias=False)
X_poly_best = poly_best.fit_transform(features)

# Split the data into training and testing sets
X_train_poly_best, X_test_poly_best, y_train_poly_best, y_test_poly_best = train_test_split(
    X_poly_best, data[target_column], test_size=0.2, random_state=42)

# Train the final polynomial model
linear_model_best = LinearRegression()
linear_model_best.fit(X_train_poly_best, y_train_poly_best)

# Make predictions on the test set
y_pred_poly_best = linear_model_best.predict(X_test_poly_best)

test_rmse_best = np.sqrt(mean_squared_error(y_test_poly_best, y_pred_poly_best))
test_mae_best = mean_absolute_error(y_test_poly_best, y_pred_poly_best)
test_r2_best = r2_score(y_test_poly_best, y_pred_poly_best)

# Evaluation
print(f"Test RMSE for the best degree {best_degree}: {test_rmse_best:.5f}")
print(f"Test MAE for the best degree {best_degree}: {test_mae_best:.5f}")
print(f"Test R-squared for the best degree {best_degree}: {test_r2_best:.5f}")


Test RMSE for the best degree 4: 0.02752
Test MAE for the best degree 4: 0.02016
Test R-squared for the best degree 4: 0.70646


In [50]:
# Extract coefficients
coefficients = linear_model_best.coef_
intercept = linear_model_best.intercept_

print("Coefficients:", coefficients)
print("Intercept:", intercept)

Coefficients: [-1.48867315e-03 -8.08481845e+06 -2.44429275e+08 -1.40023054e+07
  6.65156671e+08  5.25650915e+07 -1.62385513e+06  1.33334526e+07
 -1.03517423e+09 -4.78562273e+07 -1.90636850e+07  6.47965234e+08
  4.26568526e+07 -9.78409626e+08  1.47076905e-03  1.67167458e-03
  2.52060068e-03  4.61231219e-03 -1.66976282e-04 -2.95450361e-03
  1.47063159e-03 -4.42874721e-03  1.96632765e-03  2.94054855e-03
  3.63402627e-03 -3.90598650e-03  3.44096462e-03  8.08481841e+06
 -4.42008514e-04  8.08481841e+06  2.44429275e+08  1.40023054e+07
  2.44429275e+08  1.40023054e+07 -1.31297112e-03  3.87966633e-04
 -1.68728828e-03 -2.32362747e-03 -9.39786434e-04  1.79785490e-03
  3.82718444e-03 -6.65156671e+08  3.02609801e-03 -6.65156671e+08
  3.89933586e-04  1.36432052e-03  1.40170008e-03 -5.25650915e+07
 -5.47507778e-04 -5.25650915e+07  1.62385513e+06 -1.33334526e+07
  1.62385513e+06 -1.33334526e+07 -4.49240208e-04 -1.35421753e-03
 -1.75333023e-03  1.03517423e+09 -4.39435244e-03  1.03517423e+09
  4.7856227

For step 4.2, the method of locally weighted least squares will be used, as tought in the lecture. Different kernels will be compared and the best one will be chosen based on evaluating the performance on the test data.

In [51]:
def gaussian(t):
    return np.exp(-0.5 * t**2) / np.sqrt(2 * np.pi)

def epanechnikov(t):
    res = np.zeros_like(t)
    res[np.abs(t) <= 1] = 0.75 * (1 - t[np.abs(t) <= 1]**2)
    return res

def tricube(t):
    res = np.zeros_like(t)
    res[np.abs(t) <= 1] = (70 / 81) * (1 - np.abs(t[np.abs(t) <= 1])**3)**3
    return res

def uniform(t, p=0.2):
    return np.zeros_like(t) + p

def triangle(t):
    res = np.zeros_like(t)
    res[np.abs(t) <= 1] = 1 - np.abs(t[np.abs(t) <= 1])
    return res

In [52]:
# Locally Weighted Least Squares implementation
def lwls_predict(X_train, y_train, X_test, kernel_func, tau=0.1):
    y_pred = np.zeros(len(X_test))

    for i, x in enumerate(X_test):
        distances = np.linalg.norm(X_train - x, axis=1)  # Compute distances
        weights = kernel_func(distances / tau)  # Apply kernel function
        W = np.diag(weights)  # Create diagonal weight matrix

        # Weighted Least Squares computation
        XTWX = X_train.T @ W @ X_train  # X^T W X
        XTWy = X_train.T @ W @ y_train  # X^T W y

        # Use np.linalg.pinv for numerical stability
        theta = np.linalg.pinv(XTWX) @ XTWy

        # Ensure x is 2D before matrix multiplication
        y_pred[i] = np.dot(x, theta)  # Prediction for the current test sample

    return y_pred

In [53]:
# Function to evaluate different kernels and select the best one
def evaluate_kernels(X_train, y_train, X_test, y_test, kernels, tau=0.1):
    mse_results = {}

    for kernel_name, kernel_func in kernels.items():

        y_pred = lwls_predict(X_train, y_train, X_test, kernel_func, tau=tau)
        mse = mean_squared_error(y_test, y_pred)
        mse_results[kernel_name] = mse

    return mse_results

# Example kernels (ensure these are defined somewhere in the code)
kernels = {
    'Gaussian': gaussian,
    'Epanechnikov': epanechnikov,
    'Tricube': tricube,
    'Uniform': uniform,
    'Triangle': triangle
}

In [54]:
# Evaluate kernels on smaller data for faster results
mse_results_small = evaluate_kernels(X_sample_train_with_bias, y_sample_train, X_sample_test_with_bias, y_sample_test, kernels)
min_kernel, min_mse = min(mse_results_small.items(), key=lambda x: x[1])

# Evaluate kernels on smaller data from the Forecasted Dataset for faster results
#mse_results_small = evaluate_kernels(X_sample_train_with_bias, y_sample_train, X_forecast_sample_with_bias, y_sample_test, kernels)

# Print the full results and the minimum one
print(mse_results_small)
print(f"The kernel with the smallest MSE is '{min_kernel}' with a value of {min_mse}")


{'Gaussian': np.float64(0.017111416935430224), 'Epanechnikov': np.float64(0.015838563409984314), 'Tricube': np.float64(0.015838563409984314), 'Uniform': np.float64(0.0009889360972607495), 'Triangle': np.float64(0.015838563409984314)}
The kernel with the smallest MSE is 'Uniform' with a value of 0.0009889360972607495


In [55]:
# Evaluation on the kernel selected
def evaluate_uniform(X_train, y_train, X_test, y_test, uniform, tau=0.1):
    for kernel_name, kernel_func in uniform.items():
        y_pred = lwls_predict(X_train, y_train, X_test, kernel_func, tau=tau)
        
        mse = mean_squared_error(y_test, y_pred)
        mae_wls_uniform = mean_absolute_error(y_test, y_pred)
        r2_wls_uniform = r2_score(y_test, y_pred)
        rmse_wls_uniform = np.sqrt(mse)
        
        print(f"{kernel_name} Kernel Results:")
        print(f"Mean Squared Error (MSE): {mse}")
        print(f"Mean Absolute Error (MAE): {mae_wls_uniform}")
        print(f"R-squared (R2): {r2_wls_uniform}")
        print(f"Root Mean Squared Error (RMSE): {rmse_wls_uniform}")

    return

uniform_kernel = {
    'Uniform': uniform
}

#evaluate_uniform(X_large_sample_with_bias, y_large_sample, X_large_test_sample_with_bias, y_large_test_sample, uniform_kernel)

#Evaluation of the Kernel using the Forecasted Data
evaluate_uniform(X_large_sample_with_bias, y_large_sample, merged_ds_with_bias, y_large_test_sample, uniform_kernel)

Uniform Kernel Results:
Mean Squared Error (MSE): 0.0008207430221008048
Mean Absolute Error (MAE): 0.021559426662035266
R-squared (R2): 0.6819731852688306
Root Mean Squared Error (RMSE): 0.028648612917570804


### Step 5 Regularization
Ridge and Lasso regression are applied to test if the variance of the dataset can and has to be improved. Applying one of these techniques could make the model more stable and improve the prediction. 
Different alpha values are tested to see which one results in the best results. Both for Lasso and Ridge the goal is to minimize the mean squared error. To find the optimal alpha values GridSearchCV is used, which applies 5-fold cross-validation. 

In [56]:
# Create a list with possible alpha values to iterate over
alpha_values = {'alpha': [0.0001, 0.01, 0.1, 1, 10, 100]}

# Ridge model with cross-validation
ridge_model = Ridge()
# Apply GridSearchCV to search for the optimal alpha
ridge_cv = GridSearchCV(ridge_model, param_grid=alpha_values, cv=5, scoring='neg_mean_squared_error') # cv=5 for 5-fold cross-validation, scoring mean squared error because the goal is to get this as low as possible
ridge_cv.fit(X_large_test_sample,y_large_test_sample)
#ridge_cv.fit(merged_ds,y_large_test_sample)

# Lasso model with cross-validation
lasso_model = Lasso()
# Apply GridSearchCV to search for the optimal alpha
lasso_cv = GridSearchCV(lasso_model, param_grid=alpha_values, cv=5, scoring='neg_mean_squared_error') # cv=5 for 5-fold cross-validation, scoring mean squared error because the goal is to get this as low as possible
lasso_cv.fit(X_large_test_sample,y_large_test_sample)
#lasso_cv.fit(merged_ds,y_large_test_sample)

# Get the best alpha values
best_alpha_ridge = ridge_cv.best_params_['alpha']
best_alpha_lasso = lasso_cv.best_params_['alpha']

print(f"Optimal alpha for Ridge with the full dataset: {best_alpha_ridge}")
print(f"Optimal alpha for Lasso with the full dataset: {best_alpha_lasso}")

Optimal alpha for Ridge with the full dataset: 1
Optimal alpha for Lasso with the full dataset: 0.0001


In [57]:
# Run the lasso model with the optimal alpha
"""lasso_model = Lasso(alpha=best_alpha_lasso)
lasso_model.fit(X_large_test_sample,y_large_test_sample)
# Predict the power production with Lasso regularization
y_pred_lasso = lasso_model.predict(X_large_test_sample)
# Show the new coefficients
lasso_model.coef_"""

'lasso_model = Lasso(alpha=best_alpha_lasso)\nlasso_model.fit(X_large_test_sample,y_large_test_sample)\n# Predict the power production with Lasso regularization\ny_pred_lasso = lasso_model.predict(X_large_test_sample)\n# Show the new coefficients\nlasso_model.coef_'

In [58]:
# Run the lasso model with the optimal alpha for the new Forecasted Dataset
lasso_model = Lasso(alpha=best_alpha_lasso)
lasso_model.fit(merged_ds,y_large_test_sample)
# Predict the power production with Lasso regularization
y_pred_lasso = lasso_model.predict(merged_ds)
# Show the new coefficients
lasso_pred=(data2.reset_index()).drop(columns=['ts']) @ lasso_model.coef_
lasso_pred

0      -0.055967
1      -0.055967
2      -0.055967
3      -0.055967
4      -0.055967
          ...   
8755   -0.063953
8756   -0.041744
8757   -0.032598
8758   -0.035232
8759   -0.024239
Length: 8760, dtype: float64

In [59]:
# Verify the model using the testing dataset and appropriate evaluation metrics
mse_lasso = mean_squared_error(y_large_test_sample, y_pred_lasso)
mae_lasso= mean_absolute_error(y_large_test_sample, y_pred_lasso)
r2_lasso = r2_score(y_large_test_sample, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)

print(f"Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 0.0001:")
print(f"Root Mean Squared Error (RMSE): {rmse_lasso:.4f}")
print(f"Mean Squared Error (MSE): {mse_lasso:.4f}")
print(f"Mean Absolute Error (MAE): {mae_lasso:.4f}")
print(f"R-squared: {r2_lasso:.4f}")

Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 0.0001:
Root Mean Squared Error (RMSE): 0.0283
Mean Squared Error (MSE): 0.0008
Mean Absolute Error (MAE): 0.0216
R-squared: 0.6892


In [60]:
# Run the ridge model with the optimal alpha
ridge_model = Ridge(alpha=best_alpha_ridge)
ridge_model.fit(X_large_test_sample,y_large_test_sample)
# Predict the power production with Ridge regularization
y_pred_ridge = ridge_model.predict(X_large_test_sample)
# Show the new coefficients
ridge_model.coef_

array([-0.00066934, -0.03855133,  0.00735763,  0.00600741])

In [61]:
# Verify the model using the testing dataset and appropriate evaluation metrics
mse_ridge = mean_squared_error(y_large_test_sample, y_pred_lasso)
mae_ridge= mean_absolute_error(y_large_test_sample, y_pred_lasso)
r2_ridge = r2_score(y_large_test_sample, y_pred_lasso)
rmse_ridge = np.sqrt(mse_lasso)

print(f"Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 1:")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge:.4f}")
print(f"Mean Squared Error (MSE): {mse_ridge:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ridge:.4f}")
print(f"R-squared: {r2_ridge:.4f}")

Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 1:
Root Mean Squared Error (RMSE): 0.0283
Mean Squared Error (MSE): 0.0008
Mean Absolute Error (MAE): 0.0216
R-squared: 0.6892


In [62]:
# Run the ridge model with the optimal alpha
ridge_model = Ridge(alpha=0.0001)
ridge_model.fit(X_large_test_sample,y_large_test_sample)
# Predict the power production with Ridge regularization
y_pred_ridge = ridge_model.predict(X_large_test_sample)
# Show the new coefficients
ridge_model.coef_

array([-0.00067399, -0.03857578,  0.00736154,  0.00600847])

In [63]:
# Verify the model using the testing dataset and appropriate evaluation metrics
mse_ridge = mean_squared_error(y_large_test_sample, y_pred_lasso)
mae_ridge= mean_absolute_error(y_large_test_sample, y_pred_lasso)
r2_ridge = r2_score(y_large_test_sample, y_pred_lasso)
rmse_ridge = np.sqrt(mse_lasso)

print(f"Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 1:")
print(f"Root Mean Squared Error (RMSE): {rmse_ridge:.4f}")
print(f"Mean Squared Error (MSE): {mse_ridge:.4f}")
print(f"Mean Absolute Error (MAE): {mae_ridge:.4f}")
print(f"R-squared: {r2_ridge:.4f}")

Weighted Least Squares model evaluation on the testing dataset and the ridge regression with a penalty of 1:
Root Mean Squared Error (RMSE): 0.0283
Mean Squared Error (MSE): 0.0008
Mean Absolute Error (MAE): 0.0216
R-squared: 0.6892


The optimal penalty term for Lasso regression is 0.0001. This results in validation metric values with almost the same results as normal regression. Moreover, such a small penalty term indicates that it would be better to simply apply normal regression. This makes sense because for this model n>>p. There is a very large amount of data points and only 4 parameters. Consequently, there is already very low variance, before regularization is applied, minimizing the need for additional regularization. 

Ridge regression shows similar results, with one notable difference. For Ridge regression, it does not matter whether the penalty term is set to 1 or to 0.0001. The validation metrics remain exactly the same. This indicates that the data is inherently regularized; there is already a very low variance and the Ridge regression is not required to improve results. 
