In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [20]:
file_path1 = r"C:\Users\ilsed\OneDrive\Documents\EPA\Jaar 2\DTU\machine learning for energy systems\Feature data.csv"
# file_path1="../Data assignment 1/ Feature data.csv"
data = pd.read_csv(file_path1)

### Feature scaling

In this step,we used Principal Component Analysis (PCA) to select and reduce the dimensionality of the features.

In [21]:
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

### 1. Standard Scaling for wind speed and temperature
data['Mean wind speed'] = scaler_standard.fit_transform(data[['Mean wind speed']])
data['Maximum temperature'] = scaler_standard.fit_transform(data[['Maximum temperature']])

### 2. Wind Direction (convert to sin and cos components)
data['Wind direction sin'] = np.sin(np.deg2rad(data['Mean wind direction']))
data['Wind direction cos'] = np.cos(np.deg2rad(data['Mean wind direction']))

### 3. Normalize Power Production 
nominal_capacity = 30000 # production capacity is 30 MW, unit of power production is kW so nominal capacity is 30000
data['AKI Kalby Active Power'] = data['AKI Kalby Active Power'] / nominal_capacity

# Dropping the original wind direction after transformation
data = data.drop('Mean wind direction', axis=1)

In [22]:
# set target and features
target_column = 'AKI Kalby Active Power'
features = data.drop(columns=[target_column])

In [23]:
data

Unnamed: 0,datetime,AKI Kalby Active Power,Maximum temperature,Mean wind speed,Wind direction sin,Wind direction cos
0,2022-01-01 00:00:00,-0.063118,-0.457945,0.868655,-0.998630,-5.233596e-02
1,2022-01-01 01:00:00,-0.055728,-0.457945,0.382418,-0.956305,-2.923717e-01
2,2022-01-01 02:00:00,-0.095724,-0.503187,0.756447,-0.994522,-1.045285e-01
3,2022-01-01 03:00:00,-0.063726,-0.518268,0.494627,-1.000000,-1.836970e-16
4,2022-01-01 04:00:00,-0.029392,-0.473025,0.307612,-0.951057,3.090170e-01
...,...,...,...,...,...,...
7808,2022-12-31 19:00:00,-0.148665,0.009559,1.953338,-0.656059,-7.547096e-01
7809,2022-12-31 20:00:00,-0.153192,0.039721,1.467101,-0.731354,-6.819984e-01
7810,2022-12-31 21:00:00,-0.120257,0.039721,1.504504,-0.681998,-7.313537e-01
7811,2022-12-31 22:00:00,-0.103334,-0.065844,1.242684,-0.798636,-6.018150e-01


In [9]:
# # Impute missing values using mean for numerical columns
# imputer = SimpleImputer(strategy='mean')
# features_imputed = imputer.fit_transform(features)
# 
# # Check for any remaining NaNs after imputation
# features_imputed_df = pd.DataFrame(features_imputed, columns=features.columns)
# print(f"Number of remaining NaNs: {features_imputed_df.isna().sum().sum()}")
# 
# # Drop any rows with NaN values (if any persist after imputation)
# features_imputed_df.dropna(inplace=True)

In [10]:
# # standardize the features
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features_imputed_df)

In [11]:
# # apply PCA and plotting
# pca = PCA()
# features_pca = pca.fit_transform(features_scaled)
# explained_variance = pca.explained_variance_ratio_
# cumulative_variance = explained_variance.cumsum()
# 
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')
# plt.title('Cumulative Explained Variance')
# plt.xlabel('Number of Principal Components')
# plt.ylabel('Cumulative Variance')
# plt.grid()
# plt.show()

 A subset of the original features was selected based on their contribution to the variance in the data. Given the output above, we selected **8** principal components, which capture **over 95%** of the variance in the dataset.

In [25]:
# # determine number and reduce dimension
# n_components = next(i for i, total_var in enumerate(cumulative_variance) if total_var >= 0.95) + 1 # 95% explained
# 
# pca = PCA(n_components=n_components)
# features_reduced = pca.fit_transform(features_scaled)

In [26]:
# # convert to a dataFrame
# features_pca_df = pd.DataFrame(features_reduced, columns=[f'PC{i+1}' for i in range(n_components)])
# features_pca_df[target_column] = data.loc[features_imputed_df.index, target_column].values

In [12]:
# # Get the contribution of each principal component to the original variable
# n_components = features_reduced.shape[1]  
# pca_components_df = pd.DataFrame(pca.components_[:n_components], columns=features_imputed_df.columns)
# 
# # Find the variable that contributes most to each principal component
# top_features_per_component = []
# for i in range(n_components):
#     # Get the variable index with the largest absolute value for each principal component
#     top_feature_index = np.argmax(np.abs(pca_components_df.iloc[i]))
#     top_feature_name = features_imputed_df.columns[top_feature_index]
#     top_features_per_component.append(top_feature_name)
# 
# # Output the selected feature variable name
# print("Name of the feature variable selected by PCA:")
# print(top_features_per_component)


The original dataset includes actual weather data and historical windpark data. We combined these features into new composite features via PCA. Here we can see the types of the features we selected.

In [13]:
# features_pca_df

**Dataset Split**: 
  - The notebook splits the dataset into training and testing sets using an 80-20 split. This means 80% of the data is used for training the model, while the remaining 20% is used for testing.
  - This split was done using the `train_test_split` function from `scikit-learn`.

**Contents of the Training and Testing Datasets**:
  - **Training Dataset**: Contains the selected principal components as input features (`X_train`) and the target variable (`y_train`), which is the wind farm's power production.
  - **Testing Dataset**: Contains the same input features (`X_test`) and target variable (`y_test`) as the training dataset but is reserved for evaluating the model's performance.

In [24]:
data.set_index('datetime', inplace=True)

In [25]:
data

Unnamed: 0_level_0,AKI Kalby Active Power,Maximum temperature,Mean wind speed,Wind direction sin,Wind direction cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01 00:00:00,-0.063118,-0.457945,0.868655,-0.998630,-5.233596e-02
2022-01-01 01:00:00,-0.055728,-0.457945,0.382418,-0.956305,-2.923717e-01
2022-01-01 02:00:00,-0.095724,-0.503187,0.756447,-0.994522,-1.045285e-01
2022-01-01 03:00:00,-0.063726,-0.518268,0.494627,-1.000000,-1.836970e-16
2022-01-01 04:00:00,-0.029392,-0.473025,0.307612,-0.951057,3.090170e-01
...,...,...,...,...,...
2022-12-31 19:00:00,-0.148665,0.009559,1.953338,-0.656059,-7.547096e-01
2022-12-31 20:00:00,-0.153192,0.039721,1.467101,-0.731354,-6.819984e-01
2022-12-31 21:00:00,-0.120257,0.039721,1.504504,-0.681998,-7.313537e-01
2022-12-31 22:00:00,-0.103334,-0.065844,1.242684,-0.798636,-6.018150e-01


In [26]:
# split the data
X = data.drop(columns=[target_column])
y = data[target_column]

split_data= TimeSeriesSplit(n_splits=3, test_size=int(0.2*len(data)))
for train_index, test_index in split_data.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
print("TRAIN indices:", train_index, "TEST indices:", test_index)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

TRAIN indices: [   0    1    2 ... 6248 6249 6250] TEST indices: [6251 6252 6253 ... 7810 7811 7812]
X_train shape: (6251, 4)
X_test shape: (1562, 4)
y_train shape: (6251,)
y_test shape: (1562,)


### Step 3 Linear regression

In [17]:
# Step 3.1 Please show that these two methods end up with the same solution.
# start with 100 datapoints  
X_sample, X_test_sample, y_sample, y_test_sample = train_test_split(data[:100], data[target_column][:100], test_size=0.2, random_state=42)

NameError: name 'train_test_split' is not defined

In [17]:
# gradient descent
def gradient_descent(X, y, learning_rate=0.01, epochs=100000):
    m, n = X.shape
    theta = np.zeros(n)
    for _ in range(epochs):
        y_pred = X @ theta
        gradients = (1/m) * X.T @ (y_pred - y)
        theta -= learning_rate * gradients
    return theta

# Adding a column of ones to X_sample for the bias term
X_sample_with_bias = np.c_[np.ones(X_sample.shape[0]), X_sample]
X_test_sample_with_bias = np.c_[np.ones(X_test_sample.shape[0]), X_test_sample]

theta_gd = gradient_descent(X_sample_with_bias, y_sample)
y_pred_gd = X_test_sample_with_bias @ theta_gd
print(theta_gd)

TypeError: can't multiply sequence by non-int of type 'float'

In [32]:
# closed-Form solution
theta_closed_form = np.linalg.inv(X_sample_with_bias.T @ X_sample_with_bias) @ X_sample_with_bias.T @ y_sample
y_pred_closed_form = X_test_sample_with_bias @ theta_closed_form
print(theta_closed_form)

[-383.3704364  1198.4834135  -577.09838129 1346.63961396   92.56363692
 -773.26558664 -148.50118492   64.38179525  874.79710726]


In [33]:
print("Gradient Descent MSE:", mean_squared_error(y_test_sample, y_pred_gd))
print("Closed-Form MSE:", mean_squared_error(y_test_sample, y_pred_closed_form))

Gradient Descent MSE: 1235242.956553467
Closed-Form MSE: 1235273.05503181


In [34]:
# Step 3.2 Please increase the number of samples to improve the accuracy of prediction and only use the closed form solution
X_large_sample, X_large_test_sample, y_large_sample, y_large_test_sample = train_test_split(features_reduced, data[target_column], test_size=0.2, random_state=42)

# Adding a column of ones for the bias term in the large sample
X_large_sample_with_bias = np.c_[np.ones(X_large_sample.shape[0]), X_large_sample]
X_large_test_sample_with_bias = np.c_[np.ones(X_large_test_sample.shape[0]), X_large_test_sample]

# upgrade the normal equation
theta_large_sample = np.linalg.inv(X_large_sample_with_bias.T @ X_large_sample_with_bias) @ X_large_sample_with_bias.T @ y_large_sample
theta_large_sample_rounded = np.round(theta_large_sample, 2)

print(f"Step 3.2: Closed-form solution training complete on the larger sample.")
print(f"Coefficients: {theta_large_sample_rounded}")

Step 3.2: Closed-form solution training complete on the larger sample.
Coefficients: [1866.75  584.16  395.71  101.27   -8.56 -124.97  255.91    2.27 -171.51]


Given this output, we can get the optimal model: 

$$
y = 1866.75 + 584.16x_1 + 395.71x_2 + 101.27x_3 - 8.56x_4 - 124.97x_5 + 255.91x_6 + 2.27x_7 - 171.51x_8
$$

Here: 
- $ y $ is the target
- $ x_1, x_2, \dots, x_8 $ are selected features

In [35]:
# Step 3.3 Verify your model using the testing dataset and appropriate evaluation metrics
y_large_pred_closed_form = X_large_test_sample_with_bias @ theta_large_sample

mse = mean_squared_error(y_large_test_sample, y_large_pred_closed_form)
mae = mean_absolute_error(y_large_test_sample, y_large_pred_closed_form)
r2 = r2_score(y_large_test_sample, y_large_pred_closed_form)
rmse = np.sqrt(mse)

print(f"Step 3.3: Model evaluation on the testing dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared: {r2:.4f}")

Step 3.3: Model evaluation on the testing dataset:
Root Mean Squared Error (RMSE): 1293.0691
Mean Squared Error (MSE): 1672027.6484
Mean Absolute Error (MAE): 902.1377
R-squared: 0.5935


### Step 4 Non-linear Regression

In Step 1's formulation, if the price $\lambda$ is treated as a constant and the actual value p is known, the entire formula simplifies into a function dependent on the predicted value $\hat{p}_t$. This implies that the problem can be reframed as an optimization task concerning the prediction of $\hat{p}_t$. Given this perspective, extending the linear regression model from Step 3 by incorporating nonlinear features to predict $\hat{p}_t$ effectively transforms the problem into a nonlinear regression for Step 1's objective. Therefore, performing nonlinear regression on the prediction model of $\hat{p}_t$ inherently satisfies the requirements of the nonlinear extension outlined in Step 4.

In [36]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [37]:
# Step 4.1 Add polynomial features (squared and cubic terms)
poly = PolynomialFeatures(degree=3, include_bias=False) 
X_poly = poly.fit_transform(features_reduced)

# convert to DataFrame
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(features_pca_df.columns[:-1]))

# split the polynomial feature data
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(X_poly_df, y, test_size=0.2, random_state=42)

# fit linear regression model on the polynomial features
linear_model_poly = LinearRegression()
linear_model_poly.fit(X_train_poly, y_train_poly)

# predict on the testing data
y_pred_poly = linear_model_poly.predict(X_test_poly)

In [38]:
# evaluate the performance
mse_poly = mean_squared_error(y_test_poly, y_pred_poly)
mae_poly = mean_absolute_error(y_test_poly, y_pred_poly)
r2_poly = r2_score(y_test_poly, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)

print(f"Nonlinear model evaluation on the testing dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse_poly:.4f}")
print(f"Mean Squared Error (MSE): {mse_poly:.4f}")
print(f"Mean Absolute Error (MAE): {mae_poly:.4f}")
print(f"R-squared: {r2_poly:.4f}")

Nonlinear Model Evaluation on the Testing Dataset:
Root Mean Squared Error (RMSE): 923.4014
Mean Squared Error (MSE): 852670.1829
Mean Absolute Error (MAE): 600.8771
R-squared: 0.7927
