In [291]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [292]:
file_path = 'Feature data.csv'
data = pd.read_csv(file_path)
data2=pd.read_csv("Forcasted Weather data new.csv")
print(data2.shape)

(8760, 12)


In [293]:
data2=data2.drop('Accumulated percipitation', axis=1)
data2=data2.drop('Mean humidity', axis=1)
data2=data2.drop('Wind Speed Y Direction', axis=1)
data2=data2.drop('Wind Speed X Direction', axis=1)
data2=data2.drop('Mean temperature', axis=1)
data2=data2.drop('Minimum temperature', axis=1)
data2=data2.drop('Solar Shortwave Flux', axis=1)
data2=data2.drop('Unnamed: 0',axis=1)
data2.head()

Unnamed: 0,ts,Maximum temperature,Mean wind direction,Mean wind speed
0,2022-01-01 00:00:00,279.84314,235.57857,5.813893
1,2022-01-01 01:00:00,279.84314,235.57857,5.813893
2,2022-01-01 02:00:00,279.84314,235.57857,5.813893
3,2022-01-01 03:00:00,279.84314,235.57857,5.813893
4,2022-01-01 04:00:00,279.84314,235.57857,5.813893


### Feature scaling

In this step,we used Principal Component Analysis (PCA) to select and reduce the dimensionality of the features.

In [294]:
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

### 1. Standard Scaling for wind speed and temperature
data['Mean wind speed'] = scaler_standard.fit_transform(data[['Mean wind speed']])
data['Maximum temperature'] = scaler_standard.fit_transform(data[['Maximum temperature']])

### 2. Wind Direction (convert to sin and cos components)
data['Wind direction sin'] = np.sin(np.deg2rad(data['Mean wind direction']))
data['Wind direction cos'] = np.cos(np.deg2rad(data['Mean wind direction']))

### 3. Normalize Power Production
nominal_capacity = 30000 # production capacity is 30 MW, unit of power production is kW so nominal capacity is 30000
data['AKI Kalby Active Power'] = data['AKI Kalby Active Power'] / nominal_capacity

# Dropping the original wind direction after transformation
data = data.drop('Mean wind direction', axis=1)

In [295]:
### 1. Standard Scaling for wind speed and temperature
data2['Mean wind speed'] = scaler_standard.fit_transform(data2[['Mean wind speed']])
data2['Maximum temperature'] = scaler_standard.fit_transform(data2[['Maximum temperature']])

### 2. Wind Direction (convert to sin and cos components)
data2['Wind direction sin'] = np.sin(np.deg2rad(data2['Mean wind direction']))
data2['Wind direction cos'] = np.cos(np.deg2rad(data2['Mean wind direction']))


# Dropping the original wind direction after transformation
data2 = data2.drop('Mean wind direction', axis=1)

In [296]:
# Make sure datetime is set as the index
data['datetime'] = pd.to_datetime(data['datetime'])
data.set_index('datetime', inplace=True)

In [297]:
data2['ts'] = pd.to_datetime(data2['ts'])
data2.set_index('ts', inplace=True)

In [298]:
# set target and features, and remove non-numeric columns
target_column = 'AKI Kalby Active Power'
features = data.select_dtypes(include=[np.number]).drop(columns=[target_column])

In [299]:
# split the data
X = data.drop(columns=[target_column])
y = data[target_column]

split_data= TimeSeriesSplit(n_splits=3, test_size=int(0.2*len(data)))
for train_index, test_index in split_data.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

print("TRAIN indices:", train_index, "TEST indices:", test_index)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

TRAIN indices: [   0    1    2 ... 6248 6249 6250] TEST indices: [6251 6252 6253 ... 7810 7811 7812]
X_train shape: (6251, 4)
X_test shape: (1562, 4)
y_train shape: (6251,)
y_test shape: (1562,)


### Step 3 Linear regression

In [300]:
# Step 3.1 Please show that these two methods end up with the same solution.
X_sample = features[:100]
X_forecast_sample = data2[:20]
y_sample = data[target_column][:100]



In [301]:
# Sequential split (shuffle=False)
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(
    X_sample, y_sample, test_size=0.2, shuffle=False, random_state=42)

# Adding a column of ones to X_sample for the bias term and converting to NumPy array
X_sample_train_with_bias = np.c_[np.ones(X_sample_train.shape[0]), X_sample_train].astype(float)
X_test_with_bias = np.c_[np.ones(X_sample_test.shape[0]), X_sample_test].astype(float)
X_forecast_sample_with_bias=np.c_[np.ones(X_forecast_sample.shape[0]), X_forecast_sample].astype(float)

# Ensure y_sample_train is also a NumPy array
y_sample_train = np.array(y_sample_train).astype(float)

In [302]:
# Gradient Descent function
def gradient_descent(X, y, learning_rate=0.01, epochs=100000):
    m, n = X.shape
    theta = np.zeros(n)
    for _ in range(epochs):
        y_pred = X @ theta
        gradients = (1/m) * X.T @ (y_pred - y)
        theta -= learning_rate * gradients
    return theta

theta_gd = gradient_descent(X_sample_train_with_bias, y_sample_train)

# Predictions using Gradient Descent
y_pred_gd = X_forecast_sample_with_bias @ theta_gd

In [303]:
# Closed-form solution
theta_closed_form = np.linalg.inv(X_sample_train_with_bias.T @ X_sample_train_with_bias) @ X_sample_train_with_bias.T @ y_sample_train

# Predictions using closed-form solution
y_pred_closed_form = X_forecast_sample_with_bias @ theta_closed_form
# print(y_pred_closed_form)


In [304]:
# mse calculation
mse_gd = mean_squared_error(y_sample_test, y_pred_gd)
mse_closed_form = mean_squared_error(y_sample_test, y_pred_closed_form)

print(f"Gradient Descent θ: {[f'{x:.5f}' for x in theta_gd]}")
print(f"Closed-Form θ: {[f'{x:.5f}' for x in theta_closed_form]}")
print(f"Gradient Descent MSE: {mse_gd:.5f}")
print(f"Closed-Form MSE: {mse_closed_form:.5f}")

Gradient Descent θ: ['-0.03789', '0.02592', '-0.04119', '0.01152', '0.01116']
Closed-Form θ: ['-0.03789', '0.02592', '-0.04119', '0.01152', '0.01116']
Gradient Descent MSE: 0.00206
Closed-Form MSE: 0.00206


In [305]:
# Step 3.2: Use the full dataset and closed form solution
X_large_sample, X_large_test_sample, y_large_sample, y_large_test_sample = train_test_split(features, data[target_column], test_size=0.2, random_state=42)
X_large_sample_forecast=data2[:1563]

# Adding a column of ones for the bias term in the large sample
X_large_sample_with_bias = np.c_[np.ones(X_large_sample.shape[0]), X_large_sample]
X_large_test_sample_with_bias = np.c_[np.ones(X_large_test_sample.shape[0]), X_large_test_sample]
X_large_sample_forecast_with_bias=np.c_[np.ones(X_large_sample_forecast.shape[0]), X_large_sample_forecast]

# Upgrade the normal equation
theta_large_sample = np.linalg.inv(X_large_sample_with_bias.T @ X_large_sample_with_bias) @ X_large_sample_with_bias.T @ y_large_sample
theta_large_sample_rounded = np.round(theta_large_sample, 5)

print(f"Step 3.2: Closed-form solution training complete on the larger sample.")
print(f"Coefficients: {[f'{x:.5f}' for x in theta_large_sample_rounded]}")

Step 3.2: Closed-form solution training complete on the larger sample.
Coefficients: ['-0.04394', '0.00016', '-0.03909', '0.00591', '0.00366']


In [306]:
X_large_test_sample.reset_index(inplace=True)
data2_reset=data2.reset_index()
data2_reset['datetime']=data2_reset['ts']

In [307]:
X_large_test_sample = X_large_test_sample.set_index('datetime')
data2_reset = data2_reset.set_index('datetime')
merged_ds = X_large_test_sample.join(data2_reset, how='left', lsuffix='_left', rsuffix='_right').reset_index()
merged_ds

Unnamed: 0,datetime,Maximum temperature_left,Mean wind speed_left,Wind direction sin_left,Wind direction cos_left,ts,Maximum temperature_right,Mean wind speed_right,Wind direction sin_right,Wind direction cos_right
0,2022-05-20 06:00:00,1.427152,0.569432,-0.909961,-0.414693,2022-05-20 06:00:00,0.963624,-0.130353,-0.890382,0.455215
1,2022-11-15 09:00:00,-0.156329,-0.515250,0.898794,-0.438371,2022-11-15 09:00:00,-0.258735,-0.626248,0.964713,-0.263303
2,2022-05-20 01:00:00,1.819252,0.457224,-0.544639,-0.838671,2022-05-20 01:00:00,1.027204,-0.659087,-0.994775,0.102093
3,2022-12-22 23:00:00,-0.653995,0.906058,-0.891007,-0.453990,2022-12-22 23:00:00,-0.744850,1.032841,-0.929767,-0.368149
4,2022-08-20 20:00:00,1.442233,-1.038890,-0.913545,0.406737,2022-08-20 20:00:00,1.300281,-0.941475,-0.572338,0.820018
...,...,...,...,...,...,...,...,...,...,...
1558,2022-09-03 12:00:00,1.246183,0.307612,0.961262,-0.275637,2022-09-03 12:00:00,1.147728,-0.344476,0.987257,-0.159132
1559,2022-09-12 16:00:00,1.065214,-0.552653,-0.707107,-0.707107,2022-09-12 16:00:00,0.913578,-0.812598,0.249059,-0.968488
1560,2022-05-05 23:00:00,-0.473025,-0.477847,-0.974370,0.224951,2022-05-05 23:00:00,-0.339004,-1.200294,-0.866022,0.500006
1561,2022-04-01 16:00:00,-1.106418,1.654115,0.681998,0.731354,2022-04-01 16:00:00,-1.095397,1.063045,0.743966,0.668218


In [290]:
merged_ds.iloc[583]

Unnamed: 0,583
datetime,2022-01-01 19:00:00
Maximum temperature_left,-0.834964
Mean wind speed_left,-0.141222
Wind direction sin_left,0.743145
Wind direction cos_left,-0.669131
ts,NaT
Maximum temperature_right,
Mean wind speed_right,
Wind direction sin_right,
Wind direction cos_right,


In [308]:
merged_ds=merged_ds.drop('Maximum temperature_left', axis=1)
merged_ds=merged_ds.drop('Mean wind speed_left', axis=1)
merged_ds=merged_ds.drop('Wind direction sin_left', axis=1)
merged_ds=merged_ds.drop('Wind direction cos_left', axis=1)
merged_ds=merged_ds.drop('datetime', axis=1)

In [309]:
merged_ds=merged_ds.drop('ts', axis=1)
merged_ds

Unnamed: 0,Maximum temperature_right,Mean wind speed_right,Wind direction sin_right,Wind direction cos_right
0,0.963624,-0.130353,-0.890382,0.455215
1,-0.258735,-0.626248,0.964713,-0.263303
2,1.027204,-0.659087,-0.994775,0.102093
3,-0.744850,1.032841,-0.929767,-0.368149
4,1.300281,-0.941475,-0.572338,0.820018
...,...,...,...,...
1558,1.147728,-0.344476,0.987257,-0.159132
1559,0.913578,-0.812598,0.249059,-0.968488
1560,-0.339004,-1.200294,-0.866022,0.500006
1561,-1.095397,1.063045,0.743966,0.668218


In [310]:
merged_ds_with_bias=np.c_[np.ones(merged_ds.shape[0]), merged_ds]

In [311]:
nan_df=merged_ds[merged_ds.isna().any(axis=1)]
print(nan_df)

Empty DataFrame
Columns: [Maximum temperature_right, Mean wind speed_right, Wind direction sin_right, Wind direction cos_right]
Index: []


In [312]:
# Step 3.3: Verify your model using the testing dataset and appropriate evaluation metrics
y_large_pred_closed_form = merged_ds_with_bias @ theta_large_sample

mse = mean_squared_error(y_large_test_sample, y_large_pred_closed_form)
mae = mean_absolute_error(y_large_test_sample, y_large_pred_closed_form)
r2 = r2_score(y_large_test_sample, y_large_pred_closed_form)
rmse = np.sqrt(mse)

print(f"Step 3.3: Model evaluation on the testing dataset:")
print(f"Root Mean Squared Error (RMSE): {rmse:.5f}")
print(f"Mean Squared Error (MSE): {mse:.5f}")
print(f"Mean Absolute Error (MAE): {mae:.5f}")
print(f"R-squared: {r2:.5f}")

Step 3.3: Model evaluation on the testing dataset:
Root Mean Squared Error (RMSE): 0.02865
Mean Squared Error (MSE): 0.00082
Mean Absolute Error (MAE): 0.02156
R-squared: 0.68197
