In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor


In [5]:
# Relative locations of the two input files
file_path1 = r"../Data assignment 1/Windpark data.csv"
file_path2 = r"../Data assignment 1/Weather data.csv"

# Read both CSV files into pandas DataFrames (wind park file first, then weather file)
Bornholm, Weather = (pd.read_csv(csv_path) for csv_path in (file_path1, file_path2))

In [8]:
# Combine the wind park and weather tables on their shared 'datetime' column.
# how='outer' keeps every timestamp found in either table; values missing on
# one side are filled with NaN.
merged_df = Bornholm.merge(Weather, on='datetime', how='outer')

In [10]:
# Count the missing (NaN) entries in every column.
missing_per_column = merged_df.isna().sum()
missing_per_column

datetime                             0
AKI Kalby Active Power             947
Maximum temperature                  0
Accumulated percipitation            0
Mean wind speed                      0
Minimum temperature                  0
Mean temperature                     0
Mean humidity                        0
Mean wind direction                  0
Mean intensity global radiation      0
dtype: int64

There are 947 missing values in the power production column (`AKI Kalby Active Power`). These rows are dropped because a recording issue left longer stretches of time without any data. Imputing these gaps with average values or other methods was deemed inappropriate, as it could skew the analysis and lead to inaccurate conclusions. Therefore, to maintain the integrity of the dataset, these entries are excluded entirely.



In [11]:
# Drop every row that has a missing value in any column. Per the missing-value
# check above, only 'AKI Kalby Active Power' contains gaps.
# The result is reassigned rather than using inplace=True, so the frame that
# earlier cells displayed is not mutated behind the reader's back.
merged_df = merged_df.dropna()

In [12]:
# Use the timestamp as the row index.
# The guard makes this cell safe to re-run: once 'datetime' has been moved into
# the index it is no longer a column, and calling set_index again would raise
# KeyError.
if 'datetime' in merged_df.columns:
    merged_df = merged_df.set_index('datetime')

In [13]:
# Preview the cleaned frame, now indexed by 'datetime'. pandas shortens the
# display of a long frame to its first and last rows.
merged_df

Unnamed: 0_level_0,AKI Kalby Active Power,Maximum temperature,Accumulated percipitation,Mean wind speed,Minimum temperature,Mean temperature,Mean humidity,Mean wind direction,Mean intensity global radiation
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-01 00:00:00,-1893.541825,7.3,0.0,6.9,7.0,7.2,99.0,267.0,0.0
2022-01-01 01:00:00,-1671.836030,7.3,0.0,5.6,6.6,6.8,99.0,253.0,0.0
2022-01-01 02:00:00,-2871.720706,7.0,0.0,6.6,6.6,6.8,99.0,264.0,0.0
2022-01-01 03:00:00,-1911.765229,6.9,0.0,5.9,6.5,6.7,98.0,270.0,0.0
2022-01-01 04:00:00,-881.754117,7.2,0.0,5.4,6.5,6.8,98.0,288.0,0.0
...,...,...,...,...,...,...,...,...,...
2022-12-31 19:00:00,-4459.956834,10.4,0.0,9.8,9.8,10.1,93.0,221.0,0.0
2022-12-31 20:00:00,-4595.766776,10.6,0.0,8.5,9.9,10.2,94.0,227.0,0.0
2022-12-31 21:00:00,-3607.702982,10.6,0.3,8.6,9.7,10.1,96.0,223.0,0.0
2022-12-31 22:00:00,-3100.013804,9.9,0.5,7.9,9.6,9.8,98.0,233.0,0.0


### Feature selection
Feature selection is based on a combination of variance and correlation with the target variable: features that show a relatively high variance and a relatively strong correlation with the power production are selected.

In [14]:
# Per-column sample variance (ddof=1 is the pandas default); the target column
# is included alongside the features.
variance_per_feature = merged_df.var(ddof=1)

# Show the variance of every column
print(variance_per_feature)

AKI Kalby Active Power             2.279024e+06
Maximum temperature                4.397526e+01
Accumulated percipitation          2.109360e-01
Mean wind speed                    7.149015e+00
Minimum temperature                4.266575e+01
Mean temperature                   4.315623e+01
Mean humidity                      1.700687e+02
Mean wind direction                8.296777e+03
Mean intensity global radiation    4.789245e+04
dtype: float64


Some features (like wind direction, radiation intensity, and humidity) have higher variability, suggesting they change more frequently or drastically than features like wind speed or temperature. Because variance depends on each feature's unit and scale, these raw values are only a rough indication and are not directly comparable across features. Features with higher variance might be important for predicting the target variable (active power) since they reflect more dynamic environmental conditions. However, low variance does not necessarily mean the feature is unimportant — for instance, wind speed, despite its lower variance, is likely a crucial factor for power generation. This is why, as a cross-check, the correlation between the target variable and the features is examined next.
 

In [15]:
# Correlation of every column with the target, read from the target's column
# of the full correlation matrix
target_correlations = merged_df.corr().loc[:, 'AKI Kalby Active Power']
print(target_correlations)

AKI Kalby Active Power             1.000000
Maximum temperature                0.106181
Accumulated percipitation         -0.063650
Mean wind speed                   -0.797750
Minimum temperature                0.086575
Mean temperature                   0.096825
Mean humidity                      0.000850
Mean wind direction               -0.098898
Mean intensity global radiation    0.021909
Name: AKI Kalby Active Power, dtype: float64


In [16]:
# Same correlations with the target, this time computed directly with corrwith
power_series = merged_df['AKI Kalby Active Power']
print(merged_df.corrwith(power_series))


AKI Kalby Active Power             1.000000
Maximum temperature                0.106181
Accumulated percipitation         -0.063650
Mean wind speed                   -0.797750
Minimum temperature                0.086575
Mean temperature                   0.096825
Mean humidity                      0.000850
Mean wind direction               -0.098898
Mean intensity global radiation    0.021909
dtype: float64


In [18]:
# Pairwise correlation matrix of all columns (Pearson, the pandas default)
correlation_matrix = merged_df.corr(method='pearson')
correlation_matrix

Unnamed: 0,AKI Kalby Active Power,Maximum temperature,Accumulated percipitation,Mean wind speed,Minimum temperature,Mean temperature,Mean humidity,Mean wind direction,Mean intensity global radiation
AKI Kalby Active Power,1.0,0.106181,-0.06365,-0.79775,0.086575,0.096825,0.00085,-0.098898,0.021909
Maximum temperature,0.106181,1.0,-0.012325,-0.133873,0.99546,0.998759,-0.14234,-0.056822,0.4344
Accumulated percipitation,-0.06365,-0.012325,1.0,0.046444,-0.01419,-0.013943,0.14255,0.017684,-0.082194
Mean wind speed,-0.79775,-0.133873,0.046444,1.0,-0.109032,-0.121967,-0.155046,0.046511,0.115031
Minimum temperature,0.086575,0.99546,-0.01419,-0.109032,1.0,0.998666,-0.130183,-0.068933,0.417069
Mean temperature,0.096825,0.998759,-0.013943,-0.121967,0.998666,1.0,-0.13684,-0.062243,0.425965
Mean humidity,0.00085,-0.14234,0.14255,-0.155046,-0.130183,-0.13684,1.0,0.131818,-0.576348
Mean wind direction,-0.098898,-0.056822,0.017684,0.046511,-0.068933,-0.062243,0.131818,1.0,-0.116282
Mean intensity global radiation,0.021909,0.4344,-0.082194,0.115031,0.417069,0.425965,-0.576348,-0.116282,1.0


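With a correlation of -0.80, mean wind speed has a strong negative correlation with power production. The power values in the preview above are recorded as negative numbers, so a negative correlation here means that higher wind speeds go together with a larger power magnitude. This indicates that wind speed has a strong influence on power production, and it is likely a crucial feature.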
The temperature features may not contribute significantly to the prediction model, but can have secondary effects on power production (e.g., affecting air density or turbine efficiency). The three temperature features are almost perfectly correlated with each other (above 0.99), so keeping more than one would add little information. For completeness, and because it is the feature with the second-highest absolute correlation with the target (0.11), we decided to keep maximum temperature but treat it with lower priority.
Despite the high variance, the low correlation with power production suggests that wind direction doesn't strongly impact power output. However, it still has the third-highest absolute correlation of the features (-0.10), which is why we decided to take it into account anyway.

In [19]:
# Keep only the target column and the three selected features
selected_columns = [
    'AKI Kalby Active Power',
    'Maximum temperature',
    'Mean wind direction',
    'Mean wind speed',
]
feature_df = merged_df.loc[:, selected_columns]

In [21]:
# Destination of the reduced dataset
output_path = r"../Data assignment 1/Feature data.csv"
# Write the selected columns to CSV; index=True stores the 'datetime' index as the first column
feature_df.to_csv(path_or_buf=output_path, index=True)