In [14]:
!pip install lightgbm



In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [16]:
csv_file = "T1.csv"
df = pd.read_csv(csv_file)

In [17]:
df['Date/Time'] = pd.to_datetime(df['Date/Time'], format='%d %m %Y %H:%M')

# Extract month and hour
df['Month'] = df['Date/Time'].dt.month
df['Hour'] = df['Date/Time'].dt.hour

# Display the resulting DataFrame
df[['Date/Time', 'Month', 'Hour']]
df = df.drop(columns=['Date/Time'])

In [18]:
# get a sample from the data
sample_df = df.sample(frac=0.1, random_state=24)

In [19]:
f_df = df[~((df['LV ActivePower (kW)'] == 0) & (df['Theoretical_Power_Curve (KWh)'] != 0) & (df['Wind Speed (m/s)'] > 3))]

In [20]:
# Modify the 'wind speed (m/s)' column based on the condition
f_df['Wind Speed (m/s)'] = f_df['Wind Speed (m/s)'].apply(lambda x: 19 if x > 19.447 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_df['Wind Speed (m/s)'] = f_df['Wind Speed (m/s)'].apply(lambda x: 19 if x > 19.447 else x)


In [21]:
# split dependent and undependent variables
y = f_df['LV ActivePower (kW)']
X = f_df.drop(columns=['LV ActivePower (kW)'])

In [22]:
# split train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
f'Training data shape: {X_train.shape}, {y_train.shape}'

'Training data shape: (37626, 5), (37626,)'

In [24]:
f'Testing data shape: {X_test.shape}, {y_test.shape}'

'Testing data shape: (9407, 5), (9407,)'

In [25]:
# use standart scaler with continuous features Train Dataset
continuous_features = ['Wind Speed (m/s)', 'Wind Direction (°)']
scaler = StandardScaler()
X_train[continuous_features] = scaler.fit_transform(X_train[continuous_features])

In [26]:
# use standart scaler with continuous features Test Dataset
continuous_features = ['Wind Speed (m/s)', 'Wind Direction (°)']
scaler = StandardScaler()
X_test[continuous_features] = scaler.fit_transform(X_test[continuous_features])

In [30]:
y_train

13888      96.550636
1430     1043.302979
31717    1210.031006
37451     876.856384
10743    3370.260986
            ...     
12829       0.000000
47649      34.930962
40805     352.223114
905      2047.163940
17742     629.888123
Name: LV ActivePower (kW), Length: 37626, dtype: float64

In [31]:
# use standart scaler with training label
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.to_frame())

In [32]:
# use standart scaler with test label
scaler = StandardScaler()
y_test = scaler.fit_transform(y_test.to_frame())

In [33]:
# use sin and cos transformation for month and hour variables Train dataset
X_train['month_sin'] = np.sin(2 * np.pi * X_train['Month'] / 12)
X_train['month_cos'] = np.cos(2 * np.pi * X_train['Month'] / 12)
X_train['hour_sin'] = np.sin(2 * np.pi * X_train['Hour'] / 24)
X_train['hour_cos'] = np.cos(2 * np.pi * X_train['Hour'] / 24)

In [34]:
# use sin and cos transformation for month and hour variables Test dataset
X_test['month_sin'] = np.sin(2 * np.pi * X_test['Month'] / 12)
X_test['month_cos'] = np.cos(2 * np.pi * X_test['Month'] / 12)
X_test['hour_sin'] = np.sin(2 * np.pi * X_test['Hour'] / 24)
X_test['hour_cos'] = np.cos(2 * np.pi * X_test['Hour'] / 24)

In [35]:
# drop unnecessary columns
X_test = X_test.drop(columns=['Month','Hour','Theoretical_Power_Curve (KWh)'])
X_train = X_train.drop(columns=['Month','Hour','Theoretical_Power_Curve (KWh)'])

In [37]:
# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LGBMRegressor(boosting_type='gbdt',
                            objective='regression',
                            num_leaves=31,
                            learning_rate=0.05,
                            n_estimators=100))
])

In [38]:
pipeline.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 571
[LightGBM] [Info] Number of data points in the train set: 37626, number of used features: 6
[LightGBM] [Info] Start training from score 0.000000


In [40]:
y_pred = pipeline.predict(X_test)

In [41]:
# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

In [42]:
f'Model RMSE: {rmse}'

'Model RMSE: 0.015072950785025373'

In [43]:
f'Model R2 score: {r2}'

'Model R2 score: 0.9849270492149746'

In [44]:
f'Model MAE: {mae}'

'Model MAE: 0.0603865101486355'

In [46]:
# Predicted and real values
print(f'Predicted values: {y_pred}')
print(f'Real values: {y_test}')

Predicted values: [ 0.13457989  0.93576101 -0.77993992 ... -0.70584579  0.02796181
  0.80766394]
Real values: [[ 0.18315042]
 [ 0.8778791 ]
 [-0.79124096]
 ...
 [-0.66436776]
 [ 0.05212583]
 [ 0.68557957]]


In [47]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
print(f'Cross-validated R2 scores: {cv_scores}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 801
[LightGBM] [Info] Number of data points in the train set: 37626, number of used features: 5
[LightGBM] [Info] Start training from score 1292.696733
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 802
[LightGBM] [Info] Number of data points in the train set: 37626, number of used features: 5
[LightGBM] [Info] Start training from score 1506.290225
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [48]:
print(f'Mean cross-validated R2 score: {cv_scores.mean()}')

Mean cross-validated R2 score: 0.9631099152812181
