In [15]:
import pandas as pd
import numpy as np
import os
import sys

# Make the `src` package importable: put the parent of the current working
# directory on sys.path.
project_root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))  # /aicup-predict-energy-generation
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root_path not in sys.path:
    sys.path.append(project_root_path)

from src.utils import choose_device
from src.feature_engineering import create_time_features, create_sinusoidal_transformation_by_number, create_sinusoidal_transformation_year_month_day, calculate_pressure_diff

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler

In [16]:
# Load the combined dataset, parse the `datetime` column into real timestamps,
# then order rows by device and time and renumber the index from 0.
df_raw_data = (
    pd.read_csv('../data/processed_data/combined_data.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by=['device', 'datetime'])
    .reset_index(drop=True)
)

In [17]:
# Device under study, kept in one place so it is easy to switch while testing.
DEVICE_ID = 'L8'
# presumably restricts the raw frame to that device's rows — see src.utils.choose_device
df_device = choose_device(df_raw_data, DEVICE_ID)

## Preprocessing

In [18]:
# Display the selected device's frame (the notebook renders a truncated view).
df_device

Unnamed: 0,locationcode,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device
1084283,8,2024-01-06 06:21:19,0.0,1016.10,14.67,80.94,23.33,0.00,L8
1084284,8,2024-01-06 06:22:19,0.0,1016.10,14.65,80.95,25.83,0.00,L8
1084285,8,2024-01-06 06:23:19,0.0,1016.07,14.63,81.00,30.83,0.00,L8
1084286,8,2024-01-06 06:24:19,0.0,1016.02,14.61,81.02,35.00,0.00,L8
1084287,8,2024-01-06 06:25:19,0.0,1016.06,14.60,81.07,41.67,0.00,L8
...,...,...,...,...,...,...,...,...,...
1201277,8,2024-08-31 15:07:16,0.0,1002.31,40.37,38.86,12590.83,32.83,L8
1201278,8,2024-08-31 15:08:16,0.0,1002.33,40.13,39.45,13351.67,40.63,L8
1201279,8,2024-08-31 15:09:16,0.0,1002.36,39.96,40.12,13168.33,38.16,L8
1201280,8,2024-08-31 15:10:16,0.0,1002.36,39.57,40.80,13605.83,43.10,L8


In [19]:
# Weather measurements to rescale to zero mean / unit variance.
# `pressure` and `power` are not in this list and stay in their original units.
columns_to_standardize = ['windspeed', 'temperature', 'humidity', 'sunlight']

# NOTE(review): the scaler is fitted on every row of the device frame, i.e.
# before the train/test split further down the notebook — confirm this is
# intended, since test-period statistics then influence the scaling.
scaler = StandardScaler()
scaler.fit(df_device[columns_to_standardize])

# Work on a copy so `df_device` keeps the raw readings.
df_standardized = df_device.copy()
df_standardized[columns_to_standardize] = scaler.transform(df_standardized[columns_to_standardize])

## Feature engineering

In [20]:
# Add a `pressure_diff` column derived from the raw `pressure` readings.
# NOTE(review): in the displayed output pressure_diff equals pressure - 1009.74
# on every visible row, i.e. an offset from a fixed reference value — confirm
# where that reference comes from in src.feature_engineering.calculate_pressure_diff.
df_fe_result = calculate_pressure_diff(df_standardized, column='pressure')
df_fe_result

Unnamed: 0,locationcode,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device,pressure_diff
1084283,8,2024-01-06 06:21:19,-0.359678,1016.10,-1.787445,0.229870,-0.547503,0.00,L8,6.36
1084284,8,2024-01-06 06:22:19,-0.359678,1016.10,-1.790297,0.230381,-0.547392,0.00,L8,6.36
1084285,8,2024-01-06 06:23:19,-0.359678,1016.07,-1.793148,0.232938,-0.547171,0.00,L8,6.33
1084286,8,2024-01-06 06:24:19,-0.359678,1016.02,-1.796000,0.233961,-0.546986,0.00,L8,6.28
1084287,8,2024-01-06 06:25:19,-0.359678,1016.06,-1.797426,0.236517,-0.546690,0.00,L8,6.32
...,...,...,...,...,...,...,...,...,...,...
1201277,8,2024-08-31 15:07:16,-0.359678,1002.31,1.876697,-1.921849,0.009739,32.83,L8,-7.43
1201278,8,2024-08-31 15:08:16,-0.359678,1002.33,1.842480,-1.891680,0.043475,40.63,L8,-7.41
1201279,8,2024-08-31 15:09:16,-0.359678,1002.36,1.818242,-1.857420,0.035346,38.16,L8,-7.38
1201280,8,2024-08-31 15:10:16,-0.359678,1002.36,1.762638,-1.822649,0.054744,43.10,L8,-7.38


Create sinusoidal month mapping

In [21]:
# Month numbers 1..12.
month_numbers = list(range(1, 13))

# One-column frame of month numbers, passed to the project helper with a
# period of 12; the result carries `month_sin` / `month_cos` columns.
df_months = create_sinusoidal_transformation_by_number(
    pd.DataFrame({"month": month_numbers}), 'month', 12
)
df_months

Unnamed: 0,month,month_sin,month_cos
0,1,0.5,0.8660254
1,2,0.8660254,0.5
2,3,1.0,6.123234000000001e-17
3,4,0.8660254,-0.5
4,5,0.5,-0.8660254
5,6,1.224647e-16,-1.0
6,7,-0.5,-0.8660254
7,8,-0.8660254,-0.5
8,9,-1.0,-1.83697e-16
9,10,-0.8660254,0.5


#### General time frame

In [22]:
# Daily calendar covering 2023-01-01 .. 2025-12-31 (both ends included).
general_dates = pd.date_range(start="2023-01-01", end="2025-12-31")

# Split every date into year / month / day columns and hand the frame to the
# project helper, which adds `general_ymd_sin` / `general_ymd_cos`.
# NOTE(review): the displayed values jump around the unit circle between
# consecutive days (sin: -0.5, 0.866, -1.0 for 2023-01-01..03) and appear to
# match sin/cos of 2*pi*(year*month*day)/12 — confirm in
# src.feature_engineering that this is the intended encoding.
df_general_dates = create_sinusoidal_transformation_year_month_day(
    pd.DataFrame({part: getattr(general_dates, part) for part in ("year", "month", "day")}),
    'general_ymd', 'year', 'month', 'day', 12,
)
df_general_dates

Unnamed: 0,year,month,day,general_ymd_sin,general_ymd_cos
0,2023,1,1,-5.000000e-01,-8.660254e-01
1,2023,1,2,8.660254e-01,5.000000e-01
2,2023,1,3,-1.000000e+00,-1.611765e-13
3,2023,1,4,8.660254e-01,-5.000000e-01
4,2023,1,5,-5.000000e-01,8.660254e-01
...,...,...,...,...,...
1091,2025,12,27,-6.359136e-11,1.000000e+00
1092,2025,12,28,-4.654404e-11,1.000000e+00
1093,2025,12,29,-8.770438e-11,1.000000e+00
1094,2025,12,30,-7.065706e-11,1.000000e+00


#### Aligns with a specific lunar calendar period

In [23]:
# Daily calendar covering 2023-02-04 .. 2025-02-03 (both ends included).
# The `luner` spelling is kept as-is because other cells may reference these names.
luner_dates = pd.date_range(start="2023-02-04", end="2025-02-03")

# Split every date into year / month / day columns and hand the frame to the
# project helper, which adds `lunar_ymd_sin` / `lunar_ymd_cos`.
df_luner_dates = create_sinusoidal_transformation_year_month_day(
    pd.DataFrame({part: getattr(luner_dates, part) for part in ("year", "month", "day")}),
    'lunar_ymd', 'year', 'month', 'day', 12,
)
df_luner_dates

Unnamed: 0,year,month,day,lunar_ymd_sin,lunar_ymd_cos
0,2023,2,4,-8.660254e-01,-5.000000e-01
1,2023,2,5,-8.660254e-01,5.000000e-01
2,2023,2,6,-6.447061e-13,1.000000e+00
3,2023,2,7,8.660254e-01,5.000000e-01
4,2023,2,8,8.660254e-01,-5.000000e-01
...,...,...,...,...,...
726,2025,1,30,1.037450e-12,-1.000000e+00
727,2025,1,31,1.000000e+00,6.892798e-12
728,2025,2,1,3.420118e-13,-1.000000e+00
729,2025,2,2,-6.840235e-13,1.000000e+00


In [24]:
# Expand `datetime` into calendar columns; the preview shows date, year, month,
# day, hour, min, day_of_week, week_of_year and quarter being added.
# NOTE: the result is assigned back to `df_fe_result`, so re-running this cell
# feeds the already-augmented frame into the helper again — whether that is
# harmless depends on create_time_features (not visible here).
df_fe_result = create_time_features(df_fe_result, 'datetime')
df_fe_result.head()

Unnamed: 0,locationcode,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device,pressure_diff,date,year,month,day,hour,min,day_of_week,week_of_year,quarter
1084283,8,2024-01-06 06:21:19,-0.359678,1016.1,-1.787445,0.22987,-0.547503,0.0,L8,6.36,2024-01-06,2024,1,6,6,21,5,1,1
1084284,8,2024-01-06 06:22:19,-0.359678,1016.1,-1.790297,0.230381,-0.547392,0.0,L8,6.36,2024-01-06,2024,1,6,6,22,5,1,1
1084285,8,2024-01-06 06:23:19,-0.359678,1016.07,-1.793148,0.232938,-0.547171,0.0,L8,6.33,2024-01-06,2024,1,6,6,23,5,1,1
1084286,8,2024-01-06 06:24:19,-0.359678,1016.02,-1.796,0.233961,-0.546986,0.0,L8,6.28,2024-01-06,2024,1,6,6,24,5,1,1
1084287,8,2024-01-06 06:25:19,-0.359678,1016.06,-1.797426,0.236517,-0.54669,0.0,L8,6.32,2024-01-06,2024,1,6,6,25,5,1,1


In [25]:
# Attach the per-day `general_ymd_sin` / `general_ymd_cos` features to every reading.
# The left join keeps all readings. `validate` makes pandas raise MergeError if a
# (year, month, day) key occurs more than once in the calendar frame, which would
# otherwise silently duplicate readings.
df_fe_result_sinusoidal_time = pd.merge(
    df_fe_result,
    df_general_dates,
    how='left',
    on=['year', 'month', 'day'],
    validate='many_to_one',
)

In [28]:
# Build the input data: remove the identifier, raw timestamp/date, year and
# raw pressure columns.
# NOTE: the frame is reassigned to itself, so running this cell a second time
# raises KeyError (the columns are already gone) — re-run the merge cell first.
drop_cols = ['locationcode', 'datetime', 'date', 'year', 'pressure']
df_fe_result_sinusoidal_time = df_fe_result_sinusoidal_time.drop(columns=drop_cols)
df_fe_result_sinusoidal_time

Unnamed: 0,windspeed,temperature,humidity,sunlight,power,device,pressure_diff,month,day,hour,min,day_of_week,week_of_year,quarter,general_ymd_sin,general_ymd_cos
0,-0.359678,-1.787445,0.229870,-0.547503,0.00,L8,6.36,1,6,6,21,5,1,1,7.926873e-15,1.0
1,-0.359678,-1.790297,0.230381,-0.547392,0.00,L8,6.36,1,6,6,22,5,1,1,7.926873e-15,1.0
2,-0.359678,-1.793148,0.232938,-0.547171,0.00,L8,6.33,1,6,6,23,5,1,1,7.926873e-15,1.0
3,-0.359678,-1.796000,0.233961,-0.546986,0.00,L8,6.28,1,6,6,24,5,1,1,7.926873e-15,1.0
4,-0.359678,-1.797426,0.236517,-0.546690,0.00,L8,6.32,1,6,6,25,5,1,1,7.926873e-15,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116994,-0.359678,1.876697,-1.921849,0.009739,32.83,L8,-7.43,8,31,15,7,5,35,3,8.660254e-01,-0.5
116995,-0.359678,1.842480,-1.891680,0.043475,40.63,L8,-7.41,8,31,15,8,5,35,3,8.660254e-01,-0.5
116996,-0.359678,1.818242,-1.857420,0.035346,38.16,L8,-7.38,8,31,15,9,5,35,3,8.660254e-01,-0.5
116997,-0.359678,1.762638,-1.822649,0.054744,43.10,L8,-7.38,8,31,15,10,5,35,3,8.660254e-01,-0.5


## Check before splitting

In [29]:
# Summary statistics of the numeric columns, as a sanity check before splitting.
df_fe_result_sinusoidal_time.describe()

Unnamed: 0,windspeed,temperature,humidity,sunlight,power,pressure_diff,month,day,hour,min,day_of_week,week_of_year,quarter,general_ymd_sin,general_ymd_cos
count,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0,116999.0
mean,1.399235e-16,-2.720734e-16,4.654398e-16,4.275439e-17,126.108926,0.002692,4.26366,15.371884,11.237429,29.544398,3.034265,16.768143,1.748818,-0.003641779,0.279173
std,1.000004,1.000004,1.000004,1.000004,391.083753,6.649853,2.218328,8.576396,3.803584,17.25037,2.041543,9.555967,0.76026,0.6003359,0.749436
min,-0.3596783,-2.055484,-3.324454,-0.5476509,0.0,-14.92,1.0,1.0,4.0,0.0,0.0,1.0,1.0,-0.8660254,-0.5
25%,-0.3596783,-0.8079645,-0.5892968,-0.4624073,0.75,-5.21,2.0,8.0,8.0,15.0,1.0,8.0,1.0,-7.98728e-12,-0.5
50%,-0.3596783,0.01896262,0.09743282,-0.3312351,5.54,-1.09,5.0,15.0,11.0,30.0,3.0,18.0,2.0,4.824914e-13,1.0
75%,-0.3596783,0.660544,0.8741582,-0.07400761,26.21,5.62,6.0,22.0,15.0,44.0,5.0,24.0,2.0,1.751028e-11,1.0
max,17.16574,3.862748,1.204484,4.672857,2620.13,16.36,8.0,31.0,19.0,59.0,6.0,35.0,3.0,0.8660254,1.0


## Train/Test split