In [1]:
import pandas as pd
import numpy as np
import os
import sys
import glob
from datetime import datetime
# Add the project root to sys.path so the `src` package can be imported
project_root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))  # /aicup-predict-energy-generation
sys.path.append(project_root_path)

from src.utils import choose_device


import matplotlib.pyplot as plt
import seaborn as sns

from src.fe_tools import create_time_features

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
# Path of the experiment config; its base name (without extension) is used
# by a later cell to locate the matching prediction-result file.
config_file = './Sean/test_5_L10_pe.json'
# Derive the name from the path itself rather than from a fixed '/'-split index,
# so it keeps working if the config lives at a different directory depth, and
# strip only the final extension so names containing dots are not truncated.
config_file_name = os.path.splitext(os.path.basename(config_file))[0]
config_file_name

'test_5_L10_pe'

In [3]:
# Load the combined sensor data, parse the timestamps, and order the rows by
# device and then by time so each device's records are chronological.
df_raw_data = (
    pd.read_csv('../data/processed_data/combined_data.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by=['device', 'datetime'])
    .reset_index(drop=True)
)

# Select the records for device 'L10' with the project helper
# (the outputs further down show only rows whose `device` is L10).
df_device = choose_device(df_raw_data, 'L10')

In [4]:
# predicted result
dir = '../pred_results/Sean/'
result_file = [file for file in os.listdir(dir) if file.startswith(config_file_name)][0]
result_file

df_result = pd.read_csv(os.path.join(dir, result_file))
df_result['datetime'] = pd.to_datetime(df_result['datetime'])

In [5]:
# Preview the first rows of the loaded prediction results.
df_result.head()

Unnamed: 0,fold,datetime,y_valid,y_pred,tae,pred_time
0,0,2024-06-30 07:24:12,12.16,14.809577,2.649577,2024-11-22 14:18:46
1,0,2024-06-30 07:25:12,11.84,13.302484,1.462484,2024-11-22 14:18:46
2,0,2024-06-30 07:26:12,11.14,11.424655,0.284655,2024-11-22 14:18:46
3,0,2024-06-30 07:27:12,10.91,10.292706,0.617294,2024-11-22 14:18:46
4,0,2024-06-30 07:28:12,10.54,10.299067,0.240933,2024-11-22 14:18:46


In [6]:
def get_target_data(df, error):
    """Return the rows of ``df`` whose ``tae`` value matches an error criterion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``tae`` column.
    error : number or two-element list/tuple
        * A single number (Python or NumPy scalar): keep rows with
          ``tae`` strictly greater than it.
        * ``[low, high]`` or ``(low, high)``: keep rows with
          ``low <= tae < high`` (half-open interval).

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.

    Raises
    ------
    ValueError
        If ``error`` is neither a single number nor a two-element list/tuple.
    """
    # NumPy scalars such as np.int64 are not instances of int, so accept them explicitly.
    if isinstance(error, (int, float, np.integer, np.floating)):
        return df[df['tae'] > error]

    if isinstance(error, (list, tuple)) and len(error) == 2:
        lower, upper = error
        return df[(df['tae'] >= lower) & (df['tae'] < upper)]

    raise ValueError("Error should be either a single number or a list with two elements.")

In [12]:
# Keep only the predictions whose `tae` is strictly greater than 1.
df_error = get_target_data(df_result, error=1)
df_error

Unnamed: 0,fold,datetime,y_valid,y_pred,tae,pred_time
0,0,2024-06-30 07:24:12,12.16,14.809577,2.649577,2024-11-22 14:18:46
1,0,2024-06-30 07:25:12,11.84,13.302484,1.462484,2024-11-22 14:18:46
7,0,2024-06-30 07:31:12,10.60,13.609461,3.009461,2024-11-22 14:18:46
8,0,2024-06-30 07:32:12,10.56,13.609461,3.049461,2024-11-22 14:18:46
9,0,2024-06-30 07:33:12,10.60,12.992468,2.392468,2024-11-22 14:18:46
...,...,...,...,...,...,...
14236,4,2024-07-21 16:19:27,3.28,4.404977,1.124977,2024-11-22 14:18:46
14237,4,2024-07-21 16:20:27,3.32,4.361800,1.041800,2024-11-22 14:18:46
14238,4,2024-07-21 16:21:27,3.25,4.361800,1.111800,2024-11-22 14:18:46
14240,4,2024-07-21 16:23:27,3.59,4.683103,1.093103,2024-11-22 14:18:46


In [13]:
# Attach the device's sensor readings to each high-error prediction by timestamp.
# Left join: every row of df_error is kept even if no device reading matches.
# NOTE(review): if df_device contained duplicate `datetime` values this merge would
# duplicate prediction rows — consider validate='many_to_one'; confirm against the data.
df = pd.merge(df_error, df_device, how='left', on='datetime')
df

Unnamed: 0,fold,datetime,y_valid,y_pred,tae,pred_time,windspeed,pressure,temperature,humidity,sunlight,power,device
0,0,2024-06-30 07:24:12,12.16,14.809577,2.649577,2024-11-22 14:18:46,0.00,1005.13,29.09,94.32,6253.33,12.16,L10
1,0,2024-06-30 07:25:12,11.84,13.302484,1.462484,2024-11-22 14:18:46,0.00,1005.08,29.12,96.38,5930.83,11.84,L10
2,0,2024-06-30 07:31:12,10.60,13.609461,3.009461,2024-11-22 14:18:46,0.00,1005.16,29.37,96.23,5083.33,10.60,L10
3,0,2024-06-30 07:32:12,10.56,13.609461,3.049461,2024-11-22 14:18:46,0.00,1005.20,29.42,96.11,5037.50,10.56,L10
4,0,2024-06-30 07:33:12,10.60,12.992468,2.392468,2024-11-22 14:18:46,0.00,1005.20,29.48,96.65,5001.67,10.60,L10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8368,4,2024-07-21 16:19:27,3.28,4.404977,1.124977,2024-11-22 14:18:46,2.70,1004.31,34.12,62.85,4065.00,3.28,L10
8369,4,2024-07-21 16:20:27,3.32,4.361800,1.041800,2024-11-22 14:18:46,2.35,1004.27,34.06,62.37,4070.83,3.32,L10
8370,4,2024-07-21 16:21:27,3.25,4.361800,1.111800,2024-11-22 14:18:46,1.74,1004.28,34.01,62.90,4111.67,3.25,L10
8371,4,2024-07-21 16:23:27,3.59,4.683103,1.093103,2024-11-22 14:18:46,2.79,1004.31,33.86,64.62,4355.83,3.59,L10


In [14]:
# Number of high-error rows contributed by each fold.
df['fold'].value_counts()

fold
1    1944
0    1919
4    1541
2    1516
3    1453
Name: count, dtype: int64

In [15]:
# Derive calendar features from `datetime` with the project helper; the displayed result
# gains date, year, month, day, hour, min, day_of_week, week_of_year and quarter columns.
df_time_feature = create_time_features(df, input_column='datetime')
df_time_feature

Unnamed: 0,fold,datetime,y_valid,y_pred,tae,pred_time,windspeed,pressure,temperature,humidity,...,device,date,year,month,day,hour,min,day_of_week,week_of_year,quarter
0,0,2024-06-30 07:24:12,12.16,14.809577,2.649577,2024-11-22 14:18:46,0.00,1005.13,29.09,94.32,...,L10,2024-06-30,2024,6,30,7,24,6,26,2
1,0,2024-06-30 07:25:12,11.84,13.302484,1.462484,2024-11-22 14:18:46,0.00,1005.08,29.12,96.38,...,L10,2024-06-30,2024,6,30,7,25,6,26,2
2,0,2024-06-30 07:31:12,10.60,13.609461,3.009461,2024-11-22 14:18:46,0.00,1005.16,29.37,96.23,...,L10,2024-06-30,2024,6,30,7,31,6,26,2
3,0,2024-06-30 07:32:12,10.56,13.609461,3.049461,2024-11-22 14:18:46,0.00,1005.20,29.42,96.11,...,L10,2024-06-30,2024,6,30,7,32,6,26,2
4,0,2024-06-30 07:33:12,10.60,12.992468,2.392468,2024-11-22 14:18:46,0.00,1005.20,29.48,96.65,...,L10,2024-06-30,2024,6,30,7,33,6,26,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8368,4,2024-07-21 16:19:27,3.28,4.404977,1.124977,2024-11-22 14:18:46,2.70,1004.31,34.12,62.85,...,L10,2024-07-21,2024,7,21,16,19,6,29,3
8369,4,2024-07-21 16:20:27,3.32,4.361800,1.041800,2024-11-22 14:18:46,2.35,1004.27,34.06,62.37,...,L10,2024-07-21,2024,7,21,16,20,6,29,3
8370,4,2024-07-21 16:21:27,3.25,4.361800,1.111800,2024-11-22 14:18:46,1.74,1004.28,34.01,62.90,...,L10,2024-07-21,2024,7,21,16,21,6,29,3
8371,4,2024-07-21 16:23:27,3.59,4.683103,1.093103,2024-11-22 14:18:46,2.79,1004.31,33.86,64.62,...,L10,2024-07-21,2024,7,21,16,23,6,29,3


In [16]:
# Number of high-error rows per calendar date, most frequent first.
df_time_feature['date'].value_counts()

date
2024-07-03    585
2024-07-04    576
2024-07-05    563
2024-07-12    558
2024-07-01    553
2024-07-06    521
2024-07-21    521
2024-07-13    516
2024-07-02    514
2024-07-15    503
2024-07-08    495
2024-07-07    494
2024-07-19    452
2024-07-17    402
2024-07-10    374
2024-07-16    367
2024-06-30     90
2024-07-20     90
2024-07-18     76
2024-07-14     66
2024-07-11     57
Name: count, dtype: int64

In [None]:
# Number of high-error rows per hour of day.
# NOTE(review): this cell has no execution count and its saved output (hours 10-12 only,
# 144 rows) does not match the current df_time_feature, which contains other hours
# (e.g. 7 and 16) — re-run the cell to refresh the output.
df_time_feature['hour'].value_counts()

hour
12    96
11    40
10     8
Name: count, dtype: int64

In [17]:
# Ten largest distinct sunlight readings among the high-error rows, with their frequencies.
# NOTE(review): 117758.2 occurs 816 times in the saved output — presumably the sensor's
# saturation ceiling; confirm.
(
    df_time_feature['sunlight']
    .value_counts()
    .sort_index(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,sunlight,count
0,117758.2,816
1,117752.8,1
2,117722.26,1
3,117707.88,2
4,117648.59,3
5,117596.48,1
6,117533.59,1
7,117510.23,1
8,117467.1,1
9,117398.82,1


In [18]:
# Count high-error rows per (hour, sunlight) pair and show the ten most frequent pairs.
(
    df_time_feature
    .groupby(['hour', 'sunlight'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head(10)
)

Unnamed: 0,hour,sunlight,count
4457,11,117758.2,412
4984,12,117758.2,404
3982,11,54612.5,10
699,7,3739.17,4
372,7,2533.33,3
2388,9,17853.33,3
341,7,2462.5,3
1085,8,4107.5,3
346,7,2475.0,3
1890,8,16901.67,3


In [19]:
# Add calendar features to all records of the selected device (not only the high-error rows),
# so rows can be selected by hour/date below.
# NOTE(review): despite its name, this frame is built from df_device, not df_result.
df_result_add_time_feature = create_time_features(df_device, input_column='datetime')
df_result_add_time_feature

Unnamed: 0,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device,date,year,month,day,hour,min,day_of_week,week_of_year,quarter
101673,2024-03-01 17:14:06,0.0,1017.48,15.59,94.30,652.92,0.12,L10,2024-03-01,2024,3,1,17,14,4,9,1
101674,2024-03-01 17:14:47,0.0,1017.48,15.66,94.04,682.50,0.12,L10,2024-03-01,2024,3,1,17,14,4,9,1
101675,2024-03-01 17:15:47,0.0,1017.47,15.74,94.10,750.00,0.14,L10,2024-03-01,2024,3,1,17,15,4,9,1
101676,2024-03-01 17:16:47,0.0,1017.46,15.78,94.09,738.33,0.14,L10,2024-03-01,2024,3,1,17,16,4,9,1
101677,2024-03-01 17:17:47,0.0,1017.49,15.80,94.08,660.83,0.12,L10,2024-03-01,2024,3,1,17,17,4,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193008,2024-07-22 05:15:28,0.0,1004.27,27.36,88.72,102.50,0.00,L10,2024-07-22,2024,7,22,5,15,0,30,3
193009,2024-07-22 05:16:28,0.0,1004.25,27.37,88.78,115.00,0.01,L10,2024-07-22,2024,7,22,5,16,0,30,3
193010,2024-07-22 05:17:28,0.0,1004.29,27.37,88.88,129.17,0.01,L10,2024-07-22,2024,7,22,5,17,0,30,3
193011,2024-07-22 05:18:28,0.0,1004.29,27.38,88.88,144.17,0.01,L10,2024-07-22,2024,7,22,5,18,0,30,3


In [20]:
# Hours 10, 11 and 12 (the upper bound of range() is exclusive).
# NOTE(review): no later cell visible in this notebook references hour_range.
hour_range = list(range(10, 13))
hour_range

[10, 11, 12]

In [21]:
# Keep the device records whose hour is 12 or earlier, with only the columns needed
# for the per-date sunlight inspection.
# NOTE(review): the filter is `hour <= 12`, not membership in hour_range ([10, 11, 12]) — confirm intent.
up_to_noon = df_result_add_time_feature['hour'] <= 12
df_sun_hour = df_result_add_time_feature.loc[up_to_noon, ['datetime', 'hour', 'sunlight', 'date']]

In [22]:
# Display the selected records in chronological order (df_sun_hour itself is not modified).
df_sun_hour.sort_values(by='datetime')

Unnamed: 0,datetime,hour,sunlight,date
101725,2024-03-02 06:06:47,6,22.50,2024-03-02
101726,2024-03-02 06:07:47,6,28.33,2024-03-02
101727,2024-03-02 06:08:47,6,34.17,2024-03-02
101728,2024-03-02 06:09:47,6,35.83,2024-03-02
101729,2024-03-02 06:10:47,6,37.50,2024-03-02
...,...,...,...,...
193008,2024-07-22 05:15:28,5,102.50,2024-07-22
193009,2024-07-22 05:16:28,5,115.00,2024-07-22
193010,2024-07-22 05:17:28,5,129.17,2024-07-22
193011,2024-07-22 05:18:28,5,144.17,2024-07-22


In [23]:
# Records per date: the data is inhomogeneous, with counts varying significantly across dates.
df_sun_hour['date'].value_counts().reset_index()

Unnamed: 0,date,count
0,2024-06-05,488
1,2024-05-30,487
2,2024-06-09,487
3,2024-05-31,486
4,2024-06-08,486
...,...,...
128,2024-03-07,172
129,2024-06-04,96
130,2024-05-01,18
131,2024-07-22,14


In [24]:
# Build the list of dates (as strings) to use for the sunlight simulation.
# Observation: record counts per date show a big gap below roughly 400 records.
# NOTE(review): the original note said dates with more than 400 records are selected, but the
# filter below is `count > 1`; the lowest count shown in the per-date table is 14, so this
# effectively keeps every date listed there. Confirm which threshold is intended.
date_for_sunlight_sim = df_sun_hour.date.value_counts().reset_index().query("count > 1").date.astype(str).tolist()

In [25]:
# Keep every device record (all hours) whose calendar date is in the selected date list.
record_dates = df_device['datetime'].dt.date.astype(str)
is_selected_date = record_dates.isin(date_for_sunlight_sim)
df_sunlight_sim = df_device[is_selected_date]
df_sunlight_sim

Unnamed: 0,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device
101725,2024-03-02 06:06:47,0.0,1020.13,14.18,94.37,22.50,0.00,L10
101726,2024-03-02 06:07:47,0.0,1020.10,14.19,94.27,28.33,0.00,L10
101727,2024-03-02 06:08:47,0.0,1020.07,14.19,94.35,34.17,0.00,L10
101728,2024-03-02 06:09:47,0.0,1020.07,14.19,94.42,35.83,0.00,L10
101729,2024-03-02 06:10:47,0.0,1020.04,14.20,94.46,37.50,0.00,L10
...,...,...,...,...,...,...,...,...
193008,2024-07-22 05:15:28,0.0,1004.27,27.36,88.72,102.50,0.00,L10
193009,2024-07-22 05:16:28,0.0,1004.25,27.37,88.78,115.00,0.01,L10
193010,2024-07-22 05:17:28,0.0,1004.29,27.37,88.88,129.17,0.01,L10
193011,2024-07-22 05:18:28,0.0,1004.29,27.38,88.88,144.17,0.01,L10


In [26]:
# Persist the selected records for the sunlight simulation; the index is not written.
df_sunlight_sim.to_csv('../data/processed_data/sunlight_simulation_data.csv', index=False)