In [1]:
import pandas as pd
import os
import sys

# Make the `src` package importable: put the project root (the parent of the
# current working directory) on sys.path, e.g. /aicup-predict-energy-generation
project_root_path = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not add duplicate entries.
if project_root_path not in sys.path:
    sys.path.append(project_root_path)

from src.utils import choose_device
from src.feature_engineering import create_time_features

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the combined CSV, convert `datetime` to real timestamps, then order the
# rows by device and time and renumber the index from zero.
df_raw_data = (
    pd.read_csv('../data/processed_data/combined_data.csv')
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by=['device', 'datetime'])
    .reset_index(drop=True)
)

In [4]:
# Device to analyse. Change this single value to re-run the notebook for
# another device.
DEVICE_ID = 'L8'
df_device = choose_device(df_raw_data, DEVICE_ID)

## Preprocessing

In [5]:
# Show the selected device's rows to check the available columns before preprocessing.
df_device

Unnamed: 0,locationcode,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device
1084283,8,2024-01-06 06:21:19,0.0,1016.10,14.67,80.94,23.33,0.00,L8
1084284,8,2024-01-06 06:22:19,0.0,1016.10,14.65,80.95,25.83,0.00,L8
1084285,8,2024-01-06 06:23:19,0.0,1016.07,14.63,81.00,30.83,0.00,L8
1084286,8,2024-01-06 06:24:19,0.0,1016.02,14.61,81.02,35.00,0.00,L8
1084287,8,2024-01-06 06:25:19,0.0,1016.06,14.60,81.07,41.67,0.00,L8
...,...,...,...,...,...,...,...,...,...
1201277,8,2024-08-31 15:07:16,0.0,1002.31,40.37,38.86,12590.83,32.83,L8
1201278,8,2024-08-31 15:08:16,0.0,1002.33,40.13,39.45,13351.67,40.63,L8
1201279,8,2024-08-31 15:09:16,0.0,1002.36,39.96,40.12,13168.33,38.16,L8
1201280,8,2024-08-31 15:10:16,0.0,1002.36,39.57,40.80,13605.83,43.10,L8


In [17]:
def calculate_pressure_diff(df, column='pressure'):
    """
    Return a copy of the DataFrame with an added ``<column>_diff`` column that
    holds each value's difference from the column's mean, rounded to two
    decimals.

    The input DataFrame is not modified, so the function can be called on a
    slice of another frame and re-run without leaving stale columns behind.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    column (str): The column for which to calculate the difference from the
        mean. Defaults to 'pressure'.

    Returns:
    pd.DataFrame: A new DataFrame with the added ``f'{column}_diff'`` column.

    Raises:
    KeyError: If ``column`` is not present in ``df``.
    """
    mean_value = df[column].mean()
    # Work on a copy so the caller's frame (possibly a slice) is never mutated.
    result = df.copy()
    # Vectorized subtraction and rounding instead of a per-element lambda.
    result[f'{column}_diff'] = (df[column] - mean_value).round(2)
    return result

In [19]:
# Add the `pressure_diff` column (difference from the mean pressure) and show the result.
# NOTE(review): the saved output also lists a misspelled `preesure_diff` column that no
# cell in view creates. It is presumably left over from an earlier run; confirm with
# Restart & Run All.
df_device = calculate_pressure_diff(df_device, column='pressure')
df_device

Unnamed: 0,locationcode,datetime,windspeed,pressure,temperature,humidity,sunlight,power,device,preesure_diff,pressure_diff
1084283,8,2024-01-06 06:21:19,0.0,1016.10,14.67,80.94,23.33,0.00,L8,6.36,6.36
1084284,8,2024-01-06 06:22:19,0.0,1016.10,14.65,80.95,25.83,0.00,L8,6.36,6.36
1084285,8,2024-01-06 06:23:19,0.0,1016.07,14.63,81.00,30.83,0.00,L8,6.33,6.33
1084286,8,2024-01-06 06:24:19,0.0,1016.02,14.61,81.02,35.00,0.00,L8,6.28,6.28
1084287,8,2024-01-06 06:25:19,0.0,1016.06,14.60,81.07,41.67,0.00,L8,6.32,6.32
...,...,...,...,...,...,...,...,...,...,...,...
1201277,8,2024-08-31 15:07:16,0.0,1002.31,40.37,38.86,12590.83,32.83,L8,-7.43,-7.43
1201278,8,2024-08-31 15:08:16,0.0,1002.33,40.13,39.45,13351.67,40.63,L8,-7.41,-7.41
1201279,8,2024-08-31 15:09:16,0.0,1002.36,39.96,40.12,13168.33,38.16,L8,-7.38,-7.38
1201280,8,2024-08-31 15:10:16,0.0,1002.36,39.57,40.80,13605.83,43.10,L8,-7.38,-7.38
