#### 라이브러리 불러오기

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the turbine CSV files, resolved against the current
# working directory.
_PATH_BASE = os.path.join(os.getcwd(), 'data')


def get_path(turbin):
    """Return the path of the raw CSV for turbine ``turbin`` under ``_PATH_BASE``."""
    return os.path.join(_PATH_BASE, f'{turbin}_turbin.csv')


# Load the raw measurements for turbine 'b'.
df_data = pd.read_csv(get_path('b'))

#### 하루단위가 안되는 ROW 삭제

In [3]:
# Drop the first 2 and the last 4 rows (the heading above describes them as
# rows that do not make up a whole day). An explicit copy is taken so that
# the in-place edits in later cells work on an independent frame instead of
# on a slice of the originally loaded one.
df_data = df_data.iloc[2:-4].copy()

In [4]:
# Renumber the index from 0 after the row trimming. drop=True discards the
# old labels directly instead of first inserting them as an 'index' column
# and then dropping that column again.
df_data = df_data.reset_index(drop=True)

In [5]:
# Column holding the power output; reused by the cleaning step further down.
target_col = 'ACTIVE_POWER'
# Shape of the subset of rows whose power value is exactly -1
# (presumably a missing-reading marker - confirm with the data source).
df_data.loc[df_data[target_col].eq(-1)].shape

(9353, 6)

In [6]:
# Per-column count of negative entries; the timestamp column is removed
# before the comparison.
measurements = df_data.drop(columns='TURBINE_TIME')
non_value_counts = measurements.lt(0).sum()
non_value_counts

WIND_SPEED       9353
WIND_DIR        15108
ACTIVE_POWER     9353
month               0
hour                0
dtype: int64

In [7]:
# For every row whose power output is negative, blank out wind speed, wind
# direction and the power value itself so they can be filled in afterwards.
# NOTE(review): the counts printed above show more negative WIND_DIR entries
# than negative ACTIVE_POWER entries; the extra WIND_DIR rows are left
# untouched here - confirm that negative directions are valid readings.
invalid_rows = df_data[target_col] < 0
masked_cols = ['WIND_SPEED', 'WIND_DIR', target_col]
df_data.loc[invalid_rows, masked_cols] = np.nan

In [8]:
# Fill the NaN gaps from neighbouring rows using pandas' default (linear)
# interpolation, relying on the rows being consecutive in time.
# Only numeric columns are interpolated: at this point the frame presumably
# still carries the unparsed TURBINE_TIME text column, and calling
# DataFrame.interpolate() on object-dtype data is deprecated in recent pandas.
# NOTE(review): WIND_DIR is interpolated like any other number - confirm this
# is acceptable if the direction is an angle that wraps around.
numeric_cols = df_data.select_dtypes(include='number').columns
df_data = df_data.copy()
df_data[numeric_cols] = df_data[numeric_cols].interpolate()

#### 풍향/풍속을 이용해 바람벡터 생성

In [9]:
# Take the speed/direction columns out of the frame (pop removes them).
ws = df_data.pop('WIND_SPEED')
wd = df_data.pop('WIND_DIR')

# The direction is treated as degrees and converted to radians.
wd_radian = wd * np.pi / 180

# Replace the (speed, direction) pair with the two components of the wind
# vector: speed * cos(direction) and speed * sin(direction).
for component, trig in (('ws_X', np.cos), ('ws_y', np.sin)):
    df_data[component] = ws * trig(wd_radian)

#### 시간데이터에 일간/년간 주기성 부여

In [10]:
# Period lengths in seconds, used by the sine/cosine time features below.
day = 24 * 60 * 60
year = 365.2425 * day

# Parse the timestamp column and promote it to the frame's index.
parsed_time = pd.to_datetime(df_data['TURBINE_TIME'])
df_data = df_data.assign(TURBINE_TIME=parsed_time).set_index('TURBINE_TIME')

In [12]:
# Index timestamps converted to POSIX seconds, one value per row.
timestampe_s = df_data.index.map(pd.Timestamp.timestamp)

# For each period, add a sine and a cosine column of the timestamp scaled so
# that one period corresponds to a full turn of 2*pi.
for sin_col, cos_col, period in (
    ('day_sin', 'day_cos', day),
    ('year_sin', 'year_cos', year),
):
    phase = timestampe_s * (2 * np.pi / period)
    df_data[sin_col] = np.sin(phase)
    df_data[cos_col] = np.cos(phase)

In [13]:
data_size, n_features = df_data.shape

# Row positions that cut the frame into contiguous 70% / 20% / 10% parts,
# taken in the existing row order (no shuffling).
train_end = int(data_size * 0.7)
valid_end = int(data_size * 0.9)

df_train = df_data.iloc[:train_end]
df_valid = df_data.iloc[train_end:valid_end]
df_test = df_data.iloc[valid_end:]

In [14]:
# Write each split to its own CSV file inside the data directory.
# NOTE(review): index=False omits the index from the files; if TURBINE_TIME
# is the index at this point, the timestamps are not saved - confirm that
# this is intended.
for file_name, split_frame in (
    ('b_train.csv', df_train),
    ('b_valid.csv', df_valid),
    ('b_test.csv', df_test),
):
    split_frame.to_csv(os.path.join(_PATH_BASE, file_name), index=False)