#### 라이브러리 불러오기

In [1]:
import os

import numpy as np
import pandas as pd

2023-08-30 09:58:37.735474: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### 데이터 불러오기

In [2]:
# Load the Jena climate dataset, preferring a locally cached CSV under ./data.
_PATH_BASE = os.path.join(os.getcwd(), 'data')
_PATH_DATA_FILE = os.path.join(_PATH_BASE, 'climate_2009_2016.csv')

if os.path.exists(_PATH_DATA_FILE):
    df_climate = pd.read_csv(_PATH_DATA_FILE)
else:
    # TensorFlow is only needed for this one-off download, so it is imported
    # here; without this import the branch fails with NameError on `tf`.
    import tensorflow as tf

    _WEB_FILE_PATH = tf.keras.utils.get_file(
        origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
        fname='jena_climate_2009_2016.csv.zip',
        extract=True
    )
    # NOTE(review): assumes get_file returns the path of the downloaded .zip
    # and extracts the CSV next to it, so dropping ".zip" yields the CSV path.
    # Confirm against the installed Keras version.
    _CSV_PATH, _ = os.path.splitext(_WEB_FILE_PATH)
    df_climate = pd.read_csv(_CSV_PATH)

    # The cache directory may not exist on a first run; to_csv does not
    # create missing parent directories.
    os.makedirs(_PATH_BASE, exist_ok=True)
    df_climate.to_csv(_PATH_DATA_FILE, index=False)

#### 데이터 전처리

##### 10분단위 데이터 -> 시간단위 데이터

In [3]:
# The raw data arrives at 10-minute intervals. One way to get hourly data is
# to keep a single row per hour:
#   df_climate[5::6]   (start at row 5, then take every 6th row = 60 minutes)
#
# Here each hour is instead represented by the mean of its readings.
# resample() needs the timestamps as the index, so 'Date Time' is parsed and
# promoted to the index first.
# NOTE: resample() emits a row for every hour in the range, so hours without
# any readings become all-NaN rows.
df_climate = (
    df_climate
    .assign(**{
        'Date Time': pd.to_datetime(
            df_climate['Date Time'], format='%d.%m.%Y %H:%M:%S'
        )
    })
    .set_index('Date Time')
    .resample('H')
    .mean()
)

##### 풍속/최대풍속 음수값 처리

In [4]:
# Replace negative wind speeds with 0.0 in both wind-speed columns.
#
# Series.clip is used instead of chained assignment
# (df[col][mask] = value), which raises SettingWithCopyWarning and does not
# modify the frame when pandas Copy-on-Write is enabled. An equivalent
# conditional update is:
#   df_climate.loc[df_climate['wv (m/s)'] < 0.0, 'wv (m/s)'] = 0.0
#
# clip() leaves NaN rows as NaN for both columns; the previous
# apply(lambda x: max(0, x)) turned NaN into 0 for 'max. wv (m/s)' only.
#
# NOTE(review): this runs after the hourly averaging, so a negative reading
# inside an hour has already pulled that hour's mean down. Consider clamping
# before resampling.
for _wind_col in ('wv (m/s)', 'max. wv (m/s)'):
    df_climate[_wind_col] = df_climate[_wind_col].clip(lower=0.0)

In [5]:
# Summary statistics, transposed so each column of the data is one row.
df_climate.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p (mbar),70041.0,989.214359,8.358615,934.905,984.206667,989.57,994.726667,1015.243333
T (degC),70041.0,9.44239,8.414647,-22.653333,3.358333,9.41,15.461667,37.038333
Tpot (K),70041.0,283.484834,8.495644,250.97,277.43,283.455,289.515,310.976667
Tdew (degC),70041.0,4.953446,6.727769,-24.601667,0.233333,5.215,10.068333,23.016667
rh (%),70041.0,76.028671,16.385638,13.683333,65.308333,79.266667,89.35,100.0
VPmax (mbar),70041.0,13.568619,7.725238,0.98,7.776667,11.818333,17.598333,62.943333
VPact (mbar),70041.0,9.532337,4.180956,0.821667,6.213333,8.861667,12.351667,28.168333
VPdef (mbar),70041.0,4.036199,4.874833,0.0,0.878333,2.195,5.29,45.195
sh (g/kg),70041.0,6.021505,2.654106,0.516667,3.918333,5.595,7.801667,18.025
H2OC (mmol/mol),70041.0,9.638782,4.232169,0.828333,6.283333,8.963333,12.481667,28.661667


##### 풍향/풍속을 이용해 바람벡터 생성

In [6]:
# (rows, columns) before the wind columns are replaced by vector components.
df_climate.shape

(70129, 14)

In [7]:
# Remove the scalar wind columns from the frame; they are replaced below by
# X/Y vector components.
_wv = df_climate.pop('wv (m/s)')
_max_wv = df_climate.pop('max. wv (m/s)')

# Wind direction converted from degrees to radians.
_wd_radian = df_climate.pop('wd (deg)') * np.pi / 180

# Direction components are shared by the mean and the max wind vectors.
_dir_cos = np.cos(_wd_radian)
_dir_sin = np.sin(_wd_radian)

# Build the wind vectors: speed * cos(direction) for X, speed * sin for Y.
for _vec_col, _speed, _direction in (
    ('Vwv_X', _wv, _dir_cos),
    ('Vwv_Y', _wv, _dir_sin),
    ('Vmwv_X', _max_wv, _dir_cos),
    ('Vmwv_Y', _max_wv, _dir_sin),
):
    df_climate[_vec_col] = _speed * _direction

In [8]:
# Three columns were removed and four were added.
df_climate.shape

(70129, 15)

In [9]:
# Print the index range, column dtypes and non-null counts.
df_climate.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 70129 entries, 2009-01-01 00:00:00 to 2017-01-01 00:00:00
Freq: H
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   p (mbar)         70041 non-null  float64
 1   T (degC)         70041 non-null  float64
 2   Tpot (K)         70041 non-null  float64
 3   Tdew (degC)      70041 non-null  float64
 4   rh (%)           70041 non-null  float64
 5   VPmax (mbar)     70041 non-null  float64
 6   VPact (mbar)     70041 non-null  float64
 7   VPdef (mbar)     70041 non-null  float64
 8   sh (g/kg)        70041 non-null  float64
 9   H2OC (mmol/mol)  70041 non-null  float64
 10  rho (g/m**3)     70041 non-null  float64
 11  Vwv_X            70041 non-null  float64
 12  Vwv_Y            70041 non-null  float64
 13  Vmwv_X           70041 non-null  float64
 14  Vmwv_Y           70041 non-null  float64
dtypes: float64(15)
memory usage: 8.6 MB


##### 시간데이터에 일간/년간 주기성 부여

In [10]:
# A raw timestamp in seconds is not a useful model input on its own, so the
# time is encoded as sine/cosine signals with a daily and a yearly period.
_day = 24 * 60 * 60
_year = (365.2425) * _day
_timestamp_s = df_climate.index.map(pd.Timestamp.timestamp)

for _sin_col, _cos_col, _period in (
    ('day_sin', 'day_cos', _day),
    ('year_sin', 'year_cos', _year),
):
    # Phase angle of each timestamp within the given period.
    _phase = _timestamp_s * (2 * np.pi / _period)
    df_climate[_sin_col] = np.sin(_phase)
    df_climate[_cos_col] = np.cos(_phase)

##### 데이터 분할

In [14]:
# Split in row order into 70% train, 20% validation and 10% test.
_data_size, _n_features = df_climate.shape

_train_end = int(_data_size * 0.7)
_valid_end = int(_data_size * 0.9)

df_train = df_climate.iloc[:_train_end]
df_valid = df_climate.iloc[_train_end:_valid_end]
df_test = df_climate.iloc[_valid_end:]

##### 정규화

In [15]:
# Standardise every split with the mean and standard deviation of the
# training split only.
_data_mean = df_train.mean()
_data_std = df_train.std()

df_train, df_valid, df_test = (
    (_split - _data_mean) / _data_std
    for _split in (df_train, df_valid, df_test)
)

##### 전처리 데이터 저장

In [16]:
# Write each normalised split to its own CSV under the data directory.
# index=False means the datetime index is not written to the files.
for _csv_name, _split in (
    ('climate_train.csv', df_train),
    ('climate_valid.csv', df_valid),
    ('climate_test.csv', df_test),
):
    _split.to_csv(os.path.join(_PATH_BASE, _csv_name), index=False)