#### 라이브러리 불러오기

In [47]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from model_evaluation import regression_evaluation, f_importances

In [2]:
# Root directory of all datasets: the `data` folder under the current working directory.
_PATH_BASE = os.path.abspath('data')

#### 자동차 사고데이터

In [3]:
# Locations of the pre-split car-accident CSV files (train / test / validation).
_PATH_CAR = os.path.join(_PATH_BASE, '32-car-accident', 'split-car')
_PATH_CAR_TRAIN, _PATH_CAR_TEST, _PATH_CAR_VALID = (
    os.path.join(_PATH_CAR, _name)
    for _name in ('train.csv', 'test.csv', 'validation.csv')
)

In [4]:
# Load the three car-accident splits (read in the order train, validation, test).
df_car_train, df_car_valid, df_car_test = (
    pd.read_csv(_path)
    for _path in (_PATH_CAR_TRAIN, _PATH_CAR_VALID, _PATH_CAR_TEST)
)

##### 날짜데이터 확인

In [5]:
# Print, for each car-accident split, the min/max of its `tm` column and its row count.
print(
    f'     train: {df_car_train.tm.min()} ~ {df_car_train.tm.max()}, cnt: {df_car_train.shape[0]:>4}\n'
    f'validation: {df_car_valid.tm.min()} ~ {df_car_valid.tm.max()}, {df_car_valid.shape[0]:>9}\n'
    f'      test: {df_car_test.tm.min()} ~ {df_car_test.tm.max()}, {df_car_test.shape[0]:>9}'
)
# Ok: in the recorded output the three date ranges are consecutive and do not overlap.

     train: 2013-01-01 ~ 2016-12-30, cnt: 1460
validation: 2016-12-31 ~ 2017-07-01,       183
      test: 2017-07-02 ~ 2017-12-31,       183


##### 컬럼 고유값 확인: train

In [8]:
# Count distinct values per column of the car training split and list the
# columns that hold exactly one distinct value.
car_n_unique = df_car_train.nunique()
car_n_unique[car_n_unique==1].index
# Ignore: the listed columns are constant within the training split.

Index(['stnId', 'stnNm'], dtype='object')

##### 결측치 확인

In [9]:
# Show the five columns of the car training split with the most missing values.
df_car_train.isna().sum().sort_values(ascending=False)[:5]
# Ok: the recorded output shows zero missing values even for the top five columns.

Unnamed: 0        0
avgCm20Te         0
ssDur             0
sumSsHr           0
hr1MaxIcsrHrmt    0
dtype: int64

##### 시험데이터 사망자수 분포 확인

In [10]:
# Frequency of each value of the target column in the car test split.
# NOTE(review): the recorded output contains 75 rows with a target of 0.0 —
# presumably relevant for any percentage-based error metric; confirm.
df_car_test.사망자수.value_counts()
# Ok

사망자수
0.0    75
1.0    65
2.0    26
3.0    10
4.0     6
6.0     1
Name: count, dtype: int64

#### 풍력 발전데이터

In [11]:
# Locations of the pre-split wind-power CSV files (train / test / validation).
_PATH_WIND = os.path.join(_PATH_BASE, '31-wind-power', 'split-wind-power')
_PATH_WIND_TRAIN, _PATH_WIND_TEST, _PATH_WIND_VALID = (
    os.path.join(_PATH_WIND, _name)
    for _name in ('train.csv', 'test.csv', 'validation.csv')
)

In [13]:
# Load the three wind-power splits (read in the order train, validation, test).
df_wind_train, df_wind_valid, df_wind_test = (
    pd.read_csv(_path)
    for _path in (_PATH_WIND_TRAIN, _PATH_WIND_VALID, _PATH_WIND_TEST)
)

##### 날짜데이터 확인

In [14]:
# Print, for each wind-power split, the min/max of its `tm` column and its row count.
print(
    f'     train: {df_wind_train.tm.min()} ~ {df_wind_train.tm.max()}, cnt: {df_wind_train.shape[0]:>4}\n'
    f'validation: {df_wind_valid.tm.min()} ~ {df_wind_valid.tm.max()}, {df_wind_valid.shape[0]:>9}\n'
    f'      test: {df_wind_test.tm.min()} ~ {df_wind_test.tm.max()}, {df_wind_test.shape[0]:>9}'
)
# Ok: in the recorded output the three date ranges are consecutive and do not overlap.

     train: 2019-01-01 ~ 2021-03-08, cnt:  798
validation: 2021-03-09 ~ 2021-06-16,       100
      test: 2021-06-17 ~ 2021-09-24,       100


##### 훈련데이터 컬럼 고유값 확인

In [15]:
# Count distinct values per column of the wind training split and list the
# columns that hold exactly one distinct value.
wind_n_unique = df_wind_train.nunique()
wind_n_unique[wind_n_unique==1].index
# Ignore: the listed columns are constant within the training split.

Index(['stnId', 'stnNm', '발전기명(WTG)', '발전기(Serial)', '발전용량(kW)', '위치'], dtype='object')

##### 결측치 확인

In [None]:
# Replace every missing value in the wind training split with an empty string.
# NOTE(review): filling a numeric column that contains NaN with '' presumably
# turns it into an object column, and model_test keeps only numeric columns —
# confirm that no feature is silently lost this way.
# NOTE(review): only the training split is filled; validation/test are untouched.
df_wind_train = df_wind_train.fillna('')

In [34]:
# Show the five columns of the wind training split with the most missing values.
# NOTE(review): if this runs after the fillna('') cell, zero counts are expected
# by construction and say nothing about the raw data — confirm execution order.
df_wind_train.isna().sum().sort_values(ascending=False)[:5]
# Ok

stnId         0
일자_tm         0
hr1MaxIcsr    0
sumGsr        0
avgTca        0
dtype: int64

#### 태양광 발전량데이터

In [17]:
# Locations of the pre-split solar-power CSV files (train / test / validation).
_PATH_SUN = os.path.join(_PATH_BASE, '33-sun-power', 'split-sun-power')
_PATH_SUN_TRAIN, _PATH_SUN_TEST, _PATH_SUN_VALID = (
    os.path.join(_PATH_SUN, _name)
    for _name in ('train.csv', 'test.csv', 'validation.csv')
)

In [18]:
# Load the three solar-power splits (read in the order train, validation, test).
df_sun_train, df_sun_valid, df_sun_test = (
    pd.read_csv(_path)
    for _path in (_PATH_SUN_TRAIN, _PATH_SUN_VALID, _PATH_SUN_TEST)
)

##### 날짜데이터 확인

In [19]:
# Print, for each solar-power split, the min/max of its `tm` column and its row count.
# In the recorded output the three date ranges are consecutive and do not overlap.
print(
    f'     train: {df_sun_train.tm.min()} ~ {df_sun_train.tm.max()}, cnt: {df_sun_train.shape[0]:>4}\n'
    f'validation: {df_sun_valid.tm.min()} ~ {df_sun_valid.tm.max()}, {df_sun_valid.shape[0]:>9}\n'
    f'      test: {df_sun_test.tm.min()} ~ {df_sun_test.tm.max()}, {df_sun_test.shape[0]:>9}'
)

     train: 2017-01-01 ~ 2022-01-13, cnt: 1839
validation: 2022-01-14 ~ 2022-07-08,       176
      test: 2022-07-09 ~ 2022-12-31,       176


In [20]:
# Count distinct values per column of the solar training split and list the
# columns that hold exactly one distinct value (constant within the split).
sun_n_unique = df_sun_train.nunique()
sun_n_unique[sun_n_unique==1].index

Index(['발전기명', '주소', '설비용량(MW)', 'stnId', 'stnNm'], dtype='object')

In [21]:
# Show the five columns of the solar training split with the most missing values
# (all zero in the recorded output).
df_sun_train.isna().sum().sort_values(ascending=False)[:5]

날짜                0
hr1MaxIcsrHrmt    0
avgRhm            0
avgPv             0
avgPa             0
dtype: int64

#### RandomForestRegressor 모델링

In [43]:
# Fixed random seed used when constructing the models below.
SEED = 1234

# Dataset registry: for every dataset name, its train/test/valid frames and
# the name of the column to predict.
df = {
    _name: {'train': _train, 'test': _test, 'valid': _valid, 'target_col': _target}
    for _name, (_train, _test, _valid, _target) in (
        ('car', (df_car_train, df_car_test, df_car_valid, '사망자수')),
        ('sun', (df_sun_train, df_sun_test, df_sun_valid, '일일발전량(Wh)')),
        ('wind', (df_wind_train, df_wind_test, df_wind_valid, '발전량(kWh)')),
    )
}

In [52]:
def model_test(data_type):
    """Fit a random-forest baseline on one dataset and evaluate it on the test split.

    Only the numeric columns of the training split (target excluded) are used
    as features; the test split is restricted to exactly those columns. The
    validation split is not used by this function.

    Args:
        data_type: Key of the module-level ``df`` registry
            ('car', 'sun' or 'wind').

    Returns:
        None. The test-set predictions are handed to ``regression_evaluation``
        and its return value is discarded.

    Raises:
        KeyError: If ``data_type`` is not a key of ``df``.
    """
    ds = df[data_type]
    target_col = ds['target_col']

    y_train = ds['train'][target_col]
    y_test = ds['test'][target_col]

    # Keep numeric predictors only, and give the test matrix the same columns
    # in the same order as the training matrix.
    X_train = ds['train'].drop(columns=target_col).select_dtypes(include=[np.number])
    X_test = ds['test'][X_train.columns]

    model = RandomForestRegressor(n_estimators=100, random_state=SEED)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    _ = regression_evaluation(y_test, pred)

In [53]:
# Random-forest baseline on the solar-power dataset.
model_test('sun')

R2_SCORE: 0.878049, MAPE2: 13.306492, MSE: 919127245440.068237, RMSE: 958711.241949, MAPE: 0.133065, MAE: 619118.284091


In [54]:
# Random-forest baseline on the car-accident dataset.
# NOTE(review): the recorded MAPE values are enormous and R2 is slightly negative.
# The test target contains many zeros (see the value_counts output earlier),
# which presumably breaks a percentage error — confirm against regression_evaluation.
model_test('car')

R2_SCORE: -0.031212, MAPE2: 473989071059.351929, MSE: 1.238599, RMSE: 1.112924, MAPE: 2134657003705556.750000, MAE: 0.888033


In [55]:
# Random-forest baseline on the wind-power dataset.
# NOTE(review): the recorded R2 is ~0.99996, which may indicate a feature that
# leaks the target — TODO confirm which columns reach the model.
# NOTE(review): the recorded MAPE is enormous; presumably the target has zeros — confirm.
model_test('wind')

R2_SCORE: 0.999963, MAPE2: 96889100000.646973, MSE: 2283.458447, RMSE: 47.785546, MAPE: 436349714656262.750000, MAE: 28.194266
