In [2]:
import os
import torch
import sys
import sktime
import tqdm as tq
import xgboost as xgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

pd.set_option('display.max_columns', 30)

  from pandas import MultiIndex, Int64Index


GPU 포트 바꾸기

In [3]:
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('device :', device)
print('Current :', torch.cuda.current_device())
print('Count :', torch.cuda.device_count())

device : cuda
Current : 0
Count : 1


In [4]:
# 전처리 한 파일 불러오기
train = pd.read_csv('./data/xgboost/train_preprocessed_incsolor.csv', index_col=0)
train.head()

Unnamed: 0,num,date_time,power,prec,wind,hum,temp,day,week,day_hour_mean,day_hour_std,holiday,sin_time,cos_time,THI,CDH
0,1,2022-06-01 00:00:00,1085.28,0.0,0.9,42.0,18.6,2,22,1774.744615,517.982222,1,0.0,1.0,49.6576,-5.4
1,1,2022-06-01 01:00:00,1047.36,0.0,1.1,45.0,18.0,2,22,1687.347692,500.769931,1,0.258819,0.965926,47.7625,-11.4
2,1,2022-06-01 02:00:00,974.88,0.0,1.5,45.0,17.7,2,22,1571.483077,465.227458,1,0.5,0.866025,47.2225,-17.7
3,1,2022-06-01 03:00:00,953.76,0.0,1.4,48.0,16.7,2,22,1522.153846,436.601091,1,0.707107,0.707107,44.7856,-25.0
4,1,2022-06-01 04:00:00,986.4,0.0,2.8,43.0,18.4,2,22,1506.793846,405.518091,1,0.866025,0.5,49.0061,-30.6


In [None]:
test = pd.read_csv('./data/xgboost/test_preprocessed_incsolor.csv', index_col = 0)
test.head()

In [5]:
# Define SMAPE loss function
def SMAPE(true, pred):
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 100

### 파라미터 튜닝

eta(learning rate) : 0.01 로 했을 때보다 0.05로 했을 때 결과가 좋게 나옴  
최적의 eta 값을 찾기 위해 바꿔가며 학습

In [6]:
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import make_scorer
smape = make_scorer(SMAPE, greater_is_better = False)

df = pd.DataFrame(columns = ['n_estimators', 'eta', 'min_child_weight','max_depth', 'colsample_bytree', 'subsample'])
# df = pd.read_csv('./data/xgboost/hyperparameter_xgb_solar.csv')
preds = np.array([])

grid = {'n_estimators' : [100], 'eta' : [0.07], 'min_child_weight' : np.arange(1, 8, 1),
        'max_depth' : np.arange(3,9,1) , 'colsample_bytree' :np.arange(0.8, 1.0, 0.1),
        'subsample' :np.arange(0.8, 1.0, 0.1)} # fix the eta(learning rate)

# 건물 번호별로 GridSearch로 parameter 생성
for i in tqdm(np.arange(1, 101)):
    y = train.loc[train.num == i, 'power']
    x = train.loc[train.num == i, ].iloc[:, 3:]
    # 마지막 일주일 발전량을 validset으로 24시간*7일 = 168
    y_train, y_test, x_train, x_test = temporal_train_test_split(y = y, X = x, test_size = 168)


    # pds = PredefinedSplit(np.append(-np.ones(len(x)-168), np.zeros(168)))
    gcv = GridSearchCV(estimator = XGBRegressor(seed = 10, gpu_id = 0,
                                                tree_method = 'gpu_hist', predictor= 'gpu_predictor'),
                       param_grid = grid, scoring = smape, cv = 10, refit = True, verbose = True)


    gcv.fit(x, y)
    best = gcv.best_estimator_
    params = gcv.best_params_
    print(params)
    pred = best.predict(x_test)
    building = 'building'+str(i)
    print(building + '|| SMAPE : {}'.format(SMAPE(y_test, pred)))
    preds = np.append(preds, pred)
    df = pd.concat([df, pd.DataFrame(params, index = [0])], axis = 0)
    df.to_csv('./data/xgboost/hyperparameter_xgb_solar.csv', index = False)

  from pandas import MultiIndex, Int64Index


Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  1%|          | 1/100 [12:33<20:43:48, 753.82s/it]

{'colsample_bytree': 0.9, 'eta': 0.07, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.8}
building1|| SMAPE : 1.5148426683419873
Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  from pandas import MultiIndex, Int64Index
  2%|▏         | 2/100 [24:48<20:13:07, 742.73s/it]

{'colsample_bytree': 0.8, 'eta': 0.07, 'max_depth': 7, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.8}
building2|| SMAPE : 1.0043933693459162
Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  from pandas import MultiIndex, Int64Index
  3%|▎         | 3/100 [37:04<19:55:42, 739.61s/it]

{'colsample_bytree': 0.9, 'eta': 0.07, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.8}
building3|| SMAPE : 2.891783319865605
Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  from pandas import MultiIndex, Int64Index
  4%|▍         | 4/100 [48:30<19:09:16, 718.29s/it]

{'colsample_bytree': 0.8, 'eta': 0.07, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9}
building4|| SMAPE : 1.6763882841392252
Fitting 10 folds for each of 168 candidates, totalling 1680 fits


  from pandas import MultiIndex, Int64Index


In [None]:
xgb_params = pd.read_csv('./data/xgboost/hyperparameter_xgb_solar.csv')

In [None]:
xgb_params

In [None]:
train.head()

### 손실함수를 rmse 사용

weighted_mse 는 모든 파라미터 학습 후에 적용해보자

#### n_estimators 튜닝

In [None]:
# weighted_mse 쓰는것과 안쓰는 것 차이 확인
# weighted_mse 안쓰는 것이 더 점수 높게 나오네?? 둘다 실험해보자

scores = []   # smape 값을 저장할 list
best_it = []  # best interation을 저장할 list
for i in tqdm(range(100)):
    y = train.loc[train.num == i+1, 'power']
    x = train.loc[train.num == i+1, ].iloc[:, 3:]
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)
    
    xgb_reg = XGBRegressor(n_estimators = 1000, eta = xgb_params.iloc[i, 1], min_child_weight = xgb_params.iloc[i, 2],
                           max_depth = xgb_params.iloc[i, 3], colsample_bytree = xgb_params.iloc[i, 4], 
                           subsample = xgb_params.iloc[i, 5], seed=10)
    # xgb_reg.set_params(**{'objective':weighted_mse(100)}) # alpha = 100으로 고정
    
    xgb_reg.fit(x_train, y_train, eval_set=[(x_train, y_train), 
                                            (x_valid, y_valid)], early_stopping_rounds=300, verbose=False)
    y_pred = xgb_reg.predict(x_valid)
    pred = pd.Series(y_pred)   
    
    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb_reg.best_iteration) ## 실제 best iteration은 이 값에 +1 해주어야 함.
    print("building {} || best iter : {} || smape : {}".format(i+1, xgb_reg.best_iteration, sm))

In [None]:
# 튜닝한 n_estimators 적용시켜서 smape 값 추출
smape_list = []
for i in tqdm(range(100)):
    y = train.loc[train.num == i+1, 'power']
    x = train.loc[train.num == i+1, ].iloc[:, 3:]
    y_train, y_test, x_train, x_test = temporal_train_test_split(y = y, X = x, test_size = 168)
    xgb = XGBRegressor(seed = 10,
                      n_estimators = best_it[i], eta = xgb_params.iloc[i, 1], min_child_weight = xgb_params.iloc[i, 2],
                      max_depth = xgb_params.iloc[i, 3], colsample_bytree = xgb_params.iloc[i, 4], subsample = xgb_params.iloc[i, 5], eval_metric = 'rmse')
    
    xgb.fit(x_train, y_train)
    pred0 = xgb.predict(x_test)
    score0 = SMAPE(y_test,pred0)

    smape_list.append(score0)
    print("building {} || best score : {}".format(i+1, score0))

In [None]:
no_df = pd.DataFrame({'score':smape_list})
plt.bar(np.arange(len(no_df))+1, no_df['score'])
plt.plot([1,100], [5, 5], color = 'red')

In [None]:
xgb_params['best_it'] = best_it
xgb_params.to_csv('./data/xgboost/hyperparameter_xgb_solar_final.csv', index=False)

In [None]:
## best hyperparameters 불러오기
xgb_params = pd.read_csv('./data/xgboost/hyperparameter_xgb_solar_final.csv')

## test 전처리 데이터 불러오기

In [None]:
test = pd.read_csv('./data/xgboost/test_preprocessed_incsolor.csv', index_col = 0)

## 예측

In [None]:
preds = np.array([]) 
for i in tqdm(range(100)):
    
    y_train = train.loc[train.num == i+1, 'power']
    x_train, x_test = train.loc[train.num == i+1, ].iloc[:, 3:], test.loc[test.num == i+1, ].iloc[:,2:]
    x_test = x_test[x_train.columns]
    
    xgb = XGBRegressor(seed = 10, n_estimators = xgb_params.iloc[i, -1], eta = xgb_params.iloc[i, 1], 
                        min_child_weight = xgb_params.iloc[i, 2], max_depth = xgb_params.iloc[i, 3], 
                        colsample_bytree=xgb_params.iloc[i, 4], subsample=xgb_params.iloc[i, 5])

    # if xgb_params.iloc[i,6] != 0:  # 만약 alpha가 0이 아니면 weighted_mse 사용
    #     xgb.set_params(**{'objective':weighted_mse(xgb_params.iloc[i,6])})
    
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    preds = np.append(preds, y_pred)   

In [None]:
# 예측 결과 시각화

preds = pd.Series(preds)

fig, ax = plt.subplots(100, 1, figsize=(100,200), sharex = True)
ax = ax.flatten()
for i in range(100):
    train_y = train.loc[train.num == i+1, 'power'].reset_index(drop = True)
    test_y = preds[i*168:(i+1)*168]
    ax[i].scatter(np.arange(2040) , train.loc[train.num == i+1, 'power'])
    ax[i].scatter(np.arange(2040, 2040+168) , test_y)
    ax[i].tick_params(axis='both', which='major', labelsize=6)
    ax[i].tick_params(axis='both', which='minor', labelsize=4)
#plt.savefig('./predict_xgb.png')
plt.show()

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['answer'] = preds
submission.to_csv('./data/xgboost/submission_xgb_solor.csv', index = False)

### 손실함수를 임의로 생성한 weighted_mse 사용
이 함수를 이용하는 것이 rmse 를 이용하는 것보다 smape 평가지표상에서 더 좋음

In [None]:
xgb_params = pd.read_csv('./data/xgboost/hyperparameter_xgb_solar.csv')

In [None]:
#### alpha를 argument로 받는 함수로 실제 objective function을 wrapping하여 alpha값을 쉽게 조정할 수 있도록 작성했습니다.
# custom objective function for forcing model not to underestimate
def weighted_mse(alpha = 1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        grad = np.where(residual>0, -2*alpha*residual, -2*residual)
        hess = np.where(residual>0, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

In [None]:
scores = []   # smape 값을 저장할 list
best_it = []  # best interation을 저장할 list
for i in tqdm(range(100)):
    y = train.loc[train.num == i+1, 'power']
    x = train.loc[train.num == i+1, ].iloc[:, 3:]
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)
    
    xgb_reg = XGBRegressor(n_estimators = 10000, eta = 0.01, min_child_weight = xgb_params.iloc[i, 2],
                           max_depth = xgb_params.iloc[i, 3], colsample_bytree = xgb_params.iloc[i, 4], 
                           subsample = xgb_params.iloc[i, 5], seed=10)
    xgb_reg.set_params(**{'objective':weighted_mse(100)}) # alpha = 100으로 고정
    
    xgb_reg.fit(x_train, y_train, eval_set=[(x_train, y_train), 
                                            (x_valid, y_valid)], early_stopping_rounds=300, verbose=False)
    y_pred = xgb_reg.predict(x_valid)
    pred = pd.Series(y_pred)   
    
    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb_reg.best_iteration) ## 실제 best iteration은 이 값에 +1 해주어야 함.

In [None]:
alpha_list = []
smape_list = []
for i in tqdm(range(100)):
    y = train.loc[train.num == i+1, 'power']
    x = train.loc[train.num == i+1, ].iloc[:, 3:]
    y_train, y_test, x_train, x_test = temporal_train_test_split(y = y, X = x, test_size = 168)
    xgb = XGBRegressor(seed = 10,
                      n_estimators = best_it[i], eta = 0.01, min_child_weight = xgb_params.iloc[i, 2],
                      max_depth = xgb_params.iloc[i, 3], colsample_bytree = xgb_params.iloc[i, 4], subsample = xgb_params.iloc[i, 5])
    
    xgb.fit(x_train, y_train)
    pred0 = xgb.predict(x_test)
    best_alpha = 0
    score0 = SMAPE(y_test,pred0)
    
    for j in range(1, 100, 2):
        xgb = XGBRegressor(seed = 10,
                      n_estimators = best_it[i], eta = 0.01, min_child_weight = xgb_params.iloc[i, 2],
                      max_depth = xgb_params.iloc[i, 3], colsample_bytree = xgb_params.iloc[i, 4], subsample = xgb_params.iloc[i, 5])
        xgb.set_params(**{'objective' : weighted_mse(j)})
    
        xgb.fit(x_train, y_train)
        pred1 = xgb.predict(x_test)
        score1 = SMAPE(y_test, pred1)
        if score1 < score0:
            best_alpha = j
            score0 = score1
    
    alpha_list.append(best_alpha)
    smape_list.append(score0)
    print("building {} || best score : {} || alpha : {}".format(i+1, score0, best_alpha))

In [None]:
xgb_params['alpha'] = alpha_list
xgb_params['best_it'] = best_it

예측

In [None]:
preds = np.array([]) 
for i in tqdm(range(100)):
    
    y_train = train.loc[train.num == i+1, 'power']
    x_train, x_test = train.loc[train.num == i+1, ].iloc[:, 3:], test.loc[test.num == i+1, ].iloc[:,2:]
    x_test = x_test[x_train.columns]
    
    xgb = XGBRegressor(seed = 10, n_estimators = xgb_params.iloc[i, -1], eta = xgb_params.iloc[i, 1], 
                        min_child_weight = xgb_params.iloc[i, 2], max_depth = xgb_params.iloc[i, 3], 
                        colsample_bytree=xgb_params.iloc[i, 4], subsample=xgb_params.iloc[i, 5])

    if xgb_params.iloc[i,6] != 0:  # 만약 alpha가 0이 아니면 weighted_mse 사용
        xgb.set_params(**{'objective':weighted_mse(xgb_params.iloc[i,6])})
    
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    preds = np.append(preds, y_pred)   

In [None]:
# 예측 결과 시각화

preds = pd.Series(preds)

fig, ax = plt.subplots(100, 1, figsize=(100,200), sharex = True)
ax = ax.flatten()
for i in range(100):
    train_y = train.loc[train.num == i+1, 'power'].reset_index(drop = True)
    test_y = preds[i*168:(i+1)*168]
    ax[i].scatter(np.arange(2040) , train.loc[train.num == i+1, 'power'])
    ax[i].scatter(np.arange(2040, 2040+168) , test_y)
    ax[i].tick_params(axis='both', which='major', labelsize=6)
    ax[i].tick_params(axis='both', which='minor', labelsize=4)
#plt.savefig('./predict_xgb.png')
plt.show()

In [None]:
submission = pd.read_csv('./data/sample_submission.csv')
submission['answer'] = preds
submission.to_csv('./data/xgboost/submission_xgb_solor_wmse.csv', index = False)