<a href="https://colab.research.google.com/github/dlfrnaos19/aiffel/blob/main/modeling_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Load

In [None]:
# Mount Google Drive so the files stored under MyDrive are reachable
# from the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4


In [None]:
# Library imports and global notebook setup
import os  # used below to change the working directory
import copy

import numpy as np  # NumPy
import pandas as pd  # pandas
import matplotlib.pyplot as plt
# NOTE(review): recent matplotlib releases renamed the bundled seaborn
# styles -- confirm 'seaborn' still resolves in the target environment.
plt.style.use('seaborn')
import seaborn as sns
%matplotlib inline

import statsmodels.api as sm
import scipy.stats
from scipy.stats import skew
from scipy.stats import spearmanr

import warnings
# Silences every warning for the rest of the notebook.
warnings.filterwarnings('ignore')
from tqdm import tqdm

# Learning algorithms
import sklearn
# NOTE(review): wildcard import pulls every public linear model into the
# namespace; none of the visible cells appear to use them.
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.cluster import KMeans

import lightgbm as lgb
from lightgbm import LGBMRegressor
import catboost
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Model validation
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV  # exhaustive search over a parameter grid
from sklearn.metrics import make_scorer  # wraps a custom loss function as a scorer

# All relative paths below (CSV inputs, submission outputs) resolve
# against this Drive folder.
os.chdir('/content/drive/MyDrive/Colab Notebooks/dacon/energy') 

In [None]:
# Load the competition files from the working directory.
# All three CSVs are read with the EUC-KR encoding.
CSV_ENCODING = 'euc-kr'
train_df, test_df, submission = (
    pd.read_csv(csv_name, encoding=CSV_ENCODING)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Replace the original (Korean) CSV headers with short English names.
# Train headers, in order: num, date_time, power usage (kWh), temperature (°C),
# wind speed (m/s), humidity (%), precipitation (mm), sunshine (hr),
# non-electric cooling flag, solar panel flag.
# Test headers are the same minus the power-usage (target) column.
shared_feature_names = ['temperature', 'windspeed', 'humidity', 'precipitation',
                        'insolation', 'nelec_cool_flag', 'solar_flag']
train_df.columns = ['num', 'datetime', 'target'] + shared_feature_names
test_df.columns = ['num', 'datetime'] + shared_feature_names

# Feature Engineering

## Train Feature Engineering

In [None]:
train_df['datetime'] = pd.to_datetime(train_df['datetime'])

# 6월 제거 유무
#train_df['month'] = train_df['datetime'].dt.month # 월(숫자)
#train_df = train_df.loc[train_df['month']!=6,:].reset_index(drop=True)
#train_df = train_df.drop(columns=['month']) #쓸모없는 특징 drop

train_df['dayofyear'] = train_df['datetime'].dt.dayofyear
train_df['hour'] = train_df['datetime'].dt.hour
train_df['weekday'] = train_df['datetime'].dt.weekday #time feature

train_df['hour_te'] = np.sin(2*np.pi*(train_df['hour'])/23)     #time encoding hour
train_df['hour_te1'] = np.cos(2*np.pi*(train_df['hour'])/23)    #time encoding hour

# 체감온도
train_df['더위체감지수']=13.12+0.6215*train_df['temperature']-13.947*train_df['windspeed']**0.16+0.486*train_df['temperature']*train_df['windspeed']**0.16
train_df['더위체감지수']=pd.cut(train_df['더위체감지수'], bins=[0, 21, 25, 28, 31, 50], labels=[1,2,3,4,5])

# 불쾌지수
t = 9/5*train_df['temperature']
train_df['불쾌지수'] = t - 0.55*(1-train_df['humidity']/100)*(t-26)+32
train_df['불쾌지수'] = pd.cut(train_df['불쾌지수'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4]) #불쾌지수는 카테고리로 나누는게 성능상승에 도움이 됨                                 

train_dfs = []
for i in range(1,61):
    train_dfs.append(train_df[train_df['num']==i])
    
for i in range(len(train_dfs)):
    train_dfs[i] = train_dfs[i].drop(columns=['windspeed','precipitation','insolation','num',
                                              'datetime','nelec_cool_flag','solar_flag']) #쓸모없는 특징 drop

## Test Feature Engineering

In [None]:
for i in range(1,61):
    test_df[test_df['num']==i] = test_df[test_df['num']==i].interpolate() #기상예보값 interpolat

test_df['datetime'] = pd.to_datetime(test_df['datetime'])
    
test_df['dayofyear'] = test_df['datetime'].dt.dayofyear
test_df['hour'] = test_df['datetime'].dt.hour
test_df['weekday'] = test_df['datetime'].dt.weekday #time feature

test_df['hour_te'] = np.sin(2*np.pi*(test_df['hour'])/23)   #time encoding hour
test_df['hour_te1'] = np.cos(2*np.pi*(test_df['hour'])/23)  #time encoding hour

# 체감온도
test_df['더위체감지수']=13.12+0.6215*test_df['temperature']-13.947*test_df['windspeed']**0.16+0.486*test_df['temperature']*test_df['windspeed']**0.16
test_df['더위체감지수']=pd.cut(test_df['더위체감지수'], bins=[0, 21, 25, 28, 31, 50], labels=[1,2,3,4,5])

# 불쾌지수
t = 9/5*test_df['temperature']
test_df['불쾌지수'] = t - 0.55*(1-test_df['humidity']/100)*(t-26)+32
test_df['불쾌지수'] = pd.cut(test_df['불쾌지수'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4]) #불쾌지수는 카테고리로 나누는게 성능상승에 도움이 됨

test_dfs  = []
for i in range(1,61):
    test_dfs.append(test_df[test_df['num']==i])
    
for i in range(len(test_dfs)):
    test_dfs[i] = test_dfs[i].drop(columns=['windspeed','precipitation','insolation','num',
                                            'datetime','nelec_cool_flag','solar_flag']) #쓸모없는 특징 drop

In [None]:
def CDH(xs):
    """Rolling cooling-degree-hour style feature.

    For every position, sums ``xs - 26`` over a trailing window that ends at
    that position and holds at most 12 values (fewer at the start of the
    series, where the window is simply truncated).

    Parameters
    ----------
    xs : numpy array of temperatures (must support ``xs[a:b] - 26``).

    Returns
    -------
    numpy array with one value per element of ``xs``.
    """
    window = 12
    totals = [
        np.sum(xs[max(0, stop - window):stop] - 26)
        for stop in range(1, len(xs) + 1)
    ]
    return np.array(totals)

# Add the 'cdh' feature to every building.
# The rolling sum is computed once over train followed by test temperatures
# (so the first test rows can look back into the end of the training period)
# and then split at the train length. Splitting by the train length instead
# of a negative test length also stays correct if a test frame is empty.
for i in range(len(train_dfs)):
    n_train = len(train_dfs[i])
    temperatures = np.concatenate([train_dfs[i]['temperature'].values,
                                   test_dfs[i]['temperature'].values])
    cdh_all = CDH(temperatures)
    train_dfs[i]['cdh'] = cdh_all[:n_train]
    test_dfs[i]['cdh'] = cdh_all[n_train:]

In [None]:
# Preview the engineered feature table of the first building (num == 1).
train_dfs[0]

Unnamed: 0,target,temperature,humidity,dayofyear,hour,weekday,hour_te,hour_te1,더위체감지수,불쾌지수,cdh
0,8179.056,17.6,92.0,153,0,0,0.000000e+00,1.000000,1,1,-8.4
1,8135.640,17.7,91.0,153,1,0,2.697968e-01,0.962917,1,1,-16.7
2,8107.128,17.5,91.0,153,2,0,5.195840e-01,0.854419,1,1,-25.2
3,8048.808,17.1,91.0,153,3,0,7.308360e-01,0.682553,1,1,-34.1
4,8043.624,17.0,92.0,153,4,0,8.878852e-01,0.460065,1,1,-43.1
...,...,...,...,...,...,...,...,...,...,...,...
2035,8714.952,29.4,66.0,237,19,0,-8.878852e-01,0.460065,5,3,43.7
2036,8740.224,28.7,69.0,237,20,0,-7.308360e-01,0.682553,4,3,47.3
2037,8730.504,28.3,71.0,237,21,0,-5.195840e-01,0.854419,4,3,48.7
2038,8725.968,28.3,72.0,237,22,0,-2.697968e-01,0.962917,4,3,48.4


In [None]:
#일단보류
#def detect_outliers(df,ratio): #iqr 이상치제거 
#    outlier_indices = [] 
#    Q1 = np.percentile(df, 25) 
#    Q3 = np.percentile(df, 75) 
#    IQR = Q3 - Q1 
#    outlier_step = ratio * IQR 
#    return ~(df < Q1 - outlier_step) | (df > Q3 + outlier_step)


#이상치 제거 iqr은 1.25
#for i in range(60):    
#    idx = detect_outliers(train_y[i],1.25)
#    train_y[i] = train_y[i][idx]
#    train_x[i] = train_x[i][idx]

In [None]:
# Separate every building's frame into features (train_x) and label (train_y).
# The first column of each frame is taken as the label and all remaining
# columns as features; both are deep copies, independent of train_dfs.
train_x = [frame[frame.columns[1:]].copy(deep=True) for frame in train_dfs]
train_y = [frame[frame.columns[0]].copy(deep=True) for frame in train_dfs]

# Hyperparameter Tuning

## Modeling after hyperparameter tuning for each building
* 시간이 매우 오래 걸려서 pass

In [None]:
# loss function : SMAPE 정의
# from sklearn.metrics import mean_absolute_error
#def smape(true, pred):
#    true = np.array(true)  # np.array로 바꿔야 에러 없음
#    pred = np.array(pred)
#    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred)))  # *2 , *100은 상수이므로 생략
#SMAPE = make_scorer(smape, greater_is_better=False)  # smape 값이 작아져야하므로 False

In [None]:
# 파라미터 설정, 모델생성 함수
#def get_best_params(model, params, i):
#    grid_model = GridSearchCV(
#        model,
#        param_grid = params,  # 파라미터
#        cv=3,  # Kfold : 5
#        scoring= SMAPE)  #loss function
#
#    grid_model.fit(train_x[i], train_y[i], verbose=100)
#    scr = grid_model.best_score_
#
#    print('최적 하이퍼 파라미터:\n', grid_model.best_params_)
#    print(f'{model.__class__.__name__} 최적 score 값 {scr}\n\n')
#
#    return grid_model.best_estimator_

In [None]:
# 파라미터 후보군 설정
# 어떤 파라미터로 하는게 좋을지 고민된다면 고민하는 것들을 리스트 안에 다 넣어보세요 알아서 골라줄겁니다.
# 저는 예시로 learning_rate만 0.1 or 0.01 중 더 좋은걸 골라달라고 했습니다.
#params = {
#    'boosting_type':['goss'],
#    'objective' : ['MAE'], 
#    'n_estimators' : [10000, 12000],
#    'learning_rate' : [0.1, 0.01],
#    'num_leaves' : [37, 39, 41],
#    'subsample' : [1]
#}

In [None]:
# 모델정의
#model=LGBMRegressor(params)
#
#best_lgbm = []
#for i in range(len(train_dfs)):
#    print(str(i)+' buliding\'s optimize hyperparameter GridSearchCV')
#    # 학습진행
#    best_lgbm.append(get_best_params(model, params, i))
#    best_lgbm

In [None]:
#for dc in data_cols:#d특정 feature dc를 drop 시킴
#    for k in kfold_split:#kfold 의 nspilt 의 값 k
#        folds = []
#        for i in range(len(train_dfs)):
#            cross=KFold(n_splits=k, shuffle=True, random_state=random_seed)
#            fold=[]
#            for train_idx, valid_idx in cross.split(train_x[i], train_y[i]):
#                fold.append((train_idx, valid_idx))
#            folds.append(fold)
#            
#        for i in range(len(train_dfs)):
#            for fold in range(k):
#                print(dc,random_seed,k,i)
#                train_idx, valid_idx = folds[i][fold]
#                X_train=np.array(train_x[i].drop(columns=dc).iloc[train_idx])
#                y_train=np.array(train_y[i].iloc[train_idx])
#                X_valid=np.array(train_x[i].drop(columns=dc).iloc[valid_idx])
#                y_valid=np.array(train_y[i].iloc[valid_idx])
#
#                model=best_lgbm[0]
#                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=100)        
#                v = model.predict(np.array(test_dfs[i][train_x[i].drop(columns=dc).columns]))
#                

## After setting model parameters, model each building


In [None]:
#과적합 방지를 위해 여러 k_fold로 반복하도록 설정
#특징중 몇개를 뺄경우 성능 향상을 기대할수 있고 과적합 또한 방지 가능하다
#random_seed = 0
#dcs = [[],['temperature'], ['humidity'], ['hour_te','hour_te1'], ['불쾌지수'], ['cdh']]
#ks = [2,3,4,5,6,7,8,9,10,4]

In [None]:
# Ensemble configuration for the training cells below.
# Training is repeated for every combination of a K-fold setting and a set
# of dropped feature columns, and the predictions are averaged; repeating
# over several settings is intended to reduce over-fitting.
kfold_split = [5]   # n_splits values to run
data_cols = [[]]    # feature-column groups to drop per run ([] drops nothing)
random_seed = 0     # starting seed for the shuffled K-fold splits

In [None]:
# Final hyper-parameters for each model family.

# CatBoost
cat_mae_params = {
    'objective': 'MAE',
    'n_estimators': 10000,
    'early_stopping_rounds': 4,
}

# LightGBM
lgbm_mae_params = {
    'objective': 'MAE',
    'boosting_type': 'goss',
    'n_estimators': 11000,
    'early_stopping_round': 15,
    'num_leaves': 39,
}

# XGBoost
# NOTE: despite the variable name, this objective is squared error, not MAE.
xgb_mae_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 20000,
    'max_depth': 8,
    'learning_rate': 0.03,
    'colsample_bytree': 0.9,
    'subsample': 0.7,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'n_jobs': -1,
    # Was misspelled 'early_stoppings', which XGBoost does not recognise, so
    # training never stopped early (see the flat validation log below).
    # NOTE(review): older XGBoost releases only accept early_stopping_rounds
    # as an argument of fit(); confirm against the installed version.
    'early_stopping_rounds': 100,
}

In [None]:
# Train one LightGBM model per (dropped-column set, k, building, fold) and
# average all test predictions into the submission.
#
# Changes that make the cell safe to re-run:
#  * predictions are accumulated in a local array that starts at zero and is
#    written to submission['answer'] once, instead of adding onto whatever
#    the column already holds through chained indexing (assumes the sample
#    submission's 'answer' column is only a placeholder -- TODO confirm);
#  * the seed is advanced on a local copy, so the global random_seed is not
#    changed by running the cell.

# Row range of each building inside the submission, derived from the test
# frame lengths (assumes submission rows follow building order -- the same
# assumption the former fixed 168-row stride made).
row_offsets = np.cumsum([0] + [len(test_frame) for test_frame in test_dfs])
n_runs = len(kfold_split) * len(data_cols)
ensemble_pred = np.zeros(len(submission))
seed = random_seed

for dc in data_cols:          # dc: feature columns dropped for this run
    for k in kfold_split:     # k: number of folds
        cross = KFold(n_splits=k, shuffle=True, random_state=seed)

        for i in range(len(train_dfs)):
            # The column drop does not depend on the fold, so do it once.
            features = train_x[i].drop(columns=dc)
            X_all = np.array(features)
            y_all = np.array(train_y[i])
            X_test = np.array(test_dfs[i][features.columns])
            rows = slice(row_offsets[i], row_offsets[i + 1])

            for train_idx, valid_idx in cross.split(X_all, y_all):
                print(dc, seed, k, i)

                # The held-out fold is used as the early-stopping set.
                model = LGBMRegressor(**lgbm_mae_params)
                model.fit(X_all[train_idx], y_all[train_idx],
                          eval_set=[(X_all[valid_idx], y_all[valid_idx])],
                          verbose=100)

                # Every model contributes an equal share of the final mean.
                ensemble_pred[rows] += model.predict(X_test) / (k * n_runs)

    seed += 1  # a different shuffle for every dropped-column set

submission['answer'] = ensemble_pred
submission.to_csv('submission_out_month6.csv', index=False)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Early stopping, best iteration is:
[327]	valid_0's l1: 26.73
['더위체감지수'] 4 5 55
Training until validation scores don't improve for 15 rounds.
[100]	valid_0's l1: 32.6537
[200]	valid_0's l1: 31.6361
[300]	valid_0's l1: 31.303
[400]	valid_0's l1: 31.1167
[500]	valid_0's l1: 30.9659
[600]	valid_0's l1: 30.8577
[700]	valid_0's l1: 30.7259
Early stopping, best iteration is:
[747]	valid_0's l1: 30.6792
['더위체감지수'] 4 5 56
Training until validation scores don't improve for 15 rounds.
[100]	valid_0's l1: 98.4381
[200]	valid_0's l1: 92.4426
[300]	valid_0's l1: 90.3632
[400]	valid_0's l1: 89.6422
Early stopping, best iteration is:
[390]	valid_0's l1: 89.6315
['더위체감지수'] 4 5 56
Training until validation scores don't improve for 15 rounds.
[100]	valid_0's l1: 97.3086
[200]	valid_0's l1: 93.7398
Early stopping, best iteration is:
[239]	valid_0's l1: 92.9448
['더위체감지수'] 4 5 56
Training until validation scores don't improve for 15 rounds.
[100]	valid_0's l

In [None]:
# Display the submission frame after the LightGBM run.
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,8682.083218
1,1 2020-08-25 01,8667.506318
2,1 2020-08-25 02,8653.490190
3,1 2020-08-25 03,8640.600457
4,1 2020-08-25 04,8623.921642
...,...,...
10075,60 2020-08-31 19,4098.775513
10076,60 2020-08-31 20,4020.986388
10077,60 2020-08-31 21,3792.860770
10078,60 2020-08-31 22,3567.945084


## XGBoost 실험(버려)

In [None]:
# XGBoost variant of the training loop (experimental).
#
# Predictions are accumulated in a local array that starts at zero and is
# written to submission['answer'] once. Previously this cell added its
# predictions onto the column in place, so after the LightGBM cell had run
# the saved file contained the sum of both models' outputs. (Assumes the
# 'answer' column carries no information of its own -- TODO confirm.)
# The seed is advanced on a local copy so re-running the cell reproduces
# the same folds.

# Row range of each building inside the submission, derived from the test
# frame lengths (assumes submission rows follow building order -- the same
# assumption the former fixed 168-row stride made).
row_offsets = np.cumsum([0] + [len(test_frame) for test_frame in test_dfs])
n_runs = len(kfold_split) * len(data_cols)
ensemble_pred = np.zeros(len(submission))
seed = random_seed

for dc in data_cols:          # dc: feature columns dropped for this run
    for k in kfold_split:     # k: number of folds
        cross = KFold(n_splits=k, shuffle=True, random_state=seed)

        for i in range(len(train_dfs)):
            # The column drop does not depend on the fold, so do it once.
            features = train_x[i].drop(columns=dc)
            X_all = np.array(features)
            y_all = np.array(train_y[i])
            X_test = np.array(test_dfs[i][features.columns])
            rows = slice(row_offsets[i], row_offsets[i + 1])

            for train_idx, valid_idx in cross.split(X_all, y_all):
                print(dc, seed, k, i)

                # The held-out fold is passed as the evaluation set.
                model = XGBRegressor(**xgb_mae_params)
                model.fit(X_all[train_idx], y_all[train_idx],
                          eval_set=[(X_all[valid_idx], y_all[valid_idx])],
                          verbose=100)

                # Every model contributes an equal share of the final mean.
                ensemble_pred[rows] += model.predict(X_test) / (k * n_runs)

    seed += 1  # a different shuffle for every dropped-column set

submission['answer'] = ensemble_pred
submission.to_csv('submission_xgb_ver1.csv', index=False)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[6400]	validation_0-rmse:113.713
[6500]	validation_0-rmse:113.713
[6600]	validation_0-rmse:113.713
[6700]	validation_0-rmse:113.713
[6800]	validation_0-rmse:113.713
[6900]	validation_0-rmse:113.713
[7000]	validation_0-rmse:113.713
[7100]	validation_0-rmse:113.713
[7200]	validation_0-rmse:113.713
[7300]	validation_0-rmse:113.713
[7400]	validation_0-rmse:113.713
[7500]	validation_0-rmse:113.713
[7600]	validation_0-rmse:113.713
[7700]	validation_0-rmse:113.713
[7800]	validation_0-rmse:113.713
[7900]	validation_0-rmse:113.713
[8000]	validation_0-rmse:113.713
[8100]	validation_0-rmse:113.713
[8200]	validation_0-rmse:113.713
[8300]	validation_0-rmse:113.713
[8400]	validation_0-rmse:113.713
[8500]	validation_0-rmse:113.713
[8600]	validation_0-rmse:113.713
[8700]	validation_0-rmse:113.713
[8800]	validation_0-rmse:113.713
[8900]	validation_0-rmse:113.713
[9000]	validation_0-rmse:113.713
[9100]	validation_0-rmse:113.713
[9200]	validation_0-rmse:1

KeyboardInterrupt: ignored

In [None]:
# Display the submission frame after the XGBoost experiment.
submission