In [1]:
''' import '''
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

plt.rc('font', family='Malgun Gothic') # For Windows
# print(plt.rcParams['font.family'])
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import FinanceDataReader as fdr

import lightgbm as lgb
import optuna

In [2]:
random_state = 42
eid = 47  # 실험 번호
eid = str(eid).rjust(2, '0')

In [3]:
def to_signal(_type, n, shift):
    '''
    hour 또는 dayofweek 을 sin, cos 등 주기성을 가진 값으로 변환

    Parameters:
        _type : sin 또는 cos
        n : 카테고리 변수의 카테고리 수 (ex. 시간 = 24, 요일 = 7)
        shift : 주기함수의 x축 평행이동 크기 설정

    Returns:
        변환된 값
    '''
    def to_sin(x):
        return np.sin((x-shift)*2*np.pi/n)
    def to_cos(x):
        return np.cos((x-shift)*2*np.pi/n)
    
    if _type == 'sin':
        return to_sin
    else:
        return to_cos


def smape(true, pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE).
    SMAPE 스코어 계산

    Parameters:
        true (list or numpy array): 실제 전력소비량(kWh)
        pred (list or numpy array): 예측 전력소비량(kWh)

    Returns:
        tuple: ("smape", SMAPE 스코어, False)
    """

    if not isinstance(pred, np.ndarray):
        pred = pred.get_label()
    true = np.array(true).astype(np.float64)
    pred = np.array(pred).astype(np.float64)
    
    if len(true) != len(pred):  # 실제값과 예측값의 크기가 다르다면 ValueError 발생
        raise ValueError("true and pred lists must have the same length.")

    numerator = np.abs(true - pred)
    denominator = (np.abs(true) + np.abs(pred)) / 2.0
    smape_score = np.mean(numerator / denominator) * 100.0
    return "smape", smape_score, False


def weighted_mse(alpha=1):
    '''
    alpha : 과소추정에 대한 가중치 값
    '''
    def weighted_mse_fixed(pred, data):
        true = data.get_label()
        error = pred - true
        grad = np.where(error<0, 2.0*alpha*error, 2.0*error)
        hess = np.where(error<0, 2.0*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed


In [None]:
''' 건물 그룹별 건물번호들 리스트 '''
# b_public = [17, 18, 19, 20, 21, 22, 23]  # 16
# b_uni = [24, 25, 26, 27, 28, 29, 31]  # 30
# b_dcenter = [32, 33, 34, 35, 36]  #
# b_dstore = [37, 38, 39, 40, 41, 42]  # 43, 44
# b_medic = [45, 46, 47, 48, 49, 50, 51, 52]  #
# b_comm = [53, 55, 56, 57, 58, 59, 60]  # 54
# b_apart = [61, 62, 63, 64, 65, 66, 67, 68]  #
# b_lab = [69, 70, 72, 74, 76]  # 71, 73, 75
# b_biz = [77, 78, 79, 80, 82, 83, 84]  # 81
# b_mart = [6, 9, 86, 87, 88, 89, 90, 91, 92]  # 85
# b_hotel = [93, 94, 96, 97, 98, 99, 100]  # 95
# total = [b_public, b_uni, b_dcenter, b_dstore, b_medic, b_comm, b_apart, b_biz, b_mart, b_hotel]

# def merge_lists(fracs):
#     tot = []
#     for frac in fracs:
#         tot.extend(frac)
#     return tot
# others = merge_lists(total)

In [4]:
args = {
    'valid': True,  # True : Valid 분리 후 스코어 검증 / False : 리더보드 제출을 위한 8월 25~31일에 대한 예측
    'tuning': False,  # 하이퍼파라미터 튜닝 여부
    'type': 'apart',  # 건물유형 또는 특성 정보
    'buildings': [61, 62, 63, 64, 65, 66, 67, 68],  # 예측을 진행할 건물번호들 리스트 (1~100)
    'metric-alpha': 2,  # 과소추정에 대한 가중치 alpha 값 설정
    'drop-origin-feats': ['건물유형', '풍속(m/s)', '습도(%)'],  # 학습 전 drop 할 변수 설정
    'outliers': True,  # 이상치 처리 여부 설정
    'hour': {
        'exec': True,
        'period': 24,  # 주기함수 변환 시, 카테고리 개수 설정 (hour의 경우 0~23시간이므로 24)
        'sin': {'exec': False, 'shift': 2},  # exec : sin 변수 추가 여부 설정 / shift : x축 평행이동 크기 설정
        'cos': {'exec': False, 'shift': 2+12},
        'drop': False  # hour 변수 제거 여부 설정
    },
    'ampm': True,  # 오전 오후 변수 추가 여부 설정
    'dow': {
        'exec': True,
        'period': 7,
        'sin': {'exec': False, 'shift': 0},
        'cos': {'exec': False, 'shift': 7-1},
        'drop': True  # 요일 변수 제거 여부 설정
    },
    'holiday': {'exec': True, 'dist': False},  # exec : 공휴일 변수 추가 여부 설정
    'temp': {
        'z-score': {'exec': False, 'step': 6, 'x': 2},
        'max': {'avg': False, 'std': False, 'step': 5},  # max - avg : 일별 기온 최대값의 이동평균 / std : 이동표준편차 / step : 이동평균 범위
        'mean': {'avg': False, 'std': False, 'step': 6},  # mean : 일별 기온 평균값
        'min': {'avg': False, 'std': False, 'step': 8},  # min : 일별 기온 최솟값
        'dstd': False,  # 일별 기온 표준편차
        'bdh': {'mean': False, 'std': False, 'z-score': False},  # 건물번호&요일&시간 별 기온 평균 변수 추가 여부
        'hroll': {'exec': False, 'step': 8}  # 기온의 연속 시간 이동평균 / step : 이동평균 범위
    },
    'humidex': {
        'exec': True,  # 불쾌지수 변수 추가 여부
        'max': {'avg': True, 'std': False, 'step': 5},  # max - avg : 일별 불쾌지수 최대값의 이동평균 / std : 이동표준편차 / step : 이동평균 범위
        'mean': {'avg': False, 'std': False, 'step': 8},  # mean : 일별 불쾌지수 평균값
        'min': {'avg': False, 'std': False, 'step': 5},  # min : 일별 불쾌지수 최솟값
        'hroll': {'exec': True, 'step': 7}  # 불쾌지수의 연속 시간 이동평균 / step : 이동평균 범위
    },
    'target': {
        'bdh': {'mean': True, 'std': False},  # mean : 전력소비량(kWh)의 건물번호&요일&시간 별 평균 변수 추가 여부 / std: 표준편차
        'bh': {'mean': False},  # 전력소비량(kWh)의 건물번호&시간 별 평균 변수 추가 여부
        'bd': {'mean': False}  # 전력소비량(kWh)의 건물번호&요일 별 평균 변수 추가 여부
    },
    'bminmax' : {'bmax': True, 'bmin': True},  # bmax : 건물 별 전력소비량(kWh) 최댓값 변수 추가 여부 / bmin : 건물 별 전력소비량(kWh) 최댓값 변수 추가 여부
    'area': {'exec': False, 'ratio': 0.2}  # exec: 연면적과 냉방면적을 선형결합한 변수 추가 여부 / ratio : 결합 비율 설정
}

In [5]:
''' data load '''
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
train = df_train.copy()
test = df_test.copy()
df_building = pd.read_csv('../data/building_info.csv')
df_sub = pd.read_csv('../data/sample_submission.csv')

''' drop 일조 & 일사 '''
train = train.drop(['일조(hr)', '일사(MJ/m2)'], axis=1)

''' merge df_building '''
train = train.merge(df_building, on='건물번호')
test = test.merge(df_building, on='건물번호')

''' date hour process '''  # 시간 변수 추가
train['datehour'] = pd.to_datetime(train['일시'].str.replace(' ', ''), format='%Y%m%d%H')
test['datehour'] = pd.to_datetime(test['일시'].str.replace(' ', ''), format='%Y%m%d%H')
train['date'] = pd.to_datetime(train['일시'].str.slice(0, 8), format='%Y%m%d')
test['date'] = pd.to_datetime(test['일시'].str.slice(0, 8), format='%Y%m%d')
train = train.drop(['일시', 'num_date_time'], axis=1)
test = test.drop(['일시', 'num_date_time'], axis=1)

''' wind / humidity interpolate '''  # 풍속&습도 결측치를 보간법을 통해 채우기
train['습도(%)'] = train['습도(%)'].interpolate().round(1)
train['풍속(m/s)'] = train['풍속(m/s)'].interpolate().round(1)

''' get specific buildings '''  # 예측하고자 하는 건물번호들만 데이터에서 추출
train = train.loc[train['건물번호'].isin(args['buildings'])].reset_index(drop=True)
test = test.loc[test['건물번호'].isin(args['buildings'])].reset_index(drop=True)

' data load '

' drop 일조 & 일사 '

' merge df_building '

' date hour process '

' wind / humidity interpolate '

' get specific buildings '

In [6]:
''' befor graph '''  # 전처리&이상치 처리하기 전, 건물번호 별 그래프 확인
if False:
    fig, ax = plt.subplots(nrows=len(args['buildings']), figsize=(20, 3*len(args['buildings'])))
    for i, b_num in enumerate(args['buildings']):
        _ = ax[i].set_title(f'{b_num}')
        _ = sns.lineplot(data=train.loc[train['건물번호'] == b_num], y='전력소비량(kWh)', x='datehour', color='b', ax=ax[i])

' befor graph '

In [7]:
''' Preprocess '''

# hour
if args['hour']['exec']:
    ''' hour '''
    train['hour'] = train['datehour'].dt.hour
    test['hour'] = test['datehour'].dt.hour

if args['hour']['cos']['exec']:
    ''' hour : cos '''
    train['hour_cos'] = train['hour'].apply(to_signal('cos', args['hour']['period'], args['hour']['cos']['shift']))
    test['hour_cos'] = test['hour'].apply(to_signal('cos', args['hour']['period'], args['hour']['cos']['shift']))

if args['hour']['sin']['exec']:
    ''' hour : sin '''
    train['hour_sin'] = train['hour'].apply(to_signal('sin', args['dow']['period'], args['dow']['sin']['shift']))
    test['hour_sin'] = test['hour'].apply(to_signal('sin', args['dow']['period'], args['dow']['sin']['shift']))

# ampm
if args['ampm']:
    train['ampm'] = np.where(train['hour'] < 12, 'am', 'pm')
    test['ampm'] = np.where(test['hour'] < 12, 'am', 'pm')
    train['ampm'] = train['ampm'].astype('category')
    test['ampm'] = test['ampm'].astype('category')

# dayofweek
if args['dow']['exec']:
    ''' dayofweek '''
    train['dayofweek'] = train['date'].dt.dayofweek
    test['dayofweek'] = test['date'].dt.dayofweek

if args['dow']['cos']['exec']:
    ''' dayofweek : cos '''
    train['dow_cos'] = train['dayofweek'].apply(to_signal('cos', args['dow']['period'], args['dow']['cos']['shift']))
    test['dow_cos'] = test['dayofweek'].apply(to_signal('cos', args['dow']['period'], args['dow']['cos']['shift']))

if args['dow']['sin']['exec']:
    ''' dayofweek : sin '''
    train['dow_sin'] = train['dayofweek'].apply(to_signal('sin', args['dow']['period'], args['dow']['sin']['shift']))
    test['dow_sin'] = test['dayofweek'].apply(to_signal('sin', args['dow']['period'], args['dow']['sin']['shift']))


# holiday
if True:
    ''' holiday '''
    working_days = fdr.DataReader('005930', '2022-06-01', '2022-08-31')
    working_days = working_days.index
    train['holiday'] = np.where(train['date'].isin(working_days), 0, 1)
    test['holiday'] = np.where(test['date'].isin(working_days), 0, 1)

if args['holiday']['dist']:
    ''' distance to holiday '''
    tmp = pd.concat([train, test]).sort_values(['건물번호', 'datehour']).reset_index(drop=True)
    tmp = tmp.groupby('date')['holiday'].any()

    arr, x = [], 3
    for b in tmp.values[::-1]:
        arr.append(x)
        x = 0 if b else x+1
    tmp.loc[:] = arr[::-1]

    train = pd.merge(train, tmp, how='left', left_on='date', right_index=True, suffixes=['', '_dist'])
    test = pd.merge(test, tmp, how='left', left_on='date', right_index=True, suffixes=['', '_dist'])
    train['holiday_dist'] = train['holiday_dist'].astype('int')
    test['holiday_dist'] = test['holiday_dist'].astype('int')


# humidex
if True:
    ''' humidex : (5/9 * 섭씨온도) - { 0.55 * (1 - 상대습도(%)) * (5/9 * 섭씨온도 - 26) } + 32 '''
    train['humidex'] = (9/5*train['기온(C)']) - 0.55*(1-train['습도(%)'])*(9/5*train['기온(C)'] - 26) + 32
    test['humidex'] = (9/5*test['기온(C)']) - 0.55*(1-test['습도(%)'])*(9/5*test['기온(C)'] - 26) + 32

if args['humidex']['max']['avg']:
    ''' humidex - max rolling avg '''
    step = args['humidex']['max']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['humidex'].max(), test.groupby(['건물번호', 'date'])['humidex'].max()]).sort_index().reset_index()
    temp['humidex'] = temp.groupby('건물번호')['humidex'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_maxaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_maxaroll'])

if args['humidex']['mean']['avg']:
    ''' humidex - mean rolling avg '''
    step = args['humidex']['mean']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['humidex'].mean(), test.groupby(['건물번호', 'date'])['humidex'].mean()]).sort_index().reset_index()
    temp['humidex'] = temp.groupby('건물번호')['humidex'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_meanaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_meanaroll'])

if args['humidex']['min']['avg']:
    ''' humidex - min rolling avg '''
    step = args['humidex']['min']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['humidex'].min(), test.groupby(['건물번호', 'date'])['humidex'].min()]).sort_index().reset_index()
    temp['humidex'] = temp.groupby('건물번호')['humidex'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_minaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_minaroll'])

if args['humidex']['hroll']['exec']:
    ''' humidex - hourly rolling '''
    tmp = pd.concat([train, test]).sort_values(['건물번호', 'datehour']).reset_index(drop=True)
    tmp['humidex_hroll'] = tmp.groupby('건물번호')['humidex'].rolling(args['humidex']['hroll']['step'], min_periods=1).mean().values
    train = tmp.loc[tmp['date'] < '2022-08-25'].sort_values(['건물번호', 'datehour']).reset_index(drop=True)
    test = tmp.loc[tmp['date'] >= '2022-08-25'].sort_values(['건물번호', 'datehour']).reset_index(drop=True).drop('전력소비량(kWh)', axis=1)


# temperature (max / mean / min rolling) / (daily std) 
if args['temp']['z-score']['exec']:
    ''' z-score rolling '''
    step, x = args['temp']['z-score']['step'], args['temp']['z-score']['x']

    # max rolling avg
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].max(), test.groupby(['건물번호', 'date'])['기온(C)'].max()]).sort_index().reset_index()
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(step, min_periods=1).mean().values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_aroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_aroll'])

    # max rolling std
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].max(), test.groupby(['건물번호', 'date'])['기온(C)'].max()]).sort_index().reset_index()
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(step, min_periods=1).std().values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_sroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_sroll'])
    train['기온(C)_sroll'] = train['기온(C)_sroll'].fillna(0)

    # avg / (std + x)
    train['기온(C)_zroll'] = (train['기온(C)'] - train['기온(C)_aroll']) / (train['기온(C)_sroll'] + x)
    test['기온(C)_zroll'] = (test['기온(C)'] - test['기온(C)_aroll']) / (test['기온(C)_sroll'] + x)

    # drop avg & std
    train = train.drop(['기온(C)_aroll', '기온(C)_sroll'], axis=1)
    test = test.drop(['기온(C)_aroll', '기온(C)_sroll'], axis=1)

if args['temp']['max']['avg']:
    ''' max rolling avg '''
    step = args['temp']['max']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].max(), test.groupby(['건물번호', 'date'])['기온(C)'].max()]).sort_index().reset_index()
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_maxaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_maxaroll'])

if args['temp']['max']['std']:
    ''' max rolling std '''
    step = args['temp']['max']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].max(), test.groupby(['건물번호', 'date'])['기온(C)'].max()]).reset_index().sort_values(['건물번호', 'date'])
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(6, min_periods=1).std().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_sroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_sroll'])
    train['기온(C)_sroll'] = train['기온(C)_sroll'].fillna(0)

if args['temp']['mean']['avg']:
    ''' mean rolling '''
    step = args['temp']['mean']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].mean(), test.groupby(['건물번호', 'date'])['기온(C)'].mean()]).sort_index().reset_index()
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_meanaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_meanaroll'])

if args['temp']['min']['avg']:
    ''' min rolling '''
    step = args['temp']['min']['step']
    temp = pd.concat([train.groupby(['건물번호', 'date'])['기온(C)'].min(), test.groupby(['건물번호', 'date'])['기온(C)'].min()]).sort_index().reset_index()
    temp['기온(C)'] = temp.groupby('건물번호')['기온(C)'].rolling(step, min_periods=1).mean().round(1).values
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_minaroll'])
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_on=['건물번호', 'date'], suffixes=['', '_minaroll'])

if args['temp']['dstd']:
    ''' daily std '''
    temp = train.groupby(['건물번호', 'date'])['기온(C)'].std()
    train = pd.merge(train, temp, left_on=['건물번호', 'date'], right_index=True, suffixes=['', '_std'])
    temp = test.groupby(['건물번호', 'date'])['기온(C)'].std()
    test = pd.merge(test, temp, left_on=['건물번호', 'date'], right_index=True, suffixes=['', '_std'])

if args['temp']['hroll']['exec']:
    ''' hourly rolling '''
    tmp = pd.concat([train, test]).sort_values(['건물번호', 'datehour']).reset_index(drop=True)
    tmp['기온(C)_hroll'] = tmp.groupby('건물번호')['기온(C)'].rolling(args['temp']['hroll']['step'], min_periods=1).mean().values
    train = tmp.loc[tmp['date'] < '2022-08-25'].sort_values(['건물번호', 'datehour']).reset_index(drop=True)
    test = tmp.loc[tmp['date'] >= '2022-08-25'].sort_values(['건물번호', 'datehour']).reset_index(drop=True).drop('전력소비량(kWh)', axis=1)

# 기온 bdh z-score / mean / std
if args['valid']:
    if args['temp']['bdh']['z-score']:
        ''' for Valid - z by 건물번호, 요일, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
        train['기온(C)_z'] = (train['기온(C)'] - train['기온(C)_bdh']) / train['기온(C)_std']
        train = train.drop(['기온(C)_bdh', '기온(C)_std'], axis=1)
    
    if args['temp']['bdh']['mean']:
        ''' for Valid - mean by 건물번호, 요일, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()

    if args['temp']['bdh']['std']:
        ''' for Valid - std by 건물번호, 요일, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()

else:
    if args['temp']['bdh']['z-score']:
        ''' for Test - z by 건물번호, 요일, 시간 '''
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
        train['기온(C)_z'] = (train['기온(C)'] - train['기온(C)_bdh']) / train['기온(C)_std']
        test['기온(C)_z'] = (test['기온(C)'] - test['기온(C)_bdh']) / test['기온(C)_std']
        train = train.drop(['기온(C)_bdh', '기온(C)_std'], axis=1)
        test = test.drop(['기온(C)_bdh', '기온(C)_std'], axis=1)

    if args['temp']['bdh']['mean']:
        ''' for Test - mean by 건물번호, 요일, 시간 '''
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()

    if args['temp']['bdh']['std']:
        ''' for Test - std by 건물번호, 요일, 시간 '''
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['기온(C)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()


# 냉방면적 + x*연면적
if args['area']['exec']:
    ''' area linear '''
    x = args['area']['ratio']
    train['area'] = x * train['연면적(m2)'] + train['냉방면적(m2)']
    test['area'] = x * test['연면적(m2)'] + test['냉방면적(m2)']
    # train['냉방면적비율'] = train['냉방면적(m2)'] / train['연면적(m2)']
    # test['냉방면적비율'] = test['냉방면적(m2)'] / test['연면적(m2)']


' Preprocess '

' hour '

' dayofweek '

' holiday '

' humidex : (5/9 * 섭씨온도) - { 0.55 * (1 - 상대습도(%)) * (5/9 * 섭씨온도 - 26) } + 32 '

' humidex - max rolling avg '

' humidex - hourly rolling '

In [8]:
if args['outliers']:  # 이상치 처리
    ''' outlliers '''
    # 기타 건물
    if 4 in args['buildings']:
        train.loc[(train['건물번호'] == 4) & (train['datehour'] >= '2022-07-25-22') & (train['datehour'] <= '2022-07-25-22'), '전력소비량(kWh)'] = 620
    if 5 in args['buildings']:
        train.drop(train.loc[(train['건물번호'] == 5) & (train['date'] == '2022-06-17')].index, inplace=True)
    if 11 in args['buildings']:
        train.loc[(train['건물번호'] == 11) & (train['datehour'] > '2022-07-22') & (train['datehour'] < '2022-08-18') & (train['전력소비량(kWh)'] < 1490), '전력소비량(kWh)'] = [1680.13, 1662.54, 1662.54, 1693.95, 2328.63, 1871.64]
        # train.loc[(train['건물번호'] == 11) & (train['datehour'] >= '2022-07-01')].groupby('hour')['전력소비량(kWh)'].mean()

    # public
    if 16 in args['buildings']:
        train.loc[(train['건물번호'] == 16) & (train['datehour'] >= '2022-06-22-01') & (train['datehour'] < '2022-06-22-02'), '전력소비량(kWh)'] =\
            (train.loc[(train['건물번호'] == 16) & (train['datehour'] >= '2022-06-22-00') & (train['datehour'] < '2022-06-22-01'), '전력소비량(kWh)'] +\
                train.loc[(train['건물번호'] == 16) & (train['datehour'] >= '2022-06-22-02') & (train['datehour'] < '2022-06-22-03'), '전력소비량(kWh)']) / 2
    if 17 in args['buildings']:
        train.loc[(train['건물번호'] == 17) & (train['datehour'] >= '2022-06-18') & (train['datehour'] < '2022-06-19'), '전력소비량(kWh)'] =\
        (train.loc[(train['건물번호'] == 17) & (train['datehour'] >= '2022-06-11') & (train['datehour'] < '2022-06-12'), '전력소비량(kWh)'].values +\
            train.loc[(train['건물번호'] == 17) & (train['datehour'] >= '2022-06-25') & (train['datehour'] < '2022-06-26'), '전력소비량(kWh)'].values) / 2
        train.loc[(train['건물번호'] == 17) & (train['datehour'] >= '2022-07-23') & (train['datehour'] < '2022-07-24'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 17) & (train['dayofweek'] == 5)].groupby('hour')['전력소비량(kWh)'].mean().values
    if 22 in args['buildings']:
        train.loc[(train['건물번호'] == 22) & (train['datehour'] >= '2022-06-22') & (train['datehour'] < '2022-06-23'), '전력소비량(kWh)'] =\
        (train.loc[(train['건물번호'] == 22) & (train['datehour'] >= '2022-06-15') & (train['datehour'] < '2022-06-16'), '전력소비량(kWh)'].values +\
            train.loc[(train['건물번호'] == 22) & (train['datehour'] >= '2022-06-29') & (train['datehour'] < '2022-06-30'), '전력소비량(kWh)'].values) / 2
    if 23 in args['buildings']:
        train.loc[(train['건물번호'] == 23) & (train['datehour'] >= '2022-07-04-20') & (train['datehour'] < '2022-07-05-06'), '전력소비량(kWh)'] =\
        (train.loc[(train['건물번호'] == 23) & (train['datehour'] >= '2022-06-27-20') & (train['datehour'] < '2022-06-28-06'), '전력소비량(kWh)'].values +\
            train.loc[(train['건물번호'] == 23) & (train['datehour'] >= '2022-07-11-20') & (train['datehour'] < '2022-07-12-06'), '전력소비량(kWh)'].values) / 2

    # uni
    if 26 in args['buildings']:
        train.loc[(train['건물번호'] == 26) & (train['datehour'] >= '2022-07-16') & (train['datehour'] < '2022-07-18'), '전력소비량(kWh)'] =\
        (train.loc[(train['건물번호'] == 26) & (train['datehour'] >= '2022-07-23') & (train['datehour'] < '2022-07-25'), '전력소비량(kWh)'].values +\
            train.loc[(train['건물번호'] == 26) & (train['datehour'] >= '2022-07-09') & (train['datehour'] < '2022-07-11'), '전력소비량(kWh)'].values) / 2
    if 28 in args['buildings']:
        train.loc[(train['건물번호'] == 28) & (train['datehour'] >= '2022-07-28-02') & (train['datehour'] < '2022-07-28-03'), '전력소비량(kWh)'] =\
            (train.loc[(train['건물번호'] == 28) & (train['datehour'] >= '2022-07-28-01') & (train['datehour'] < '2022-07-28-02'), '전력소비량(kWh)'].values +\
                train.loc[(train['건물번호'] == 28) & (train['datehour'] >= '2022-07-28-03') & (train['datehour'] < '2022-07-28-04'), '전력소비량(kWh)'].values) / 2
    if 29 in args['buildings']:
        train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-06-25-12') & (train['datehour'] < '2022-06-25-16'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-06-25-12') & (train['datehour'] < '2022-06-25-16'), '전력소비량(kWh)'] + 55
        train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-06-25-14') & (train['datehour'] < '2022-06-25-16'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-06-25-14') & (train['datehour'] < '2022-06-25-16'), '전력소비량(kWh)'] + 120
        train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-08-20-12') & (train['datehour'] < '2022-08-20-15'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 29) & (train['datehour'] >= '2022-08-20-12') & (train['datehour'] < '2022-08-20-15'), '전력소비량(kWh)'] + 130
    if 31 in args['buildings']:
        train.loc[(train['건물번호'] == 31) & (train['datehour'] >= '2022-07-02-11') & (train['datehour'] < '2022-07-02-12'), '전력소비량(kWh)'] =\
            (train.loc[(train['건물번호'] == 31) & (train['datehour'] >= '2022-07-02-10') & (train['datehour'] < '2022-07-02-11'), '전력소비량(kWh)'] +\
                train.loc[(train['건물번호'] == 31) & (train['datehour'] >= '2022-07-02-12') & (train['datehour'] < '2022-07-02-13'), '전력소비량(kWh)']) / 2

    # dcenter
    if 33 in args['buildings']:
        # tmp = (train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-06-29') & (train['datehour'] < '2022-06-30'), '전력소비량(kWh)'].values +\
        #        train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-01') & (train['datehour'] < '2022-07-02'), '전력소비량(kWh)'].values) / 2
        # train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-06-30') & (train['datehour'] < '2022-07-01'), '전력소비량(kWh)'] = tmp
        tmp = (train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-05-18') & (train['datehour'] < '2022-07-06-14'), '전력소비량(kWh)'].values +\
            train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-03-18') & (train['datehour'] < '2022-07-04-14'), '전력소비량(kWh)'].values) / 2
        train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-04-18') & (train['datehour'] < '2022-07-05-14'), '전력소비량(kWh)'] = tmp
        train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-11-20') & (train['datehour'] < '2022-07-11-22'), '전력소비량(kWh)'] = [8781, 8763]
        train.loc[(train['건물번호'] == 33) & (train['datehour'] >= '2022-07-12-20') & (train['datehour'] < '2022-07-12-22'), '전력소비량(kWh)'] = [8781, 8763]
    if 34 in args['buildings']:
        train.loc[(train['건물번호'] == 34) & (train['전력소비량(kWh)'] < 2800), '전력소비량(kWh)'] = 3470
    if 35 in args['buildings']:
        train.loc[(train['건물번호'] == 35) & (train['datehour'] >= '2022-07-09-06') & (train['datehour'] < '2022-07-09-08'), '전력소비량(kWh)'] = [2176.3, 2177.6]
        train.loc[(train['건물번호'] == 35) & (train['datehour'] >= '2022-08-08-19') & (train['datehour'] < '2022-08-08-23'), '전력소비량(kWh)'] = [2229.5, 2228.4, 2227.0, 2235.0]

    # dstore
    if 40 in args['buildings']:
        train.drop(train.loc[(train['건물번호'] == 40) & (train['date'] == '2022-06-17')].index, inplace=True)
        # train.loc[(train['건물번호'] == 40) & (train['datehour'] >= '2022-06-17') & (train['datehour'] < '2022-06-18'), '전력소비량(kWh)'] =\
        #     train.loc[(train['건물번호'] == 40) & (train['date'] < '2022-06-17') & (train['dayofweek'] == 4)].groupby('hour')['전력소비량(kWh)'].mean().values

    # medic
    if 48 in args['buildings']:
        train.loc[(train['건물번호'] == 48) & (train['datehour'] >= '2022-06-19-08') & (train['datehour'] < '2022-06-19-12'), '전력소비량(kWh)'] =\
            (train.loc[(train['건물번호'] == 48) & (train['datehour'] >= '2022-06-12-08') & (train['datehour'] < '2022-06-12-12'), '전력소비량(kWh)'].values +\
                train.loc[(train['건물번호'] == 48) & (train['datehour'] >= '2022-06-26-08') & (train['datehour'] < '2022-06-26-12'), '전력소비량(kWh)'].values) / 2
    if 49 in args['buildings']:
        train.loc[(train['건물번호'] == 49) & (train['datehour'] >= '2022-07-06-01') & (train['datehour'] < '2022-07-06-03'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 49) & (train['datehour'] >= '2022-07-06-01') & (train['datehour'] < '2022-07-06-03'), '전력소비량(kWh)'] + 100
        if not args['valid']:
            train.loc[(train['건물번호'] == 49) & (train['datehour'] >= '2022-08-17') & (train['datehour'] < '2022-08-20') & (train['전력소비량(kWh)'] < 2540), '전력소비량(kWh)'] =\
                train.loc[(train['건물번호'] == 49) & (train['datehour'] >= '2022-08-17') & (train['datehour'] < '2022-08-20') & (train['전력소비량(kWh)'] < 2540), '전력소비량(kWh)'] + 200
    if 52 in args['buildings']:
        train.loc[(train['건물번호'] == 52) & (train['datehour'] >= '2022-07-05-04') & (train['datehour'] < '2022-07-05-05'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 52) & (train['datehour'] >= '2022-07-05-04') & (train['datehour'] < '2022-07-05-05'), '전력소비량(kWh)'] + 300
        train.loc[(train['건물번호'] == 52) & (train['datehour'] >= '2022-07-30-00') & (train['datehour'] < '2022-07-30-01'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 52) & (train['datehour'] >= '2022-07-30-00') & (train['datehour'] < '2022-07-30-01'), '전력소비량(kWh)'] + 100

    # comm
    if 54 in args['buildings']:
        train.drop(train.loc[(train['건물번호'] == 54) & (train['date'] == '2022-06-17')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 54) & (train['date'] == '2022-08-16')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 54) & (train['date'] == '2022-08-17')].index, inplace=True)
    if 56 in args['buildings']:
        train.loc[(train['건물번호'] == 56) & (train['datehour'] >= '2022-06-08-16') & (train['datehour'] < '2022-06-08-17'), '전력소비량(kWh)'] = 4150
        train.loc[(train['건물번호'] == 56) & (train['datehour'] >= '2022-06-18-22') & (train['datehour'] < '2022-06-19-00'), '전력소비량(kWh)'] = [4000, 4010]
        train.loc[(train['건물번호'] == 56) & (train['datehour'] >= '2022-06-19-05') & (train['datehour'] < '2022-06-19-08'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 56) & (train['datehour'] >= '2022-06-19-05') & (train['datehour'] < '2022-06-19-08'), '전력소비량(kWh)'] - 50
    if 58 in args['buildings']:
        train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-04-19') & (train['datehour'] <= '2022-07-04-20'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-04-19') & (train['datehour'] <= '2022-07-04-20'), '전력소비량(kWh)'] + 20
        train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-05-04') & (train['datehour'] <= '2022-07-05-06'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-05-04') & (train['datehour'] <= '2022-07-05-06'), '전력소비량(kWh)'] + 70
        train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-05-09') & (train['datehour'] <= '2022-07-05-09'), '전력소비량(kWh)'] = 2940
        train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-05-11') & (train['datehour'] <= '2022-07-05-11'), '전력소비량(kWh)'] = 2985
        train.loc[(train['건물번호'] == 58) & (train['datehour'] >= '2022-07-05-13') & (train['datehour'] <= '2022-07-05-15'), '전력소비량(kWh)'] = [3030, 3015, 3024]

    # apart
    if 61 in args['buildings']:
        train.loc[(train['건물번호'] == 61) & (train['datehour'] >= '2022-07-01-10') & (train['datehour'] <= '2022-07-01-10'), '전력소비량(kWh)'] = 2935
        train.loc[(train['건물번호'] == 61) & (train['datehour'] >= '2022-07-01-12') & (train['datehour'] <= '2022-07-01-12'), '전력소비량(kWh)'] = 2765
        train.loc[(train['건물번호'] == 61) & (train['datehour'] >= '2022-07-01-13') & (train['datehour'] <= '2022-07-01-14'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 61) & (train['datehour'] >= '2022-07-01-13') & (train['datehour'] <= '2022-07-01-14'), '전력소비량(kWh)'] + 500
    if 65 in args['buildings']:
        train.loc[(train['건물번호'] == 65) & (train['datehour'] >= '2022-06-15-03') & (train['datehour'] <= '2022-06-15-03'), '전력소비량(kWh)'] = 225

    # lab
    if 69 in args['buildings']:
        train.loc[(train['건물번호'] == 69) & (train['datehour'] >= '2022-06-08') & (train['datehour'] < '2022-06-09') & (train['전력소비량(kWh)'] < 3000), '전력소비량(kWh)'] = 4633.95
    if 70 in args['buildings']:
        tmp = train.loc[(train['건물번호'] == 70) & (train['datehour'] >= '2022-06-18') & (train['datehour'] < '2022-07-29')].groupby(['dayofweek', 'hour'])['전력소비량(kWh)'].mean().round(2).reset_index()
        tmp = pd.merge(train.loc[(train['건물번호'] == 70) & (train['datehour'] >= '2022-07-29') & (train['datehour'] <= '2022-08-08')], tmp, on=['dayofweek', 'hour'], how='left')['전력소비량(kWh)_y']
        train.loc[(train['건물번호'] == 70) & (train['datehour'] >= '2022-07-29') & (train['datehour'] <= '2022-08-08'), '전력소비량(kWh)'] = tmp.values
    if 72 in args['buildings']:
        train.loc[(train['건물번호'] == 72) & (train['datehour'] >= '2022-06-23') & (train['datehour'] < '2022-07-19') & (train['전력소비량(kWh)'] < 1070), '전력소비량(kWh)'] = [1140, 1150, 1178, 1182, 1100, 1090, 1080]
        train.loc[(train['건물번호'] == 72) & (train['datehour'] >= '2022-07-31') & (train['datehour'] < '2022-08-01') & (train['전력소비량(kWh)'] < 1102), '전력소비량(kWh)'] = [1130, 1135, 1140, 1145]
    if 75 in args['buildings']:
        train.loc[(train['건물번호'] == 75) & (train['전력소비량(kWh)'] < 750), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 75) & (train['전력소비량(kWh)'] < 750), '전력소비량(kWh)'] + 850
        train.loc[(train['건물번호'] == 75) & (train['datehour'] > '2022-06-17-06') & (train['datehour'] < '2022-06-18-00'), '전력소비량(kWh)'] =\
            train.loc[(train['건물번호'] == 75) & (train['datehour'] > '2022-06-17-06') & (train['datehour'] < '2022-06-18-00'), '전력소비량(kWh)'] + 250

    # biz
    if 79 in args['buildings']:
        train.loc[(train['건물번호'] == 79) & (train['datehour'] >= '2022-06-12-19') & (train['datehour'] < '2022-06-12-22'), '전력소비량(kWh)'] = [2000.4, 1928.16, 1855.92]

    # mart
    if 86 in args['buildings']:
        train.drop(train.loc[(train['건물번호'] == 86) & (train['datehour'] >= '2022-06-10') & (train['datehour'] < '2022-06-11')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 86) & (train['datehour'] >= '2022-06-12') & (train['datehour'] < '2022-06-13')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 86) & (train['datehour'] >= '2022-07-30') & (train['datehour'] < '2022-07-31')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 86) & (train['datehour'] >= '2022-08-10') & (train['datehour'] < '2022-08-11')].index, inplace=True)
        train.drop(train.loc[(train['건물번호'] == 86) & (train['datehour'] >= '2022-08-14') & (train['datehour'] < '2022-08-15')].index, inplace=True)
        train.reset_index(drop=True, inplace=True)
    if 87 in args['buildings']:
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-07-29-22') & (train['datehour'] <= '2022-07-29-22'), '전력소비량(kWh)'] = 1493.19
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-07-30-00') & (train['datehour'] <= '2022-07-30-00'), '전력소비량(kWh)'] = 1080.18
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-07-30-22') & (train['datehour'] <= '2022-07-30-22'), '전력소비량(kWh)'] = 1501.2
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-08-01-22') & (train['datehour'] <= '2022-08-01-22'), '전력소비량(kWh)'] = 1243.17
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-08-05-23') & (train['datehour'] <= '2022-08-06-00'), '전력소비량(kWh)'] = [877.52, 759.02]
        train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-08-15-23') & (train['datehour'] <= '2022-08-15-23'), '전력소비량(kWh)'] = 1106.1
        if not args['valid']:
            train.loc[(train['건물번호'] == 87) & (train['datehour'] >= '2022-08-23-22') & (train['datehour'] <= '2022-08-23-23'), '전력소비량(kWh)'] = 1293.57
    if 91 in args['buildings']:
        if not args['valid']:
            train.drop(train.loc[(train['건물번호'] == 91) & (train['datehour'] >= '2022-08-22') & (train['datehour'] < '2022-08-23')].index, inplace=True)

    # hotel
    if 95 in args['buildings']:
        train.loc[np.log(train['전력소비량(kWh)']) < 4, '전력소비량(kWh)'] = np.nan
        train['전력소비량(kWh)'] = train['전력소비량(kWh)'].interpolate()
    if 100 in args['buildings']:
        train.loc[(train['건물번호'] == 100) & (train['datehour'] >= '2022-06-29-14') & (train['datehour'] < '2022-06-29-16'), '전력소비량(kWh)'] = [800, 850]
        train.loc[(train['건물번호'] == 100) & (train['datehour'] >= '2022-06-08-17') & (train['datehour'] < '2022-06-08-18'), '전력소비량(kWh)'] = [700]

    train = train.sort_values(['건물번호', 'datehour']).reset_index(drop=True)


' outlliers '

In [9]:
# 전력소비량 bdh mean & std / bh mean / bd mean
if args['valid']:
    if args['target']['bdh']['mean']:
        ''' for Valid - mean by 건물번호, 요일, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()

    if args['target']['bdh']['std']:
        ''' for Valid - std by 건물번호, 요일, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek', 'hour'])['전력소비량(kWh)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
    
    if args['target']['bh']['mean']:
        ''' for Valid - mean by 건물번호, 시간 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'hour'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'hour'], right_index=True, suffixes=('', '_bh')).sort_index()
    
    if args['target']['bd']['mean']:
        ''' for Valid - mean by 건물번호, 요일 '''
        tmp = train.loc[train['datehour'] < '2022-08-18'].groupby(['건물번호', 'dayofweek'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek'], right_index=True, suffixes=('', '_bd')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek'], right_index=True).sort_index().rename({'전력소비량(kWh)': '전력소비량(kWh)_bd'}, axis=1)
    
else:
    if args['target']['bdh']['mean']:
        ''' for Test - mean by 건물번호, 요일, 시간 '''
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_bdh')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True).sort_index().rename({'전력소비량(kWh)': '전력소비량(kWh)_bdh'}, axis=1)

    if args['target']['bdh']['std']:
        ''' for Test - std by 건물번호, 요일, 시간 '''
        tmp = train.groupby(['건물번호', 'dayofweek', 'hour'])['전력소비량(kWh)'].std()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True, suffixes=('', '_std')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek', 'hour'], right_index=True).sort_index().rename({'전력소비량(kWh)': '전력소비량(kWh)_std'}, axis=1)

    if args['target']['bh']['mean']:
        ''' for Test - mean by 건물번호, 시간 '''
        tmp = train.groupby(['건물번호', 'hour'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'hour'], right_index=True, suffixes=('', '_bh')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'hour'], right_index=True).sort_index().rename({'전력소비량(kWh)': '전력소비량(kWh)_bh'}, axis=1)

    if args['target']['bd']['mean']:
        ''' for Test - mean by 건물번호, 요일 '''
        tmp = train.groupby(['건물번호', 'dayofweek'])['전력소비량(kWh)'].mean()
        train = pd.merge(train, tmp, left_on=['건물번호', 'dayofweek'], right_index=True, suffixes=('', '_bd')).sort_index()
        test = pd.merge(test, tmp, left_on=['건물번호', 'dayofweek'], right_index=True).sort_index().rename({'전력소비량(kWh)': '전력소비량(kWh)_bd'}, axis=1)


# 최대 & 최소 by 건물번호
if args['bminmax']['bmax']:
    ''' max by 건물번호 '''
    tmp = train.groupby('건물번호')['전력소비량(kWh)'].max()
    train = pd.merge(train, tmp, how='left', left_on='건물번호', right_index=True, suffixes=['', '_bmax'])
    test = pd.merge(test, tmp, how='left', left_on='건물번호', right_index=True, suffixes=['', '_bmax'])

if args['bminmax']['bmin']:
    ''' min by 건물번호 '''
    tmp = train.groupby('건물번호')['전력소비량(kWh)'].min()
    train = pd.merge(train, tmp, how='left', left_on='건물번호', right_index=True, suffixes=['', '_bmin'])
    test = pd.merge(test, tmp, how='left', left_on='건물번호', right_index=True, suffixes=['', '_bmin'])


# object -> category
if True:
    ''' object -> category '''
    type_to = 'category'
    feats = ['건물번호', '건물유형', 'hour', 'holiday', 'dayofweek']
    for feat in feats:
        train[feat] = train[feat].astype(type_to)
        test[feat] = test[feat].astype(type_to)

# hour drop
if args['hour']['drop']:
    ''' hour : drop ''' 
    train = train.drop('hour', axis=1)
    test = test.drop('hour', axis=1)

# dayofweek drop
if args['dow']['drop']:
    ''' dayofweek : drop '''
    train = train.drop('dayofweek', axis=1)
    test = test.drop('dayofweek', axis=1)

# humidex drop
if not args['humidex']['exec']:
    ''' humidex : drop '''
    train = train.drop('humidex', axis=1)
    test = test.drop('humidex', axis=1)

# holiday drop
if not args['holiday']['exec']:
    ''' holiday : drop '''
    train = train.drop('holiday', axis=1)
    test = test.drop('holiday', axis=1)


' for Valid - mean by 건물번호, 요일, 시간 '

' max by 건물번호 '

' min by 건물번호 '

' object -> category '

' dayofweek : drop '

In [10]:
# 전처리&이상치 처리 후 건물번호 별 그래프 확인
if False:
    ''' show graph '''
    fig, ax = plt.subplots(nrows=len(args['buildings']), figsize=(20, 3*len(args['buildings'])))
    for i, b_num in enumerate(args['buildings']):
        _ = ax[i].set_title(f'{b_num}')
        _ = sns.lineplot(data=train.loc[train['건물번호'] == b_num], y='전력소비량(kWh)', x='datehour', color='b', ax=ax[i])

In [11]:
# Drop
feats = args['drop-origin-feats']
if args['valid']:
    ''' drop - for Valid '''
    train = train.drop(feats, axis=1)
else:
    ''' drop - for Test '''
    train = train.drop(feats, axis=1)
    test = test.drop(feats, axis=1)

' drop - for Valid '

### Train Test Split

In [12]:
if args['valid']:
    ''' train valid split '''  # train 데이터의 각 건물번호 별 마지막 7일을 Valid Dataset으로 설정
    valid = train.loc[train['datehour'] >= '2022-08-18'].reset_index(drop=True)
    train = train.loc[train['datehour'] < '2022-08-18'].reset_index(drop=True)

    ''' for Valid - drop '''
    feats = ['datehour', 'date', '강수량(mm)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', '연면적(m2)', '냉방면적(m2)']  #, 'humidex']
    v_datehour = valid['datehour']
    train.drop(feats, axis=1, inplace=True)
    valid.drop(feats, axis=1, inplace=True)
    test.drop(feats, axis=1, inplace=True)

    ''' X y split '''
    target = '전력소비량(kWh)'
    X_train, y_train = train.drop(target, axis=1), train[target]
    X_valid, y_valid = valid.drop(target, axis=1), valid[target]

    X_train.shape
    X_valid.shape
    X_train.columns

else:
    ''' for Test - drop '''
    feats = ['datehour', 'date', '강수량(mm)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)', '연면적(m2)', '냉방면적(m2)']  #, 'humidex']
    train.drop(feats, axis=1, inplace=True)
    test.drop(feats, axis=1, inplace=True)

    train.shape
    test.shape
    train.columns

' train valid split '

' for Valid - drop '

' X y split '

(14976, 11)

(1344, 11)

Index(['건물번호', '기온(C)', 'hour', 'ampm', 'holiday', 'humidex',
       'humidex_maxaroll', 'humidex_hroll', '전력소비량(kWh)_bdh',
       '전력소비량(kWh)_bmax', '전력소비량(kWh)_bmin'],
      dtype='object')

### Hyperparameter Tuning

In [13]:
# Set & Run Tuning
if args['tuning']:
    best_num_boost_rounds = []

    ''' set Tuning '''
    def objective(trial):
        lr = 0.1 # trial.suggest_float('learning_rate', 0.001, 0.1)
        max_depth = trial.suggest_int('max_depth', 3, 13)
        num_leaves = trial.suggest_int('num_leaves', 7, pow(2, max_depth)-1)
        bagging_fraction = trial.suggest_float('bagging_fraction', 0.0, 1.0)
        feature_fraction = trial.suggest_float('feature_fraction', 0.2, 1.0)
        min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 1000)
        # min_gain_to_split = trial.suggest_float('min_gain_to_split', 0.0, 10.0)
        # lambda_l1 = trial.suggest_float('lambda_l1', 0.0, 10.0)
        # lambda_l2 = trial.suggest_float('lambda_l2', 0.0, 10.0)
        # max_cat_threshold = trial.suggest_int('max_cat_threshold', 8, 256)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        params = {
            'objective': weighted_mse(alpha=args['metric-alpha']),
            # 'metric': 'rmse',  # or 'binary_logloss' for classification tasks
            'boosting_type': 'gbdt', # or 'dart'
            'learning_rate': lr,
            'max_depth': max_depth,
            'num_leaves': num_leaves,
            'bagging_fraction': bagging_fraction,
            'feature_fraction': feature_fraction,
            'min_data_in_leaf': min_data_in_leaf,
            # 'min_gain_to_split': min_gain_to_split,
            # 'lambda_l1': lambda_l1,
            # 'lambda_l2': lambda_l2,
            # 'max_cat_threshold': max_cat_threshold,
            # 'verbose': 0
        }
        
        early_stopping = True
        model = lgb.train(
            params,
            lgb_train,
            feval=smape,
            valid_sets=lgb_eval,
            num_boost_round=100000,
            callbacks=([lgb.early_stopping(stopping_rounds=500)] if early_stopping else None),
        )

        y_pred = model.predict(X_valid)
        score = smape(y_valid, y_pred)
        # score = smape(np.exp(y_valid), np.exp(y_pred))
        best_num_boost_rounds.append(model.best_iteration)
        return score[1]


    ''' run Tuning '''
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=30)


In [14]:
# Save tuning result & Show table
if args['tuning']:
    trials_df = study.trials_dataframe()
    trials_df['best_num_boost_rounds'] = best_num_boost_rounds
    trials_df.drop(['number', 'datetime_start', 'datetime_complete', 'duration', 'state'], axis=1, inplace=True)
    trials_df.sort_values('value')[:3]
    study.best_params

# Visualize
if args['tuning']:
    rows = 2
    fig, ax = plt.subplots(nrows=rows, ncols=4, figsize=(18, 3*rows))
    for i, col in enumerate(trials_df.columns[1:]):
        r, c = divmod(i, 4)
        sns.scatterplot(data=trials_df, x=col, y='value', ax=ax[r][c])

# Save
if args['tuning']:
    ''' save '''
    trials_df.to_csv(f'../../tune/t_lightgbm_{eid}_{args["type"]}.csv', index=False)

### Valid

In [15]:
if args['tuning']:
    params = {
        'objective': weighted_mse(alpha=args['metric-alpha']), # 'regression',  # or 'binary' for classification tasks
        'learning_rate': 0.1,
        'verbose': 0,
    }
    params.update(study.best_params)
    params
else:
    params = {
        'objective': weighted_mse(alpha=args['metric-alpha']),
        'learning_rate': 0.1,
        'verbose': 0,
        'max_depth': 7,
        'num_leaves': 76,
        'bagging_fraction': 0.26367812662431633,
        'feature_fraction': 0.8617789081931919,
        'min_data_in_leaf': 199
    }

In [16]:
# lgbm model train
if args['valid']:
    ''' lgbm model train '''
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)
    
    early_stopping = True
    model = lgb.train(
        params,
        lgb_train,
        feval=smape,
        valid_sets=lgb_eval,
        num_boost_round=100000,
        callbacks=([lgb.early_stopping(stopping_rounds=500)] if early_stopping else None),
    )

    y_pred = model.predict(X_valid)

    best_num_boost_round = model.best_iteration
    score = smape(y_valid, y_pred)[1].round(5)
    # score = smape(np.exp(y_valid), np.exp(y_pred))[1]
    feature_importances_split = model.feature_importance(importance_type='split')
    feature_importances_gain = model.feature_importance(importance_type='gain')


' lgbm model train '

You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[34]	valid_0's smape: 4.26315


In [17]:
if args['valid']:
    score, best_num_boost_round, feature_importances_split, feature_importances_gain
    X_valid.columns 

(4.26315,
 34,
 array([ 33, 191, 170,  32,  15,  67, 297, 184, 456,  87,  27]),
 array([5.93848588e+07, 1.73495350e+09, 3.53027581e+09, 5.53211344e+08,
        1.83996009e+07, 4.66953964e+09, 8.21405162e+09, 8.76362516e+09,
        2.26428279e+11, 2.78011526e+10, 1.21424592e+09]))

Index(['건물번호', '기온(C)', 'hour', 'ampm', 'holiday', 'humidex',
       'humidex_maxaroll', 'humidex_hroll', '전력소비량(kWh)_bdh',
       '전력소비량(kWh)_bmax', '전력소비량(kWh)_bmin'],
      dtype='object')

In [18]:
# 변수 하나씩 제거해보기
if args['valid']:
    result = []
    
    ''' train with out one feature '''
    for col in X_train.columns:
        lgb_train = lgb.Dataset(X_train.drop(col, axis=1), y_train)
        lgb_eval = lgb.Dataset(X_valid.drop(col, axis=1), y_valid, reference=lgb_train)

        early_stopping = True
        model = lgb.train(
            params,
            lgb_train,
            feval=smape,
            valid_sets=lgb_eval,
            num_boost_round=100000,
            callbacks=([lgb.early_stopping(stopping_rounds=500)] if early_stopping else None),
        )

        y_pred = model.predict(X_valid.drop(col, axis=1))
        score = smape(y_valid, y_pred)[1]
        result.append((col, score.round(5)))


' train with out one feature '

You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[33]	valid_0's smape: 4.80847
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[31]	valid_0's smape: 4.67672
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[32]	valid_0's smape: 4.85658
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[32]	valid_0's smape: 4.71434
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 500 rounds
Early stopping, best iteration is:
[32]	valid_0's smape: 4.77186
You can set `force_col_wise=true` to remove the overhead.
Training until va

In [19]:
result

[('건물번호', 4.80847),
 ('기온(C)', 4.67672),
 ('hour', 4.85658),
 ('ampm', 4.71434),
 ('holiday', 4.77186),
 ('humidex', 4.67684),
 ('humidex_maxaroll', 5.48533),
 ('humidex_hroll', 4.65883),
 ('전력소비량(kWh)_bdh', 5.1585),
 ('전력소비량(kWh)_bmax', 4.57845),
 ('전력소비량(kWh)_bmin', 4.60976)]

In [None]:
# make & save valid df
if args['valid']:
    ''' make '''
    check_df = X_valid.copy()
    check_df['true'] = y_valid
    check_df['pred'] = y_pred
    check_df['error'] = check_df['true'] - check_df['pred']
    check_df['datehour'] = v_datehour
    ''' save '''
    check_df.to_csv(f'../../valid/v_lightgbm_{eid}_{args["type"]}.csv', index=False)

In [None]:
# Show Valid pred Graph
if args['valid']:
    fig, ax = plt.subplots(nrows=len(args['buildings']), figsize=(20, 3*len(args['buildings'])))
    for i, bnum in enumerate(args['buildings']):
        _ = ax[i].set_title(f'{bnum}')
        _ = sns.lineplot(check_df.loc[check_df['건물번호'] == bnum]['true'], color='b', ax=ax[i])
        _ = sns.lineplot(check_df.loc[check_df['건물번호'] == bnum]['pred'], color='orange', ax=ax[i])

### Submission

In [None]:
# lgbm model train
if not args['valid']:
    X, y = train.drop('전력소비량(kWh)', axis=1), train['전력소비량(kWh)']
    
    df_sub['건물번호'] = df_sub['num_date_time'].str.split('_').str[0].astype('int')
    df_sub = df_sub.loc[df_sub['건물번호'].isin(args['building']['nums'])].reset_index(drop=True)

    ''' train '''
    lgb_train = lgb.Dataset(X, y)
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=best_num_boost_round,
    )
    y_pred = model.predict(test)
    df_sub['answer'] = y_pred

In [None]:
''' min max '''
min(y_pred), max(y_pred)

''' Save '''
df_sub.to_csv(f'../../fraction/lightgbm_{eid}_{args["type"]}.csv', index=False)