In [None]:
import os
import sys
import math
import optuna
import random
import pandas as pd
import numpy as np

from tqdm import tqdm
from datetime import datetime
from xgboost import XGBRegressor
from sktime.forecasting.model_selection import temporal_train_test_split

In [None]:
# Seed NumPy's global RNG so that NumPy-based randomness is repeatable across runs.
np.random.seed(42)

# Directory that holds the competition CSV files read further down
# (building_info.csv, train.csv, test.csv, sample_submission.csv).
# Set the DACON_BASE_PATH environment variable to run on another machine;
# when it is unset the original hard-coded location is used unchanged.
base_path = os.environ.get(
    'DACON_BASE_PATH',
    'C:/Users/dlwks/OneDrive/바탕 화면/VSCode/DACON_전력사용량/base_path',
)

In [None]:
def SMAPE(true, pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Computes ``mean(|true - pred| / (|true| + |pred|)) * 200``.

    Parameters
    ----------
    true : array-like
        Observed values.
    pred : array-like
        Predicted values, paired with ``true`` by position.

    Returns
    -------
    float
        The SMAPE score; lower is better.
    """
    true = np.asarray(true, dtype = float)
    pred = np.asarray(pred, dtype = float)

    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)

    # Where both values are exactly 0 the term is 0/0; count it as a perfect
    # match (0) instead of letting a NaN poison the whole mean.
    ratio = np.divide(abs_error, scale, out = np.zeros_like(abs_error), where = scale != 0)
    return np.mean(ratio) * 200


# The metric was originally defined under the misspelled name ``SAMPE`` while
# the rest of the notebook calls ``SMAPE``; keep the old name as an alias.
SAMPE = SMAPE

In [None]:
def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns the gradient and
    hessian of the squared error with respect to ``pred``. Elements where
    ``label - pred`` is positive (the prediction is below the label) have both
    terms multiplied by ``alpha``; all other elements use plain squared error.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # alpha where the model under-predicts, 1 everywhere else
        penalty = np.where(error > 0, alpha, 1.0)
        gradient = -2 * penalty * error
        hessian = 2 * penalty
        return gradient, hessian
    return weighted_mse_fixed

In [None]:
def add_data(df):
    """Augment ``df`` with jittered copies of its weather columns.

    Two passes are made (NumPy's global RNG is re-seeded with 0, then 1). In
    each pass every row is duplicated with 'temp', 'prec', 'wind' and 'hum'
    multiplied by an independent uniform factor in [0.9, 1.1) and rounded to
    one decimal, so the result has four times as many rows as the input.
    The result is sorted by 'building' and 'date_time' with a fresh index.
    """
    noisy_columns = ['temp', 'prec', 'wind', 'hum']

    for seed in range(2):
        np.random.seed(seed)
        row_count = len(df)

        # One vector of factors is drawn per column, in the order listed above.
        jittered = pd.DataFrame({
            column : np.round(df[column] * np.random.uniform(0.9, 1.1, row_count), 1)
            for column in noisy_columns
        })

        augmented = df.copy()
        augmented.update(jittered)
        df = pd.concat([df, augmented], ignore_index = True)

    return df.sort_values(by = ['building', 'date_time']).reset_index(drop = True)

In [None]:
def weather(train):
    """Add a 0/1 'weather' column flagging rows at or next to precipitation.

    Every row whose 'prec' is positive marks itself and the rows whose index
    label is within 3 of its own, limited to labels in ``[0, len(train))``.
    ``train`` is modified in place and also returned.
    """
    rainy_labels = train.index[train['prec'] > 0].tolist()
    train['weather'] = 0
    row_count = len(train)

    for center in rainy_labels:
        for neighbor in range(center - 3, center + 4):
            if neighbor < 0 or neighbor >= row_count:
                continue
            train.loc[neighbor, 'weather'] = 1

    return train

In [None]:
def time_features(data, mode):
    """Add calendar, cyclical-hour and per-building holiday columns to ``data``.

    Columns written: 'hour', 'dow', 'month', 'week', 'day', 'sin_time',
    'cos_time', 'holiday', 'date' and 'week_of_month'. ``data`` is modified in
    place and also returned.

    Parameters
    ----------
    data : DataFrame
        Must contain 'date_time' and 'building' columns.
    mode : str
        'byb' / 'gu_byb' apply date-list holidays to buildings 37-42 and 86;
        'all' / 'gu_all' apply them to building 86 only (with one extra date).

    Notes
    -----
    The holiday rules are applied in sequence and later assignments overwrite
    earlier ones, so the statement order below is significant.
    """
    date = pd.to_datetime(data.date_time)
    
    data['hour'] = date.dt.hour
    data['dow'] = date.dt.weekday
    data['month'] = date.dt.month
    data['week'] = date.dt.isocalendar().week.astype(np.int32)
    data['day'] = date.dt.day

    # Encode the hour of day on the unit circle (24-hour period).
    data['sin_time'] = np.sin(2 * np.pi * data['hour'] / 24)
    data['cos_time'] = np.cos(2 * np.pi * data['hour'] / 24)
    
    # Baseline rule: Saturday/Sunday (dow 5, 6) are holidays, weekdays are not.
    data['holiday'] = data.apply(lambda x : 0 if x['dow'] < 5 else 1, axis = 1)
    # NOTE(review): this format has no hour component — confirm it matches the
    # raw date_time strings on the pandas version in use.
    data['date'] = pd.to_datetime(data['date_time'], format='%Y-%m-%d').dt.date
    
    # Extra holiday dates for buildings 2, 3 and 54 (one list per building, same order).
    building_dates = [['2022-06-07', '2022-06-17'], ['2022-07-31', '2022-07-23', '2022-07-20'], ['2022-08-16', '2022-08-17']]
    
    # For these buildings the weekend rule is discarded: holidays are Mondays
    # (dow == 0) plus the listed dates.
    for index, b in enumerate([2, 3 ,54]):
        data.loc[data['building'] == b, 'holiday'] = 0
        data.loc[(data['building'] == b) & (data['dow'] == 0) , 'holiday'] = 1
        data.loc[(data['building'] == b) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1
    
    # Three fixed dates are holidays for every building except 14; building 14
    # gets 2022-06-14 instead. Building 85 never has a holiday.
    data.loc[(data['building'] != 14) & (data['date'].isin([pd.to_datetime(i).date() for i in ['2022-06-01', '2022-06-06', '2022-08-15']])), 'holiday'] = 1
    data.loc[(data['building'] == 14) & (data['date'].isin([pd.to_datetime(i).date() for i in ['2022-06-14']])) , 'holiday'] = 1
    data.loc[data['building'] == 85, 'holiday'] = 0
    
    def week_of_month(date):
        """Return 1 for a Sunday that falls in an even-numbered week of its month, else 0.

        The week number is the ISO week of ``date`` minus the ISO week of the
        first day of the same month, plus 1.
        """
        first_day = date.replace(day = 1)
        if (date.isocalendar().week - first_day.isocalendar().week + 1) % 2 == 0:
            if date.weekday() == 6:
                return 1
        return 0

    data['week_of_month'] = data['date'].apply(week_of_month)

    # Buildings 87-92: the only holidays are the Sundays flagged by week_of_month.
    target_buildings = [87, 88, 89, 90, 91, 92]
    data.loc[(data['building'].isin(target_buildings)) , 'holiday'] = 0
    data.loc[(data['building'].isin(target_buildings)) & (data['week_of_month'] == 1), 'holiday'] = 1
    
    # Holiday dates for buildings 37, 38, 39, 40, 41, 42 and 86 (one list per
    # building, same order; the last list belongs to building 86).
    building_dates = [['2022-06-20', '2022-07-11', '2022-08-08', '2022-06-17'], ['2022-06-13', '2022-07-25', '2022-08-01'],
                     ['2022-07-18', '2022-08-08'], ['2022-06-20', '2022-07-18', '2022-06-17', '2022-08-08'],
                     ['2022-06-27', '2022-07-25', '2022-08-08'], ['2022-06-13', '2022-07-11', '2022-08-22'],
                     ['2022-06-10', '2022-08-10', '2022-07-10', '2022-07-24', '2022-06-26', '2022-08-28']]
    # Per-building modes: for all seven buildings the holidays are exactly the listed dates.
    if mode == 'byb' or mode == 'gu_byb':
        for index, b in enumerate([37, 38, 39, 40, 41, 42, 86]):
            data.loc[data['building'] == b, 'holiday'] = 0
            data.loc[(data['building'] == b) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[index]])), 'holiday'] = 1
            
    # Pooled modes: only building 86 is overridden here, using its list plus
    # 2022-07-30; buildings 37-42 keep the holiday values assigned above.
    if mode == 'all' or mode == 'gu_all':
        data.loc[data['building'] == 86, 'holiday'] = 0
        data.loc[(data['building'] == 86) & (data['date'].isin([pd.to_datetime(i).date() for i in building_dates[-1] + ['2022-07-30']])), 'holiday'] = 1
    
    # Replace the date-object column with a datetime column parsed from date_time.
    data['date'] = pd.to_datetime(data['date_time'], format = '%Y-%m-%d')
    return data

In [None]:
def side_indicator(data):
    """Add the derived weather indicators 'THI', 'WC' and 'CDH' to ``data``.

    Parameters
    ----------
    data : DataFrame
        Must contain 'building', 'temp', 'wind' and 'hum' columns. Rows of a
        building are taken in their existing order for the running CDH sum.

    Returns
    -------
    DataFrame
        The same object, modified in place, with three new columns:

        * 'THI' - temperature-humidity index,
          ``1.8*T - 0.55*(1 - RH/100)*(1.8*T - 26) + 32``.
        * 'WC'  - wind-chill style apparent temperature computed from 'temp'
          and 'wind'.
        * 'CDH' - cooling degree hours: per building, the sum of
          ``temp - 26`` over the current row and up to 11 preceding rows.
    """
    temp = data['temp']
    hum = data['hum']

    # The columns are already renamed to 'temp'/'hum' by the callers, so the raw
    # Korean headers must not be used here. The correction term is based on
    # temperature (1.8*T - 26), not on humidity.
    temp_scaled = 9 / 5 * temp
    data['THI'] = temp_scaled - 0.55 * (1 - hum / 100) * (temp_scaled - 26) + 32  # temperature-humidity index

    wind_pow = data['wind'] ** 0.16
    data['WC'] = 13.12 + 0.6215 * temp - 13.947 * wind_pow + 0.486 * temp * wind_pow  # apparent (wind-chill) temperature

    def calculate_cdh(xs):
        """Trailing 12-sample sum of ``xs - 26`` (shorter window at the start)."""
        excess = xs - 26
        return np.array([np.sum(excess[max(0, i - 11) : i + 1]) for i in range(len(xs))])

    # Compute per building and let pandas align the result on the index, so the
    # frame does not have to be sorted by building or hold exactly buildings 1..100.
    data['CDH'] = data.groupby('building')['temp'].transform(lambda s: calculate_cdh(s.to_numpy()))  # cooling degree hours

    return data

In [None]:
def summer_cos(date):
    """Cosine of the position of ``date`` within the 2022-06-01 .. 2022-09-14 span.

    The span is mapped onto one full turn (2*pi), so the value is 1.0 at the
    start of the span and comes back to 1.0 at its end.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    span_start = datetime.strptime("2022-06-01 00:00:00", stamp_format)
    span_end = datetime.strptime("2022-09-14 00:00:00", stamp_format)

    span_seconds = (span_end - span_start).total_seconds()
    elapsed_seconds = (date - span_start).total_seconds()

    angle = 2 * math.pi * elapsed_seconds / span_seconds
    return math.cos(angle)

In [None]:
def summer_sin(date):
    """Sine of the position of ``date`` within the 2022-06-01 .. 2022-09-14 span.

    The span is mapped onto one full turn (2*pi), so the value is 0.0 at the
    start of the span and peaks a quarter of the way through it.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    span_start = datetime.strptime("2022-06-01 00:00:00", stamp_format)
    span_end = datetime.strptime("2022-09-14 00:00:00", stamp_format)

    span_seconds = (span_end - span_start).total_seconds()
    elapsed_seconds = (date - span_start).total_seconds()

    angle = 2 * math.pi * elapsed_seconds / span_seconds
    return math.sin(angle)

In [None]:
def temp_features(data):
    """Attach daily temperature statistics to every row.

    For each (building, day, month) group the following columns are merged
    back onto ``data``:

    * 'avg_temp'  - mean of 'temp' over the rows whose hour is a multiple of 3,
    * 'max_temp'  - maximum 'temp' over all rows of the group,
    * 'min_temp'  - minimum 'temp' over all rows of the group,
    * 'temp_diff' - ``max_temp - min_temp``.

    Parameters
    ----------
    data : DataFrame
        Must contain 'building', 'day', 'month', 'hour' and 'temp' columns.

    Returns
    -------
    DataFrame
        A new frame (the result of left merges) with the four columns added.
    """
    keys = ['building', 'day', 'month']

    def _daily_stat(frame, aggfunc, name):
        # One row per (building, day, month) holding the aggregated 'temp' as `name`.
        # `keys` is an ordered list for every statistic, and the aggregation is
        # given by name so pandas does not have to translate a NumPy callable.
        stat = pd.pivot_table(frame, values = 'temp', index = keys, aggfunc = aggfunc).reset_index()
        return stat.rename(columns = {'temp' : name})

    # The average uses only the 3-hourly observations (hour 0, 3, 6, ...).
    three_hourly = data[data['hour'] % 3 == 0]
    data = pd.merge(data, _daily_stat(three_hourly, 'mean', 'avg_temp'), on = keys, how = 'left')
    data = pd.merge(data, _daily_stat(data, 'max', 'max_temp'), on = keys, how = 'left')
    data = pd.merge(data, _daily_stat(data, 'min', 'min_temp'), on = keys, how = 'left')

    data['temp_diff'] = data['max_temp'] - data['min_temp']

    return data

In [None]:
def process_data(data, mode):
    """Fill missing values and run every feature-engineering step on ``data``.

    Steps, in order: forward-fill 'wind' and 'hum', replace all remaining
    missing values with 0, then apply ``time_features``, ``side_indicator``
    and ``temp_features``, and finally add the 'summer_cos' / 'summer_sin'
    columns computed from the 'date' column.

    Parameters
    ----------
    data : DataFrame
        Frame with the renamed columns ('building', 'date_time', 'temp',
        'prec', 'wind', 'hum', ...).
    mode : str
        Passed through to ``time_features``.

    Returns
    -------
    DataFrame
        The processed frame.
    """
    # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling and
    # produces the same result.
    data['wind'] = data['wind'].ffill()
    data['hum'] = data['hum'].ffill()
    data = data.fillna(0)

    data = time_features(data, mode)
    data = side_indicator(data)
    data = temp_features(data)

    data['summer_cos'] = data['date'].apply(summer_cos)
    data['summer_sin'] = data['date'].apply(summer_sin)

    return data

In [None]:
def mean_std(train, test, mode):
    """Add target statistics learned from ``train`` to both ``train`` and ``test``.

    Parameters
    ----------
    train : DataFrame
        Must contain 'building', 'hour', 'dow', 'holiday', 'target' (and
        'date_time' when ``mode`` is 'all' or 'gu_all').
    test : DataFrame
        Same key columns as ``train``, without 'target'.
    mode : str
        * 'byb'  - scale 'target' by a per-weekday ratio before aggregating.
        * 'all'  - same, with every ratio lowered by 0.005.
        * 'all' / 'gu_all' - after the statistics are merged, overwrite the
          'holiday' flag of buildings 37-42 with fixed date lists.
        Any other value leaves 'target' and 'holiday' untouched.

    Returns
    -------
    (DataFrame, DataFrame)
        ``train`` and ``test`` with 'dow_hour_mean', 'holiday_mean',
        'holiday_std', 'hour_mean' and 'hour_std' added. Note that the scaled
        'target' is kept in the returned ``train``.
    """
    # Per-weekday multiplier, indexed by 'dow' (0 = Monday ... 6 = Sunday).
    ratio = np.array([0.985] + [0.98] * 2 + [0.995] * 2 + [0.99] * 2)
    if mode == 'all':
        ratio -= 0.005
    if mode in ('byb', 'all'):
        # Vectorized lookup; equivalent to multiplying row by row.
        train['target'] = train['target'] * ratio[train['dow'].to_numpy()]

    # (grouping keys, aggregation, output column) - merged in this order.
    # Aggregations are given by name: 'std' is the sample standard deviation
    # (ddof=1), which is what passing np.std to pivot_table currently resolves to.
    stat_specs = [
        (['building', 'hour', 'dow'], 'mean', 'dow_hour_mean'),
        (['building', 'hour', 'holiday'], 'mean', 'holiday_mean'),
        (['building', 'hour', 'holiday'], 'std', 'holiday_std'),
        (['building', 'hour'], 'mean', 'hour_mean'),
        (['building', 'hour'], 'std', 'hour_std'),
    ]
    for keys, aggfunc, name in stat_specs:
        stat = pd.pivot_table(train, values = 'target', index = keys, aggfunc = aggfunc).reset_index()
        stat = stat.rename(columns = {'target': name})
        train = pd.merge(train, stat, on = keys, how = 'left')
        test = pd.merge(test, stat, on = keys, how = 'left')

    # The holiday statistics above were computed with the holiday flags as they
    # arrived; only now are the flags of buildings 37-42 replaced.
    if mode == 'all' or mode == 'gu_all':
        building_dates = [['2022-06-20', '2022-07-11', '2022-08-08', '2022-06-17'], ['2022-06-13', '2022-07-25', '2022-08-01'],
                         ['2022-07-18', '2022-08-08'], ['2022-06-20', '2022-07-18', '2022-06-17', '2022-08-08'],
                         ['2022-06-27', '2022-07-25', '2022-08-08'], ['2022-06-13', '2022-07-11', '2022-08-22']]

        for frame in (train, test):
            frame['date'] = pd.to_datetime(frame['date_time'], format = '%Y-%m-%d').dt.date

        for b, dates in zip([37, 38, 39, 40, 41, 42], building_dates):
            holiday_dates = [pd.to_datetime(i).date() for i in dates]
            for frame in (train, test):
                frame.loc[frame['building'] == b, 'holiday'] = 0
                frame.loc[(frame['building'] == b) & (frame['date'].isin(holiday_dates)), 'holiday'] = 1

    return train, test

In [None]:
def process_info(data, is_train=True):
    """Clean the building-information table.

    * Renames the five columns to 'building', 'type', 'all_area', 'cool_area', 'sun'.
    * Converts 'sun' to float, treating '-' as 0.
    * Replaces each 'type' label with an integer code in order of first appearance.
    * Type 7: rows with ``cool_area == 0`` get ``all_area / r`` truncated to int,
      where ``r`` is the all_area/cool_area ratio of the type-7 rows that have a
      non-zero cool_area, skipping the first such row.
    * Type 9: rows with ``cool_area < 500`` get ``all_area / r`` rounded to one
      decimal, where ``r`` is the ratio over the type-9 rows with ``cool_area > 500``.

    ``data`` is modified in place and returned. ``is_train`` is not used.
    """
    data.columns = ['building', 'type', 'all_area', 'cool_area', 'sun']
    data['sun'] = data['sun'].replace('-', 0).astype('float')

    type_codes = {}
    for code, label in enumerate(data['type'].unique()):
        type_codes[label] = code
    data['type'] = data['type'].map(type_codes)

    # --- type 7: fill in cool_area values recorded as 0 ---
    is_type7 = data['type'] == 7
    known7 = data[is_type7 & (data['cool_area'] != 0)]
    ratio7 = known7['all_area'].iloc[1:].sum() / known7['cool_area'].iloc[1:].sum()
    missing7 = is_type7 & (data['cool_area'] == 0)
    data.loc[missing7, 'cool_area'] = (data.loc[missing7, 'all_area'] / ratio7).astype('int')

    # --- type 9: replace cool_area values below 500 ---
    is_type9 = data['type'] == 9
    large9 = data[is_type9 & (data['cool_area'] > 500)]
    ratio9 = large9['all_area'].sum() / large9['cool_area'].sum()
    small9 = is_type9 & (data['cool_area'] < 500)
    data.loc[small9, 'cool_area'] = (data.loc[small9, 'all_area'] / ratio9).round(1)

    return data

In [None]:
def get_train_and_test_data(mode):
    """Load the CSV files under ``base_path`` and return processed train/test frames.

    Reads building_info.csv, train.csv and test.csv, renames the columns, runs
    ``process_data`` on both sets and ``mean_std`` on the pair. For the pooled
    modes ('all', 'gu_all') the cleaned building information is left-merged
    onto both frames by 'building'.
    """
    weather_columns = ['num_date_time', 'building', 'date_time', 'temp', 'prec', 'wind', 'hum']

    info_raw = pd.read_csv(os.path.join(base_path,'building_info.csv'))
    building_info = process_info(info_raw.drop(['ESS저장용량(kWh)', 'PCS용량(kW)'], axis=1))

    train_raw = pd.read_csv(os.path.join(base_path,'train.csv'))
    train_data = train_raw.drop(['일조(hr)', '일사(MJ/m2)'], axis=1)
    train_data.columns = weather_columns + ['target']
    train_data = process_data(train_data, mode)

    test_data = pd.read_csv(os.path.join(base_path,'test.csv'))
    test_data.columns = weather_columns
    test_data = process_data(test_data, mode)

    train_data, test_data = mean_std(train_data, test_data, mode)

    if mode in ('all', 'gu_all'):
        train_data = train_data.merge(building_info, on='building', how='left')
        test_data = test_data.merge(building_info, on='building', how='left')

    return train_data, test_data

In [None]:
# Columns kept for the per-building ("gu_byb") models. The first three are
# identifiers that are dropped before fitting; 'target' must stay last because
# the test frame is selected with gu_byb[:-1].
gu_byb = ['num_date_time', 'building', 'date_time', 'temp', 'wind', 'hum',
       'dow', 'month', 'week', 'dow_hour_mean', 'holiday',
       'holiday_mean', 'holiday_std', 'hour_mean', 'hour_std', 'sin_time',
       'cos_time', 'THI', 'WC', 'CDH', 'target']

train, test = get_train_and_test_data('gu_byb')

train = train[gu_byb]
test = test[gu_byb[:-1]]

# Hyper-parameters shared by the validation and the final models.
xgb_params = dict(colsample_bytree=0.8, eta=0.01, max_depth=5,
                  min_child_weight=6, subsample=0.9)
# Weight applied by the custom objective to under-predictions.
objective_alpha = 100

# ---- Step 1: per-building validation to find the number of boosting rounds ----
scores = []   # validation SMAPE per building
best_it = []  # best number of trees per building (best_iteration + 1)

score = pd.DataFrame({'building':range(1,101)})
for i in tqdm(range(100)):
    building_rows = train.loc[train.building == i+1, ]
    y = building_rows['target']
    # iloc[:, 3:] drops num_date_time / building / date_time
    x = building_rows.iloc[:, 3:].drop(['target'], axis=1)
    # hold out the last 168 rows of each building for validation
    y_train, y_valid, x_train, x_valid = temporal_train_test_split(y = y, X = x, test_size = 168)

    xgb = XGBRegressor(**xgb_params, n_estimators=2000, early_stopping_rounds=50,
                       eval_metric=SMAPE, objective=weighted_mse(objective_alpha))

    xgb.fit(x_train, y_train, eval_set=[(x_train, y_train),
                                            (x_valid, y_valid)], verbose=False)

    y_pred = xgb.predict(x_valid)

    sm = SMAPE(y_valid, y_pred)
    scores.append(sm)
    best_it.append(xgb.best_iteration+1)

score['score'] = scores
print(sum(scores)/len(scores))
print(sum(best_it)/len(best_it))

# ---- Step 2: refit on all training rows and predict the test period ----
building_preds = []

for i in tqdm(range(100)):
    # The data slices do not depend on the seed, so build them once per building.
    building_rows = train.loc[train.building == i+1, ]
    y_train = building_rows['target']
    x_train = building_rows.iloc[:, 3:].drop(['target'], axis=1)
    x_test = test.loc[test.building == i+1, ].iloc[:,3:]

    seed_preds = []
    for seed in [0,1,2,3,4]:
        # Use the same objective as in validation: best_it[i] was selected for
        # models trained with it, so the final models must be trained the same way.
        xgb = XGBRegressor(**xgb_params, seed=seed, n_estimators=best_it[i],
                           objective=weighted_mse(objective_alpha))

        xgb.fit(x_train, y_train)
        seed_preds.append(xgb.predict(x_test))

    # average the five seeds row by row
    building_preds.append(np.mean(np.column_stack(seed_preds), axis=1))

# Predictions are concatenated in building order 1..100, which is assumed to
# match the row order of sample_submission.csv.
preds = np.concatenate(building_preds)

submission = pd.read_csv(os.path.join(base_path,'sample_submission.csv'))
submission['answer'] = preds
submission.to_csv('./gu_byb.csv', index = False)

del train, test, scores, best_it, submission