In [1]:
import numpy as np
import pandas as pd

# preprocessing
import os
from datetime import datetime

# EDA
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats
from scipy.stats import skew
from scipy.stats import spearmanr

# Learning algorithms
import sklearn
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.cluster import KMeans

import lightgbm as lgb
from lightgbm import LGBMRegressor
# import catboost

# model validation
from sklearn.model_selection import KFold

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# other libraries — confirm that is intended.
warnings.filterwarnings('ignore')

# Fixed seed for NumPy's global random number generator.
SEED = 2
np.random.seed(SEED)

In [2]:
from matplotlib import font_manager, rc

# Draw negative tick labels with the ASCII hyphen instead of the Unicode minus
# sign, which may be missing from the font configured below.
plt.rcParams['axes.unicode_minus'] = False

# Font file used for Korean labels (Malgun Gothic on Windows).
# The original author noted AppleGothic.ttf as the macOS alternative; adjust
# this path for your system.
f_path = "c:/Windows/Fonts/malgun.ttf"

if os.path.exists(f_path):
    font_name = font_manager.FontProperties(fname=f_path).get_name()
    rc('font', family=font_name)
else:
    # Loading a missing font file raises and aborts the whole cell, so keep
    # matplotlib's default font instead. print() is used because warnings are
    # globally silenced in this notebook.
    font_name = None
    print(f'Font file not found: {f_path} - keeping the default matplotlib font.')

In [3]:
# Project root. Defaults to the original author's local checkout; set the
# COMPETITION_MAIN_PATH environment variable to run the notebook elsewhere
# without editing this cell.
main_path = os.environ.get(
    'COMPETITION_MAIN_PATH',
    'C:/Users/Playdata/Documents/DataScience/source/99_Competition/source',
)
data_dir = 'data'
dir_path = f'{main_path}/{data_dir}'

# File names expected inside dir_path.
test_name = 'test.csv'
train_name = 'train.csv'
building_name = 'building_info.csv'
sample_submission_name = 'sample_submission.csv'

In [4]:
def rename_dict(df):
    """Strip the unit suffix (e.g. ``(C)``, ``(kWh)``) from known column names.

    A single mapping covers the train, test and building frames. Keys that are
    not present in ``df`` are ignored by ``DataFrame.rename``, so the function
    no longer has to guess which frame it received from the number of
    parenthesised columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call-site convenience.
    """
    unit_free_names = {
        # weather columns and the target (train / test frames)
        '기온(C)': '기온',
        '강수량(mm)': '강수량',
        '풍속(m/s)': '풍속',
        '습도(%)': '습도',
        '일조(hr)': '일조',
        '일사(MJ/m2)': '일사',
        '전력소비량(kWh)': '전력소비량',
        # building metadata columns (building frame)
        '연면적(m2)': '연면적',
        '냉방면적(m2)': '냉방면적',
        '태양광용량(kW)': '태양광용량',
        'ESS저장용량(kWh)': 'ESS저장용량',
        'PCS용량(kW)': 'PCS용량',
    }
    df.rename(columns=unit_free_names, inplace=True)
    return df

def data_datetime_setting(df):
    """Convert the '일시' column to datetime and derive calendar columns from it.

    The timestamp is parsed once with ``pd.to_datetime`` and every derived
    column comes from that single parsed Series. Because ``pd.to_datetime``
    accepts an already-converted column, re-running the function on the same
    frame no longer fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an '일시' column formatted as ``"%Y%m%d %H"`` (or already
        datetime). It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with '일시' as datetime plus the added columns
        '요일' (abbreviated weekday name via ``strftime("%a")``), '연' (year),
        '월' (month), '일' (day) and '시간' (hour).
    """
    date_format = "%Y%m%d %H"
    timestamps = pd.to_datetime(df['일시'], format=date_format)

    df['일시'] = timestamps
    # Abbreviated weekday name, same "%a" directive the row-wise version used.
    df['요일'] = timestamps.dt.strftime("%a")
    df['연'] = timestamps.dt.year
    df['월'] = timestamps.dt.month
    df['일'] = timestamps.dt.day
    df['시간'] = timestamps.dt.hour
    return df

def data_preprocessing(df):
    """Fill missing weather values, creating absent sunshine/solar columns as 0.

    Replaces the previous ``try / bare except`` control flow with explicit
    column checks, so unrelated errors are no longer swallowed and an existing
    '일조' column is not overwritten just because '일사' is missing. Columns
    are assigned back instead of using chained ``inplace=True`` fills, and
    ``Series.ffill()`` replaces the deprecated ``fillna(method='ffill')``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a '강수량' column and, optionally, '풍속', '습도', '일조'
        and '일사'. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` after filling.

    Raises
    ------
    KeyError
        If the '강수량' column is missing.
    """
    # Missing precipitation is filled with 0.
    df['강수량'] = df['강수량'].fillna(0)

    # Wind speed and humidity carry the previous observation forward.
    for col in ('풍속', '습도'):
        if col in df.columns:
            df[col] = df[col].ffill()

    # Sunshine / solar radiation: fill gaps with 0, or create the column as
    # all-zero when the frame does not have it (the test file, per the caller
    # comments in this notebook).
    for col in ('일조', '일사'):
        if col in df.columns:
            df[col] = df[col].fillna(0)
        else:
            df[col] = 0
    return df

def merge_dfs(df, building=None):
    """Join a weather/consumption frame with building metadata and clean it.

    Parameters
    ----------
    df : pandas.DataFrame
        Train or test frame; must contain 'num_date_time'.
    building : pandas.DataFrame, optional
        Building metadata to merge in. When omitted, the notebook-level
        ``building_df`` is used, which keeps existing ``merge_dfs(df=...)``
        calls working.

    Returns
    -------
    pandas.DataFrame
        Merged frame in which every '-' placeholder and every missing value is
        replaced by 0, the three capacity columns are float64, and
        'num_date_time' is removed.
    """
    if building is None:
        building = building_df

    # DataFrame.merge without `on` joins on the columns the two frames share.
    merged = df.merge(building)

    # '-' is treated as "no value" and mapped to 0, as are NaNs, in all columns.
    merged = merged.replace('-', 0).fillna(0)

    capacity_cols = ['태양광용량', 'ESS저장용량', 'PCS용량']
    merged = merged.astype({col: 'float64' for col in capacity_cols})

    # num_date_time is dropped because it can be rebuilt from the building
    # number and the timestamp.
    return merged.drop(columns=['num_date_time'])

# TODO: one-hot encode the 요일 (day of week) and 건물유형 (building type) columns with pd.get_dummies()


In [5]:
# List the entries of the data directory to confirm the expected CSV files are present.
os.listdir(dir_path)

['2019',
 '2021',
 'building_info.csv',
 'sample_submission.csv',
 'test.csv',
 'train.csv']

In [6]:
# Submission format: show the column names of the sample submission file.
pd.read_csv(f'{dir_path}/{sample_submission_name}').columns

Index(['num_date_time', 'answer'], dtype='object')

In [7]:
# Read the three raw CSV files from dir_path, in the order test, train, building.
test_df, train_df, building_df = (
    pd.read_csv(f'{dir_path}/{csv_name}')
    for csv_name in (test_name, train_name, building_name)
)

In [8]:
# Train: strip unit suffixes from the column names, then fill missing values.
# data_datetime_setting is deliberately not applied here, so '일시' keeps its
# original string form.
train_df = data_preprocessing(df=rename_dict(df=train_df))

# Building metadata only needs the column rename.
building_df = rename_dict(df=building_df)

# Test: same two steps as train.
test_df = data_preprocessing(df=rename_dict(df=test_df))

In [9]:
# Attach building metadata to both the train and the test frame.
train_merge, test_merge = (
    merge_dfs(df=frame) for frame in (train_df, test_df)
)

In [11]:
from tqdm import tqdm
import pandas as pd
import statsmodels.api as sm

# Explanatory variables for the per-building multiple regression.
# NOTE(review): '건물번호' and the building-level columns (연면적 ... PCS용량)
# are merged in per building, so they look constant inside each per-building
# fit — confirm whether they should stay in the feature list.
features = ['건물번호', '기온', '강수량', '풍속', '습도', '일조', '일사', '연면적',
            '냉방면적', '태양광용량', 'ESS저장용량', 'PCS용량']
# Dependent variable.
target = '전력소비량'

# One prediction Series per building, in the order the buildings are processed.
predicts = []

# Iterate over the building ids actually present instead of assuming 1..N.
for n in tqdm(sorted(train_merge['건물번호'].unique())):
    test_rows = test_merge['건물번호'] == n

    n_data = train_merge[train_merge['건물번호'] == n].set_index('일시')
    t_data = test_merge[test_rows].set_index('일시')

    # Fit OLS on this building's history. add_constant is applied to the train
    # and test design matrices alike.
    X_train = sm.add_constant(n_data[features])
    y_train = n_data[target]
    model = sm.OLS(y_train, X_train).fit()

    # Predict this building's test rows.
    X_test = sm.add_constant(t_data[features])
    predictions = model.predict(X_test)

    # Previously `predicts` was never filled, so `predicts[n-1]` raised
    # IndexError on a fresh kernel. Store the result and write it back using
    # the same row mask that selected the test rows.
    predicts.append(predictions)
    test_merge.loc[test_rows, 'answer'] = predictions.values

    # Print the regression summary for this building.
    print(model.summary())

  6%|████▉                                                                             | 6/100 [00:00<00:03, 26.05it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.621
Model:                            OLS   Adj. R-squared:                  0.619
Method:                 Least Squares   F-statistic:                     554.1
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        18:25:25   Log-Likelihood:                -16216.
No. Observations:                2040   AIC:                         3.245e+04
Df Residuals:                    2033   BIC:                         3.249e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호       -9.657e+05   1.25e+06     -0.770      0.4


  9%|███████▍                                                                          | 9/100 [00:00<00:04, 21.55it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.213
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     110.1
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          3.93e-103
Time:                        18:25:26   Log-Likelihood:                -13204.
No. Observations:                2040   AIC:                         2.642e+04
Df Residuals:                    2034   BIC:                         2.645e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        7.565e-07   5.11e-08     14.798      0.0

 17%|█████████████▊                                                                   | 17/100 [00:00<00:03, 26.14it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.294
Model:                            OLS   Adj. R-squared:                  0.292
Method:                 Least Squares   F-statistic:                     121.0
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          7.96e-149
Time:                        18:25:26   Log-Likelihood:                -14429.
No. Observations:                2040   AIC:                         2.887e+04
Df Residuals:                    2032   BIC:                         2.892e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호         8.33e+07   2.97e+08      0.281      0.7

 24%|███████████████████▍                                                             | 24/100 [00:00<00:02, 28.10it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.139
Model:                            OLS   Adj. R-squared:                  0.137
Method:                 Least Squares   F-statistic:                     54.84
Date:                Wed, 16 Aug 2023   Prob (F-statistic):           6.09e-63
Time:                        18:25:26   Log-Likelihood:                -16788.
No. Observations:                2040   AIC:                         3.359e+04
Df Residuals:                    2033   BIC:                         3.363e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        -115.7247    322.060     -0.359      0.7

 30%|████████████████████████▎                                                        | 30/100 [00:01<00:02, 24.06it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.514
Model:                            OLS   Adj. R-squared:                  0.512
Method:                 Least Squares   F-statistic:                     306.8
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          1.05e-312
Time:                        18:25:26   Log-Likelihood:                -18754.
No. Observations:                2040   AIC:                         3.752e+04
Df Residuals:                    2032   BIC:                         3.757e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호       -1.016e+06   7.65e+05     -1.329      0.1


 33%|██████████████████████████▋                                                      | 33/100 [00:01<00:03, 20.80it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.372
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     200.3
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          6.37e-201
Time:                        18:25:27   Log-Likelihood:                -14698.
No. Observations:                2040   AIC:                         2.941e+04
Df Residuals:                    2033   BIC:                         2.945e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        7.171e+04   1.03e+06      0.070      0.9

 39%|███████████████████████████████▌                                                 | 39/100 [00:01<00:02, 20.64it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.595
Model:                            OLS   Adj. R-squared:                  0.594
Method:                 Least Squares   F-statistic:                     497.6
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        18:25:27   Log-Likelihood:                -10187.
No. Observations:                2040   AIC:                         2.039e+04
Df Residuals:                    2033   BIC:                         2.043e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호           0.0006   1.96e-06    324.562      0.0

 45%|████████████████████████████████████▍                                            | 45/100 [00:01<00:02, 23.51it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.516
Method:                 Least Squares   F-statistic:                     312.1
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          1.43e-316
Time:                        18:25:27   Log-Likelihood:                -17463.
No. Observations:                2040   AIC:                         3.494e+04
Df Residuals:                    2032   BIC:                         3.499e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호       -3.466e+06   1.76e+07     -0.197      0.8

 51%|█████████████████████████████████████████▎                                       | 51/100 [00:02<00:02, 24.31it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.565
Model:                            OLS   Adj. R-squared:                  0.564
Method:                 Least Squares   F-statistic:                     440.9
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        18:25:27   Log-Likelihood:                -14544.
No. Observations:                2040   AIC:                         2.910e+04
Df Residuals:                    2033   BIC:                         2.914e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호       -4.262e-06   5.67e-07     -7.512      0.0

 58%|██████████████████████████████████████████████▉                                  | 58/100 [00:02<00:01, 28.19it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.509
Model:                            OLS   Adj. R-squared:                  0.508
Method:                 Least Squares   F-statistic:                     301.4
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          1.21e-308
Time:                        18:25:28   Log-Likelihood:                -16488.
No. Observations:                2040   AIC:                         3.299e+04
Df Residuals:                    2032   BIC:                         3.304e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        2.642e+06   8.11e+06      0.326      0.7

 65%|████████████████████████████████████████████████████▋                            | 65/100 [00:02<00:01, 29.24it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.506
Model:                            OLS   Adj. R-squared:                  0.505
Method:                 Least Squares   F-statistic:                     347.2
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          5.28e-307
Time:                        18:25:28   Log-Likelihood:                -14247.
No. Observations:                2040   AIC:                         2.851e+04
Df Residuals:                    2033   BIC:                         2.855e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        9.455e+06   1.15e+07      0.820      0.4

 73%|███████████████████████████████████████████████████████████▏                     | 73/100 [00:02<00:00, 28.85it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.273
Model:                            OLS   Adj. R-squared:                  0.271
Method:                 Least Squares   F-statistic:                     109.0
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          7.32e-136
Time:                        18:25:28   Log-Likelihood:                -17472.
No. Observations:                2040   AIC:                         3.496e+04
Df Residuals:                    2032   BIC:                         3.500e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호         6.86e+05   2.11e+06      0.324      0.7

 80%|████████████████████████████████████████████████████████████████▊                | 80/100 [00:03<00:00, 29.49it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.261
Model:                            OLS   Adj. R-squared:                  0.259
Method:                 Least Squares   F-statistic:                     119.8
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          8.16e-130
Time:                        18:25:28   Log-Likelihood:                -13727.
No. Observations:                2040   AIC:                         2.747e+04
Df Residuals:                    2033   BIC:                         2.751e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        5.688e-06   1.39e-06      4.083      0.0

 89%|████████████████████████████████████████████████████████████████████████         | 89/100 [00:03<00:00, 28.01it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.437
Model:                            OLS   Adj. R-squared:                  0.435
Method:                 Least Squares   F-statistic:                     262.6
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          4.59e-249
Time:                        18:25:28   Log-Likelihood:                -16189.
No. Observations:                2040   AIC:                         3.239e+04
Df Residuals:                    2033   BIC:                         3.243e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호       -4.384e+06   8.08e+06     -0.543      0.5

 95%|████████████████████████████████████████████████████████████████████████████▉    | 95/100 [00:03<00:00, 26.66it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.497
Method:                 Least Squares   F-statistic:                     336.9
Date:                Wed, 16 Aug 2023   Prob (F-statistic):          2.48e-300
Time:                        18:25:29   Log-Likelihood:                -15882.
No. Observations:                2040   AIC:                         3.178e+04
Df Residuals:                    2033   BIC:                         3.182e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호        1.277e+07   2.41e+07      0.531      0.5

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.25it/s]

                            OLS Regression Results                            
Dep. Variable:                  전력소비량   R-squared:                       0.613
Model:                            OLS   Adj. R-squared:                  0.611
Method:                 Least Squares   F-statistic:                     535.7
Date:                Wed, 16 Aug 2023   Prob (F-statistic):               0.00
Time:                        18:25:29   Log-Likelihood:                -15276.
No. Observations:                2040   AIC:                         3.057e+04
Df Residuals:                    2033   BIC:                         3.061e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
건물번호         1.78e+07   2.78e+07      0.641      0.5




건물별로 OLS 회귀 모델을 각각 학습하고, 해당 건물의 테스트 구간 전력소비량을 예측한다.

In [23]:
# Rebuild the submission key as "<building number>_<timestamp>".
building_part = test_merge['건물번호'].astype('str')
timestamp_part = test_merge['일시'].astype('str')
test_merge['num_date_time'] = building_part + '_' + timestamp_part

In [26]:
# test_merge[['num_date_time','answer']].to_csv('../output/submission.csv',index=False)

In [27]:
# Display the two submission columns (key and predicted value) for a visual check.
test_merge[['num_date_time','answer']]

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1816.315399
1,1_20220825 01,1569.815399
2,1_20220825 02,1675.815399
3,1_20220825 03,1630.815399
4,1_20220825 04,1535.315399
...,...,...
16795,100_20220831 19,662.838498
16796,100_20220831 20,555.760373
16797,100_20220831 21,530.842404
16798,100_20220831 22,552.637326
