In [1]:
import pandas as pd
import numpy as np

import holoviews as hv
import hvplot.pandas

import plotly.graph_objects as go

def generate_plotly_scatter(df: pd.DataFrame, group_col, xaxis, yaxis, mode="lines", title="", height=600, xaxis_kwargs=None, yaxis_kwargs=None):
  """Build a Plotly figure with one scatter trace per group of ``df``.

  :param df: source frame
  :param group_col: column (or columns) passed to ``groupby``; each group becomes one trace
  :param xaxis: column plotted on the x axis (also used as the x-axis title)
  :param yaxis: column plotted on the y axis (also used as the y-axis title)
  :param mode: Plotly scatter mode
  :param title: figure title
  :param height: figure height
  :param xaxis_kwargs: optional keyword arguments forwarded to ``fig.update_xaxes``
  :param yaxis_kwargs: optional keyword arguments forwarded to ``fig.update_yaxes``
  :return: the assembled figure
  """
  traces = [
      go.Scatter(x=group[xaxis], y=group[yaxis], mode=mode, name=str(name), opacity=0.75)
      for name, group in df.groupby(by=group_col, observed=True)
  ]

  fig = go.Figure(data=traces)
  fig.update_layout(
    title=title,
    xaxis_title=xaxis,
    yaxis_title=yaxis,
    height=height,
    showlegend=True
    )
  # ``None`` means "no extra options"; avoids a dict default shared between calls.
  fig.update_xaxes(**(xaxis_kwargs or {}))
  fig.update_yaxes(**(yaxis_kwargs or {}))

  return fig

def generate_plotly_hist(df: pd.DataFrame, group_col, yaxis, height=600):
  """Build a Plotly figure with one histogram trace per group of ``df``.

  :param df: source frame
  :param group_col: column (or columns) passed to ``groupby``; each group becomes one trace
  :param yaxis: column whose values are binned (shown on the x axis)
  :param height: figure height
  :return: the assembled figure
  """
  # Group keys may be ints or tuples; stringify them like the scatter helper does.
  traces = [
      go.Histogram(x=group[yaxis], name=str(name), opacity=0.75)
      for name, group in df.groupby(by=group_col, observed=True)
  ]
  fig = go.Figure(data=traces)
  fig.update_layout(title=f"{yaxis} by {group_col}", xaxis_title=yaxis, yaxis_title="Count", height=height)
  return fig

def generate_hvplot_kde_layout(df: pd.DataFrame, group_col, *yaxis):
  """Build a two-column layout with one grouped KDE plot per column in ``yaxis``.

  This was originally also named ``generate_hvplot_kde`` and was silently
  shadowed by the keyword-argument variant below; it has its own name so both
  stay callable.

  :param df: source frame
  :param group_col: column used to split each KDE (``by=``)
  :param yaxis: columns to plot, one KDE panel each
  :return: ``hv.Layout`` arranged in two columns
  """
  plots = [
      df.hvplot.kde(
          y=col,
          by=group_col,
          autorange="y",
          alpha=0.5,
          title=str(col),
          value_label="KDE",
          height=500,
      )
      for col in yaxis
  ]
  return hv.Layout(plots).cols(2)

def generate_hvplot_kde(df: pd.DataFrame, **kwargs):
  """Thin wrapper around ``df.hvplot.kde`` with notebook defaults.

  ``autorange="y"`` and ``alpha=0.75`` are applied unless the caller overrides
  them; all other keyword arguments are forwarded unchanged.
  """
  return df.hvplot.kde(**{"autorange": "y", "alpha": 0.75, **kwargs})

def generate_hvplot_line(df: pd.DataFrame, **kwargs):
  """Thin wrapper around ``df.hvplot.line`` with notebook defaults.

  ``autorange="y"`` and ``alpha=0.75`` are applied unless the caller overrides
  them (previously passing either raised a duplicate-keyword ``TypeError``);
  all other keyword arguments are forwarded unchanged.
  """
  return df.hvplot.line(**{"autorange": "y", "alpha": 0.75, **kwargs})

%opts magic unavailable (pyparsing cannot be imported)
%compositor magic unavailable (pyparsing cannot be imported)


# EDA

In [2]:
# Load the per-building info table and the hourly training table, stripping the
# unit suffixes from the column names.
info_df = pd.read_csv("../data/building_info.csv").rename(columns={
  "연면적(m2)": "연면적",
  "냉방면적(m2)": "냉방면적",
  "태양광용량(kW)": "태양광용량",
  "ESS저장용량(kWh)": "ESS저장용량",
  "PCS용량(kW)": "PCS용량"
  })
train_df = pd.read_csv("../data/train.csv").rename(columns={
    "전력소비량(kWh)": "전력소비량",
    "기온(°C)": "기온",
    "강수량(mm)": "강수량",
    "풍속(m/s)": "풍속",
    "습도(%)": "습도",
    "일조(hr)": "일조",
    "일사(MJ/m2)": "일사"
})

# Column groups reused by later cells.
weather_cols = ["기온", "강수량", "풍속", "습도", "일조", "일사"]
weather_cmap = ["#F49BAB","#9EC6F3","#CAE8BD","#89A8B2","#FFD2A0","#E16A54"]
energy_cols = ["태양광용량", "ESS저장용량", "PCS용량"]

# "-" marks a missing capacity; turn it into NaN so the columns can be cast to float.
info_df[energy_cols] = info_df[energy_cols].replace("-", np.nan)
info_df = info_df.astype({
    "건물번호": "category",
    "태양광용량": float,
    "ESS저장용량": float,
    "PCS용량": float
})
train_df = train_df.astype({"건물번호": "category"})

# Parse the timestamp column
train_df["일시"] = pd.to_datetime(train_df["일시"], format="%Y%m%d %H")

# Drop the num_date_time column
train_df = train_df.drop(columns=["num_date_time"])

# Merge info_df into train_df
merged_df = train_df.merge(info_df, on="건물번호", how="left")

## 시간

In [3]:
import calendar

def cyclical_encode(value: int, period: int) -> tuple[float, float]:
    """
    Map a periodic value onto the unit circle as a (sin, cos) pair.

    :param value: value to encode (e.g. month 0-11, weekday 0-6); NumPy arrays
                  are encoded element-wise
    :param period: length of one full cycle (e.g. 12, 31, 7)
    :return: (sin_val, cos_val)
    """
    angle = 2 * np.pi * (value / period)
    return np.sin(angle), np.cos(angle)
  
def date_sin_cos(ts=None, start="2025-01-01", end="2025-12-31", freq="h") -> pd.DataFrame:
    """
    Sin/cos-encode the month, weekday, day-of-month and hour of each timestamp.

    :param ts: timestamps to encode (anything ``pd.DatetimeIndex`` accepts:
               DatetimeIndex, datetime64 array, Series, list). When ``None``,
               ``pd.date_range(start, end, freq)`` is used.
    :param start: range start, used only when ``ts`` is ``None``
    :param end: range end, used only when ``ts`` is ``None``
    :param freq: range frequency, used only when ``ts`` is ``None``
    :return: DataFrame indexed by ``ts`` (one row per input timestamp) with columns
             ``sin_m, cos_m, sin_wd, cos_wd, sin_d, cos_d, sin_h, cos_h``
    """
    if ts is None:
        ts = pd.date_range(start=start, end=end, freq=freq)
    else:
        # Normalizing the input lets raw datetime64 arrays, Series and lists work too.
        ts = pd.DatetimeIndex(ts)

    encodings = {
        # month 1..12 -> 0..11 over a 12-step cycle
        "m": cyclical_encode(np.asarray(ts.month) - 1, 12),
        # Monday=0 .. Sunday=6
        "wd": cyclical_encode(np.asarray(ts.dayofweek), 7),
        # day 1..N -> 0..N-1, where N is the length of that timestamp's month
        "d": cyclical_encode(np.asarray(ts.day) - 1, np.asarray(ts.days_in_month)),
        "h": cyclical_encode(np.asarray(ts.hour), 24),
    }

    columns = {}
    for suffix, (sin_vals, cos_vals) in encodings.items():
        columns[f"sin_{suffix}"] = np.asarray(sin_vals, dtype=float)
        columns[f"cos_{suffix}"] = np.asarray(cos_vals, dtype=float)

    return pd.DataFrame(columns, index=ts)
    
# Keyword arguments shared by the hvplot line plots of the cyclical features.
plot_opts = {
  "autorange": "y",
  "grid": True,
  "legend": "bottom_right"
}
# Sin/cos encodings for every distinct timestamp in the merged frame, indexed by timestamp.
fourier_features = date_sin_cos(merged_df["일시"].unique())

In [4]:
# One line plot per (sin, cos) pair — month, weekday, day, hour — two per row.
hv.Layout([
  fourier_features.hvplot.line(y=[f"sin_{suffix}", f"cos_{suffix}"], width=700, height=400, **plot_opts)
  for suffix in ("m", "wd", "d", "h")
]).cols(2)

## 건물

In [5]:
print(f"건물 유형 수: {len(info_df["건물유형"].unique())}")
print(info_df["건물유형"].unique())

건물 유형 수: 10
['호텔' '상용' '병원' '학교' '건물기타' '아파트' '연구소' '백화점' 'IDC(전화국)' '공공']


## 지역 그룹

- 같은 지역인 건물들이 있을까?

In [6]:
merged_df[merged_df["일시"] == "2024-06-01 00:00:00"]

Unnamed: 0,건물번호,일시,기온,강수량,풍속,습도,일조,일사,전력소비량,건물유형,연면적,냉방면적,태양광용량,ESS저장용량,PCS용량
0,1,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,5794.80,호텔,82912.71,77586.00,,,
2040,2,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,1204.26,상용,40658.90,30392.82,,,
4080,3,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,10767.78,병원,560431.00,418992.00,278.58,,
6120,4,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,454.44,호텔,41813.29,23715.71,,,
8160,5,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,3178.08,학교,403749.39,248507.00,1983.05,1025.0,250.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193800,96,2024-06-01,18.3,0.0,2.6,82.0,0.0,0.0,809.52,건물기타,3260213.41,1956128.00,,,
195840,97,2024-06-01,18.8,0.0,1.6,50.0,0.0,0.0,902.64,건물기타,42370.93,3975.52,81.38,,
197880,98,2024-06-01,17.5,0.0,1.1,57.0,0.0,0.0,1141.92,호텔,99102.62,22097.00,,,
199920,99,2024-06-01,22.1,0.0,2.3,52.0,0.0,0.0,974.52,상용,329867.95,13442.35,171.60,,


In [7]:
# 1. Pick an arbitrary timestamp and group the buildings by their weather readings at that timestamp
# 2. Assign each group a unique integer and convert it to a categorical

date_condition = merged_df["일시"] == "2024-06-01 00:00:00"
merged_df.loc[date_condition, "지역"] = merged_df[date_condition].groupby(by=["일시"] + weather_cols, observed=True).ngroup().astype("category")

In [8]:
# 3. Check whether the new region column contains any NaN values
locale_df = merged_df[date_condition][["건물번호", "지역"]]
print(locale_df[locale_df["지역"].isna()])

Empty DataFrame
Columns: [건물번호, 지역]
Index: []


In [9]:
# 4. Fill the region column for the remaining timestamps as well:
#    every row of a building gets the region assigned to its reference-timestamp row.
for idx, row in merged_df[date_condition].groupby(by="지역", observed=True):
    merged_df.loc[merged_df["건물번호"].isin(row["건물번호"]), "지역"] = idx
    
print(f"총 지역 수: {merged_df['지역'].nunique()}")

총 지역 수: 30


In [10]:
# 5. Verify that buildings grouped into the same region share identical weather data
for locale, group in merged_df.groupby(by="지역", observed=True):
    print(f"지역: {locale}, 건물 수: {len(group['건물번호'].unique())}")
    
    # Nothing to compare when the region holds a single building
    if not len(group['건물번호'].unique()) > 1:
        continue
      
    for dt, sub_group in group.groupby(by="일시"):
        # duplicated(keep=False) is False for a row whose weather values match no other
        # row of this timestamp, i.e. a building that disagrees with the rest of its region.
        if False in sub_group.duplicated(weather_cols, keep=False).values:
            print(f"{locale} - {dt} 건물별 기상 데이터가 서로 다릅니다.")
            print(sub_group[["건물번호", *weather_cols]])

지역: 0, 건물 수: 2
지역: 1, 건물 수: 4
지역: 2, 건물 수: 1
지역: 3, 건물 수: 17
지역: 4, 건물 수: 1
지역: 5, 건물 수: 2
지역: 6, 건물 수: 6
지역: 7, 건물 수: 2
지역: 8, 건물 수: 7
지역: 9, 건물 수: 1
지역: 10, 건물 수: 1
지역: 11, 건물 수: 4
지역: 12, 건물 수: 3
지역: 13, 건물 수: 1
지역: 14, 건물 수: 2
지역: 15, 건물 수: 16
지역: 16, 건물 수: 2
지역: 17, 건물 수: 3
지역: 18, 건물 수: 1
지역: 19, 건물 수: 1
지역: 20, 건물 수: 1
지역: 21, 건물 수: 1
지역: 22, 건물 수: 1
지역: 23, 건물 수: 2
지역: 24, 건물 수: 3
지역: 25, 건물 수: 2
지역: 26, 건물 수: 1
지역: 27, 건물 수: 1
지역: 28, 건물 수: 6
지역: 29, 건물 수: 5


In [11]:
# Mean of every weather variable per (building type, region) pair,
# then one box plot per variable split by building type.
target = (
    merged_df
    .groupby(by=["건물유형", "지역"], observed=True)
    .agg(dict.fromkeys(["기온", "강수량", "풍속", "습도", "일조", "일사"], "mean"))
)

hv.Layout([
    target.hvplot.box(by="건물유형", y=col, autorange="y", title=f"건물유형별 {col} 분포", height=500)
    for col in target.columns
]).cols(2)

## 건물 유형별 연면적 및 냉방면적

## 건물유형별 통계 시각화

In [12]:
# Violin plots by building type: (source frame, column, title) per panel, two panels per row.
hv.Layout([
  frame.hvplot.violin(y=col, by="건물유형", c="건물유형", title=title, width=750, height=400, grid=True, dynamic=True, autorange="y", legend="bottom_right")
  for frame, col, title in [
    (info_df, "연면적", "건물유형별 연면적 분포"),
    (info_df, "냉방면적", "건물유형별 냉방면적 분포"),
    (merged_df, "전력소비량", "건물유형별 전력 소비량 분포"),
    (info_df, "태양광용량", "건물유형별 태양광용량(kW) 분포"),
    (info_df, "ESS저장용량", "건물유형별 ESS저장용량(kWh) 분포"),
    (info_df, "PCS용량", "건물유형별 PCS용량(kW) 분포"),
  ]
]).opts(shared_axes=False).cols(2)

In [13]:
plots = []
for key, group in merged_df.groupby("건물유형", observed=True):
    g = group.copy().astype({"건물번호": str})
    plots.append(g.hvplot.violin(y="전력소비량", by="건물번호", c="건물번호", width=700, height=400, title=f"{key} 전력사용량 분포", grid=True, dynamic=True, autorange="y"))

In [14]:
hv.Layout(plots).cols(2).opts(shared_axes=False)

## 전처리

In [15]:
from sklearn.preprocessing import minmax_scale, normalize

# Attach the cyclical time features to every row via its timestamp.
df = merged_df.merge(fourier_features, left_on="일시", right_index=True, how="left")

# Humidity: percent -> fraction
df["습도"] = df["습도"] * 0.01

# Precipitation: log1p transform
df["강수량"] = np.log1p(df["강수량"])
# df["강수량"] = (df["강수량"] - df["강수량"].mean()) / df["강수량"].std()

# Total floor area, cooled area
# NOTE(review): sklearn's `normalize` defaults to axis=1 (L2), i.e. it rescales each ROW
# (the area pair of one record) to unit norm rather than scaling each column — confirm
# this is intended and not a column-wise scaler such as `minmax_scale`.
df[["연면적", "냉방면적"]] = normalize(df[["연면적", "냉방면적"]])

In [16]:
df[["일시", "강수량"]].drop_duplicates().hvplot.kde(y="강수량")

In [17]:
df[["연면적", "냉방면적"]].drop_duplicates().hvplot.kde(y=["연면적", "냉방면적"])

## 상관관계

In [18]:

weather_corr = df[["전력소비량", "sin_h", "cos_h", *weather_cols]].corr()
energy_corr = info_df[energy_cols].corr()
(
  weather_corr.hvplot.heatmap(width=600, height=400) + 
  energy_corr.hvplot.heatmap(width=600, height=400)
).opts(shared_axes=False)

## VIF

In [19]:
import statsmodels.api as sm
from statsmodels.formula.api import ols # ordinary least squares
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Working copy. NOTE(review): the model below is fitted on `df`, not on this copy.
_df = df.copy()

# Fit an OLS model; its design matrix (exog) and column names feed the VIF computation below.
ols_model = ols("전력소비량 ~ C(건물유형) + 연면적 + 냉방면적", data=df).fit()
x = ols_model.model.exog
columns = ols_model.model.exog_names

In [20]:
_df["기온"] = (_df["기온"] - _df["기온"].mean()) / _df["기온"].std()

In [21]:
_df

Unnamed: 0,건물번호,일시,기온,강수량,풍속,습도,일조,일사,전력소비량,건물유형,...,PCS용량,지역,sin_m,cos_m,sin_wd,cos_wd,sin_d,cos_d,sin_h,cos_h
0,1,2024-06-01 00:00:00,-1.924092,0.0,2.6,0.82,0.0,0.00,5794.80,호텔,...,,15,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.000000,1.000000
1,1,2024-06-01 01:00:00,-1.924092,0.0,2.7,0.82,0.0,0.00,5591.85,호텔,...,,15,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.258819,0.965926
2,1,2024-06-01 02:00:00,-1.973439,0.0,2.6,0.80,0.0,0.00,5338.17,호텔,...,,15,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.500000,0.866025
3,1,2024-06-01 03:00:00,-1.998113,0.0,2.6,0.81,0.0,0.00,4554.42,호텔,...,,15,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.707107,0.707107
4,1,2024-06-01 04:00:00,-2.047461,0.0,1.3,0.81,0.0,0.00,3602.25,호텔,...,,15,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.866025,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,2024-08-24 19:00:00,0.740674,0.0,4.4,0.76,0.4,0.18,3276.00,호텔,...,,28,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.965926,0.258819
203996,100,2024-08-24 20:00:00,0.617305,0.0,3.7,0.74,0.0,0.00,3197.52,호텔,...,,28,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.866025,0.500000
203997,100,2024-08-24 21:00:00,0.543284,0.0,2.9,0.74,0.0,0.00,3006.60,호텔,...,,28,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.707107,0.707107
203998,100,2024-08-24 22:00:00,0.469263,0.0,1.7,0.76,0.0,0.00,2649.72,호텔,...,,28,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.500000,0.866025


In [22]:
# VIF (variance inflation factor) for every column of the design matrix `x`

vif = pd.DataFrame({
    "feature": columns,
    "VIF": [variance_inflation_factor(x, i) for i in range(x.shape[1])]
})

In [23]:
vif

Unnamed: 0,feature,VIF
0,Intercept,2794.72847
1,C(건물유형)[T.건물기타],1.937012
2,C(건물유형)[T.공공],1.747624
3,C(건물유형)[T.백화점],2.380617
4,C(건물유형)[T.병원],1.936326
5,C(건물유형)[T.상용],1.953229
6,C(건물유형)[T.아파트],1.8986
7,C(건물유형)[T.연구소],1.907184
8,C(건물유형)[T.학교],1.927811
9,C(건물유형)[T.호텔],1.953589


# Building Embedding

In [15]:
info_df.columns

Index(['건물번호', '건물유형', '연면적', '냉방면적', '태양광용량', 'ESS저장용량', 'PCS용량'], dtype='object')

In [11]:
from sklearn.preprocessing import minmax_scale

temp = info_df.copy()
temp[["연면적", "냉방면적"]] = minmax_scale(np.log(temp[["연면적", "냉방면적"]]))
temp[energy_cols] = minmax_scale(np.log(temp[energy_cols]))

temp.hvplot.violin(y=["연면적", "냉방면적", *energy_cols], width=700, height=400, title="Scaled Features", grid=True, autorange="y", legend="bottom_right")

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BuildingEmbeddingModel(nn.Module):
    """Embedding lookup for a building ID followed by an MLP head.

    :param num_buildings: number of buildings; the table has ``num_buildings + 1``
        rows so IDs ``0..num_buildings`` are all valid indices
    :param out_dim: width of the output layer
    :param embedding_dim: width of the embedding vectors
    :param hidden_shape: widths of the hidden layers, in order (any length)
    :param dropout: dropout probability applied after every hidden layer
    :param final_activation: ``"relu"`` (default, original behavior) clamps the
        outputs at zero; pass ``None`` to get raw outputs, e.g. unbounded logits
        for ``nn.CrossEntropyLoss``
    :raises ValueError: if ``final_activation`` is neither ``"relu"`` nor ``None``
    """

    def __init__(self, num_buildings=100, out_dim=10, embedding_dim=16, hidden_shape=(32, 64), dropout=0.2, final_activation="relu"):
        super().__init__()
        if final_activation not in ("relu", None):
            raise ValueError(f"final_activation must be 'relu' or None, got {final_activation!r}")

        self.embedding = nn.Embedding(num_embeddings=num_buildings + 1, embedding_dim=embedding_dim)

        # Linear -> ReLU -> Dropout per hidden width; with the default widths this
        # reproduces the original layer order (and state_dict keys) exactly.
        layers = []
        in_features = embedding_dim
        for width in hidden_shape:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        layers.append(nn.Linear(in_features, out_dim))
        if final_activation == "relu":
            layers.append(nn.ReLU())
        self.head = nn.Sequential(*layers)

    def forward(self, building_id):
        """Return the head output for a tensor of integer building IDs."""
        x = self.embedding(building_id)
        return self.head(x)


In [19]:
from torch.utils.data import Dataset, DataLoader

class BuildingTypeDataset(Dataset):
    """Yields ``(building_id, building_type)`` pairs as ``torch.long`` tensors.

    Reads the "건물번호" and "건물유형" columns of a private copy of ``df``.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional access: samplers yield 0..len-1, which need not match the
        # frame's index labels (e.g. after filtering or sorting).
        x = torch.tensor(self.df["건물번호"].iloc[idx], dtype=torch.long)
        label_type = torch.tensor(self.df["건물유형"].iloc[idx], dtype=torch.long)

        return x, label_type
      
class BuildingAreaDataset(Dataset):
    """Yields ``(building_id, [연면적, 냉방면적])`` pairs.

    The ID is a ``torch.long`` scalar, the label a ``float32`` tensor of length 2.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional access: samplers yield 0..len-1, which need not match the
        # frame's index labels (e.g. after filtering or sorting).
        x = torch.tensor(self.df["건물번호"].iloc[idx], dtype=torch.long)
        label_areas = torch.tensor(self.df[["연면적", "냉방면적"]].iloc[idx].tolist(), dtype=torch.float32)

        return x, label_areas
      
class BuildingCapDataset(Dataset):
    """Yields ``(building_id, [태양광용량, ESS저장용량, PCS용량])`` pairs.

    The ID is a ``torch.long`` scalar, the label a ``float32`` tensor of length 3.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional access: samplers yield 0..len-1, which need not match the
        # frame's index labels (e.g. after filtering or sorting).
        x = torch.tensor(self.df["건물번호"].iloc[idx], dtype=torch.long)
        label_caps = torch.tensor(self.df[["태양광용량", "ESS저장용량", "PCS용량"]].iloc[idx].tolist(), dtype=torch.float32)

        return x, label_caps

In [20]:
def embedding_train(model, loader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    :param model: module called with a batch of building IDs
    :param loader: iterable of ``(building_ids, labels)`` batches that supports ``len``
    :param optimizer: optimizer stepping the model parameters
    :param criterion: loss as ``nn.Module`` or plain callable ``(output, labels) -> loss``
    :param device: device the model and batches are moved to
    :return: sum of the batch losses divided by ``len(loader)``
    """
    model.to(device)
    # Module-based criteria are moved to the device; plain callables are used as-is.
    criterion = criterion.to(device) if isinstance(criterion, nn.Module) else criterion
    model.train()

    batch_losses = []
    for building_ids, labels in loader:
        optimizer.zero_grad()

        predictions = model(building_ids.to(device))
        batch_loss = criterion(predictions, labels.to(device))
        batch_loss.backward()

        optimizer.step()
        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(loader)

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

# NOTE(review): this rebinds `train_df` (the hourly frame read from train.csv) to a copy of
# the per-building info table. Later cells that group `train_df` by "일시" need the hourly
# frame — confirm the intended execution order or use a distinct name here.
train_df = info_df.copy()
# Encode the building type as integer category codes.
train_df["건물유형"] = train_df["건물유형"].astype("category").cat.codes

# Areas: log transform followed by min-max scaling.
area_scaler = MinMaxScaler()
train_df[["연면적", "냉방면적"]] = area_scaler.fit_transform(np.log(train_df[["연면적", "냉방면적"]]))

# Capacities: max-abs scaling; missing capacities are set to 0 afterwards.
cap_scaler = MaxAbsScaler()
train_df[["태양광용량", "ESS저장용량", "PCS용량"]] = cap_scaler.fit_transform(train_df[["태양광용량", "ESS저장용량", "PCS용량"]])
train_df[["태양광용량", "ESS저장용량", "PCS용량"]] = train_df[["태양광용량", "ESS저장용량", "PCS용량"]].fillna(0)

In [132]:
embedding_dim = 64

In [None]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building-type model: building ID -> one score per building type (out_dim left at the model default).
type_ds = BuildingTypeDataset(train_df)
type_loader = DataLoader(type_ds, batch_size=16, shuffle=True)
type_model = BuildingEmbeddingModel(embedding_dim=embedding_dim)
type_optimizer = torch.optim.Adam(type_model.parameters(), lr=1e-3)
# NOTE(review): BuildingEmbeddingModel's head ends with a ReLU, so the scores handed to
# CrossEntropyLoss are clamped at zero — confirm this is intended.
type_criterion = nn.CrossEntropyLoss()

# Mean loss per epoch.
type_loss = []
prev = None  # not used in this cell
for epoch in range(300):
    loss = embedding_train(type_model, type_loader, type_optimizer, type_criterion, DEVICE)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}')
    type_loss.append(loss)

In [138]:
type_model.eval()

with torch.no_grad():
  type_h = type_model(torch.tensor(train_df["건물번호"].values, dtype=torch.long).to(DEVICE)).argmax(dim=1).cpu().numpy()

In [None]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Area model: building ID -> (연면적, 냉방면적), trained with mean squared error.
area_ds = BuildingAreaDataset(train_df)
area_loader = DataLoader(area_ds, batch_size=16, shuffle=True)
area_model = BuildingEmbeddingModel(out_dim=2, embedding_dim=embedding_dim)
area_optimizer = torch.optim.Adam(area_model.parameters(), lr=1e-3)
area_criterion = nn.MSELoss()

# Mean loss per epoch.
area_loss = []
prev = None  # not used in this cell
for epoch in range(100):
    loss = embedding_train(area_model, area_loader, area_optimizer, area_criterion, DEVICE)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}')
    area_loss.append(loss)

In [156]:
area_model.eval()

with torch.no_grad():
  area_h = area_model(torch.tensor(train_df["건물번호"].values, dtype=torch.long).to(DEVICE)).cpu().numpy()

In [198]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.MSELoss()

def loss_fn(pred, target):
    """MSE between the ReLU-clamped prediction and the target.

    NOTE(review): BuildingEmbeddingModel's head already ends with a ReLU, so the
    extra ``F.relu`` leaves that model's outputs unchanged.
    """
    return criterion(F.relu(pred), target)

# Capacity model: building ID -> (태양광용량, ESS저장용량, PCS용량).
caps_ds = BuildingCapDataset(train_df)
caps_loader = DataLoader(caps_ds, batch_size=16, shuffle=True)
caps_model = BuildingEmbeddingModel(out_dim = 3, embedding_dim=embedding_dim)
caps_optimizer = torch.optim.Adam(caps_model.parameters(), lr=1e-3)
caps_criterion = loss_fn

# Mean loss per epoch.
caps_loss = []
prev = None  # not used in this cell

for epoch in range(100):
    loss = embedding_train(caps_model, caps_loader, caps_optimizer, caps_criterion, DEVICE)
    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}')
    caps_loss.append(loss)

Epoch 00, Loss: 0.0315
Epoch 01, Loss: 0.0266
Epoch 02, Loss: 0.0263
Epoch 03, Loss: 0.0249
Epoch 04, Loss: 0.0242
Epoch 05, Loss: 0.0246
Epoch 06, Loss: 0.0203
Epoch 07, Loss: 0.0189
Epoch 08, Loss: 0.0211
Epoch 09, Loss: 0.0198
Epoch 10, Loss: 0.0156
Epoch 11, Loss: 0.0162
Epoch 12, Loss: 0.0171
Epoch 13, Loss: 0.0125
Epoch 14, Loss: 0.0123
Epoch 15, Loss: 0.0107
Epoch 16, Loss: 0.0107
Epoch 17, Loss: 0.0114
Epoch 18, Loss: 0.0119
Epoch 19, Loss: 0.0097
Epoch 20, Loss: 0.0085
Epoch 21, Loss: 0.0097
Epoch 22, Loss: 0.0115
Epoch 23, Loss: 0.0131
Epoch 24, Loss: 0.0119
Epoch 25, Loss: 0.0076
Epoch 26, Loss: 0.0072
Epoch 27, Loss: 0.0106
Epoch 28, Loss: 0.0066
Epoch 29, Loss: 0.0071
Epoch 30, Loss: 0.0047
Epoch 31, Loss: 0.0045
Epoch 32, Loss: 0.0060
Epoch 33, Loss: 0.0065
Epoch 34, Loss: 0.0046
Epoch 35, Loss: 0.0028
Epoch 36, Loss: 0.0037
Epoch 37, Loss: 0.0054
Epoch 38, Loss: 0.0037
Epoch 39, Loss: 0.0043
Epoch 40, Loss: 0.0036
Epoch 41, Loss: 0.0036
Epoch 42, Loss: 0.0030
Epoch 43, L

In [199]:
caps_model.eval()

with torch.no_grad():
  caps_h = caps_model(torch.tensor(train_df["건물번호"].values, dtype=torch.long).to(DEVICE)).cpu().numpy()

In [200]:
h = np.concatenate([type_h.reshape(-1, 1), area_h, caps_h], axis=1)
h_df = pd.DataFrame(h, columns=["건물유형", "연면적", "냉방면적", "태양광용량", "ESS저장용량", "PCS용량"], index=info_df["건물번호"].values).reset_index(drop=False).rename(columns={"index": "건물번호"})

In [201]:
(
  h_df.hvplot.kde(y=energy_cols, title="pred") + 
  train_df.hvplot.kde(y=energy_cols, title="real")
  # h_df.hvplot.kde(y=["연면적", "냉방면적"], title="pred") + 
  # train_df.hvplot.kde(y=["연면적", "냉방면적"], title="real")
).cols(2)

In [145]:
h_df

Unnamed: 0,건물번호,건물유형,연면적,냉방면적,태양광용량,ESS저장용량,PCS용량
0,1,0.0,0.329615,0.573990,0.000000,0.000000,0.000000
1,2,5.0,0.287965,0.456110,0.000000,0.000000,0.000000
2,3,4.0,0.652367,0.770236,0.570313,0.000000,0.000000
3,4,0.0,0.234548,0.443406,0.000000,0.000000,0.000000
4,5,0.0,0.566841,0.659653,0.916110,0.615789,0.340029
...,...,...,...,...,...,...,...
95,96,1.0,0.788676,0.810337,0.000000,0.000000,0.000000
96,97,1.0,0.228234,0.313559,0.367682,0.000000,0.000000
97,98,0.0,0.348348,0.474413,0.000000,0.000000,0.000000
98,99,5.0,0.556452,0.451243,0.508166,0.000000,0.000000


In [146]:
train_df

Unnamed: 0,건물번호,건물유형,연면적,냉방면적,태양광용량,ESS저장용량,PCS용량
0,1,9,0.365124,0.627496,0.000000,0.000000,0.000000
1,2,5,0.241915,0.519326,0.000000,0.000000,0.000000
2,3,4,0.695537,0.822151,0.628409,0.000000,0.000000
3,4,9,0.246756,0.490693,0.000000,0.000000,0.000000
4,5,8,0.638838,0.761857,1.000000,0.676315,0.401896
...,...,...,...,...,...,...,...
95,96,1,1.000000,1.000000,0.000000,0.000000,0.000000
96,97,1,0.249046,0.284553,0.395426,0.000000,0.000000
97,98,9,0.395965,0.482534,0.000000,0.000000,0.000000
98,99,5,0.603893,0.425165,0.536672,0.000000,0.000000


# 123

In [450]:
# Attach the cyclical time features to every row by looking up its timestamp in
# `fourier_features` (indexed by timestamp, labels must be unique). A single reindex
# replaces the former per-timestamp, per-feature `.loc` writes; re-running is idempotent.
# NOTE(review): expects `train_df` to be the hourly frame with an "일시" column — confirm
# it has not been rebound to the building-info copy by an earlier cell.
aligned_features = fourier_features.reindex(train_df["일시"])
for c in fourier_features.columns:
    train_df[c] = aligned_features[c].to_numpy()

In [268]:
train_df

Unnamed: 0,건물번호,일시,기온,강수량,풍속,습도,일조,일사,전력소비량,sin_m,cos_m,sin_wd,cos_wd,sin_d,cos_d,sin_h,cos_h
0,1,2024-06-01 00:00:00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.000000,1.000000
1,1,2024-06-01 01:00:00,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.258819,0.965926
2,1,2024-06-01 02:00:00,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.500000,0.866025
3,1,2024-06-01 03:00:00,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.707107,0.707107
4,1,2024-06-01 04:00:00,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,0.5,-0.866025,-0.974928,-0.222521,0.000000,1.000000,0.866025,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,2024-08-24 19:00:00,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.965926,0.258819
203996,100,2024-08-24 20:00:00,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.866025,0.500000
203997,100,2024-08-24 21:00:00,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.707107,0.707107
203998,100,2024-08-24 22:00:00,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,-0.5,-0.866025,-0.974928,-0.222521,-0.998717,-0.050649,-0.500000,0.866025


In [306]:
train_df[[*weather_cols, *fourier_features.columns]].corr().hvplot.heatmap(title="Fourier Features Correlation", width=600, height=400)

In [305]:
result = None
for idx, group in train_df.groupby(by="건물번호", observed=True):
    if result is None:
        result = group[["전력소비량", *fourier_features.columns]].corr()
    else:
        result += group[["전력소비량", *fourier_features.columns]].corr()

In [304]:
(result / len(train_df["건물번호"].unique())).hvplot.heatmap(title="Fourier Features Correlation", width=600, height=400)

In [350]:
# Mean of each weather variable over all buildings, per timestamp.
t_group = train_df.groupby(by="일시").agg({"강수량": "mean", "기온": "mean", "습도": "mean", "풍속": "mean", "일조": "mean", "일사": "mean"})

# Top panel: temperature. Bottom panel: sunshine/solar radiation (area) overlaid with
# wind speed, precipitation and humidity (lines). The titles used to be the same
# copy-pasted sunshine/solar text on all three plots, which mislabeled the temperature panel.
(
  t_group.hvplot.line(x="일시", y="기온", width=1200, height=400, autorange="y", title="기온 변화", grid=True, legend="bottom_right") +
  (
    t_group.hvplot.area(x="일시", y=["일조", "일사"], width=1200, height=400, autorange="y", title="일조·일사량 및 풍속·강수량·습도 변화", grid=True, legend="bottom_right", alpha=0.5) *
    t_group.hvplot.line(x="일시", y=["풍속", "강수량", "습도"], width=1200, height=400, autorange="y", title="일조·일사량 및 풍속·강수량·습도 변화", grid=True, legend="bottom_right")
  )
).cols(1)

# 123

In [484]:
import torch
from torch import nn
from torch.functional import F

class BuildingEmbedding(nn.Module):
    """Lookup table mapping an integer building ID to a dense vector."""

    def __init__(self, num_buildings, embedding_dim):
        super().__init__()
        # One row more than `num_buildings`, so indices 0..num_buildings are valid.
        self.embedding = nn.Embedding(num_buildings + 1, embedding_dim)

    def forward(self, building_id):
        """Return the embedding rows selected by ``building_id``."""
        return self.embedding(building_id)

class WeatherLayer(nn.Module):
    """Two fully connected layers, each followed by a ReLU."""

    def __init__(self, input_dim=6, hidden_dim=32, output_dim=16):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply ``fc1`` then ``fc2``, with a ReLU after each."""
        for layer in (self.fc1, self.fc2):
            x = F.relu(layer(x))
        return x

class BasicRegressionModel(nn.Module):
    """Regressor over a building embedding, weather features and date/time features.

    The three branches are concatenated and projected to a single output value,
    so ``forward`` returns a tensor of shape ``(batch, 1)``.

    :param num_buildings: number of buildings
    :param num_weathers: number of weather input features
    :param num_dt: number of date/time input features
    :param embedding_dim: width of the building embedding; the weather and
        date/time branches each produce ``embedding_dim // 2`` features
    """

    def __init__(self, num_buildings, num_weathers, num_dt, embedding_dim=64):
        super().__init__()
        half_dim = embedding_dim // 2

        # BuildingEmbedding adds one more row itself, so the table ends up with
        # num_buildings + 2 rows; kept as-is so parameter shapes do not change.
        self.embedding = BuildingEmbedding(num_buildings + 1, embedding_dim)

        self.weather = WeatherLayer(input_dim=num_weathers, hidden_dim=32, output_dim=half_dim)
        self.dt = nn.Linear(num_dt, half_dim)
        # Sized from the actual concatenated width: equals embedding_dim * 2 for even
        # embedding_dim and stays consistent when embedding_dim is odd.
        self.out = nn.Linear(embedding_dim + 2 * half_dim, 1)

    def forward(self, building_id, weather_features, dt_features):
        """Return predictions of shape ``(batch, 1)``."""
        embedded = self.embedding(building_id)
        weather = self.weather(weather_features)
        dt = self.dt(dt_features)

        h = torch.cat([embedded, weather, dt], dim=1)
        return self.out(h)


In [None]:
def train(model, loader, optimizer, criterion, device):
    """Run one training epoch and return the mean per-batch loss.

    :param model: module called as ``model(building_ids, weather_features, dt_features)``
    :param loader: iterable of ``(building_ids, weather_features, dt_features, labels)``
        batches that supports ``len``
    :param optimizer: optimizer stepping the model parameters
    :param criterion: callable ``(output, labels) -> loss``
    :param device: device the model and batches are moved to
    :return: sum of the batch losses divided by ``len(loader)``
    """
    model.to(device)
    model.train()
    total_loss = 0.0

    for building_ids, weather_features, dt_features, labels in loader:
        building_ids = building_ids.to(device)
        weather_features = weather_features.to(device)
        dt_features = dt_features.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        out = model(building_ids, weather_features, dt_features)
        # A (batch, 1) prediction against (batch,) float targets would broadcast to
        # (batch, batch) inside the loss and give a wrong value; align the shapes first.
        # Integer (class-index) targets are left untouched.
        if labels.is_floating_point() and labels.shape != out.shape and labels.numel() == out.numel():
            labels = labels.reshape(out.shape)
        loss = criterion(out, labels)
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)

In [470]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
dt_cols = ["sin_wd", "cos_wd", "sin_d", "cos_d", "sin_h", "cos_h"]

n_buildings = len(train_df["건물번호"].unique())
n_weathers = len(weather_cols)
n_dt_features = len(dt_cols)  # sin_wd, cos_wd, sin_d, cos_d, sin_h, cos_h

model = BasicRegressionModel(n_buildings, n_weathers, n_dt_features)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [486]:
sample = train_df.sample(10, random_state=42)
b_id = torch.tensor(sample["건물번호"].values, dtype=torch.long).to(DEVICE)
weather_features = torch.tensor(sample[weather_cols].values, dtype=torch.float32).to(DEVICE)
dt_features = torch.tensor(sample[["sin_wd", "cos_wd", "sin_d", "cos_d", "sin_h", "cos_h"]].values, dtype=torch.float32).to(DEVICE)

In [None]:
from torch.utils.data import Dataset, DataLoader

class TrainSet(Dataset):
    """Dataset over a copy of ``df`` yielding ``(building_id, building_type)`` tensors.

    NOTE(review): looks unfinished — it yields two items per sample, whereas
    ``train`` unpacks four per batch (IDs, weather features, date/time features,
    labels). Confirm the intended sample layout before using it with ``train``.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df.copy()
        # NOTE(review): this loop has no effect; the bare `group` expression is discarded.
        for _, group in self.df.groupby(by="건물번호"):
            group

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Label-based lookup: assumes the frame's index labels are 0..len-1 — TODO confirm.
        x = torch.tensor(self.df.loc[idx, "건물번호"], dtype=torch.long)
        label_typs = torch.tensor(self.df.loc[idx, "건물유형"], dtype=torch.long)
        
        return x, label_typs

In [None]:
train(model, 
      DataLoader(
          list(zip(b_id, weather_features, dt_features, torch.tensor(sample["전력소비량"].values, dtype=torch.float32).to(DEVICE))),
          batch_size=2,
          shuffle=True
      ), 
      optimizer, 
      criterion, 
      DEVICE
)

tensor([[1.0048],
        [0.9345],
        [1.0796],
        [1.9757],
        [0.8301],
        [0.9031],
        [1.5231],
        [1.1728],
        [0.7504],
        [0.9770]], device='cuda:0', grad_fn=<AddmmBackward0>)

# ADF

In [439]:
from statsmodels.tsa.stattools import adfuller

def adf_with_pandas(df: pd.DataFrame, columns, maxlag=None, regression='c', autolag='AIC') -> pd.DataFrame:
  """Run the augmented Dickey-Fuller test on several columns of ``df``.

  :param df: frame holding the series to test
  :param columns: column names to test; each becomes one row of the result
  :param maxlag: forwarded to ``adfuller``
  :param regression: forwarded to ``adfuller``
  :param autolag: forwarded to ``adfuller``; with ``None`` no information
      criterion is returned and ``icbest`` is NaN
  :return: frame indexed by column name with columns ``adf_stat, p_value,
      used_lag, n_obs, crit_vals_1, crit_vals_5, crit_vals_10, icbest``
  """
  result_columns = ["adf_stat", "p_value", "used_lag", "n_obs", "crit_vals_1", "crit_vals_5", "crit_vals_10", "icbest"]

  # Collect plain rows and build the frame once, so each column gets a numeric
  # dtype instead of the object dtype produced by row-by-row `.loc` assignment.
  rows = {}
  for col in columns:
    adf_stat, p_value, used_lag, n_obs, crit_vals, *rest = adfuller(df[col], maxlag=maxlag, regression=regression, autolag=autolag)
    # `icbest` is only part of the result when autolag is not None.
    icbest = rest[0] if rest else float("nan")
    rows[col] = [adf_stat, p_value, used_lag, n_obs, *crit_vals.values(), icbest]

  return pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=result_columns)

def adf(data, maxlag=None, regression='c', autolag='AIC') -> pd.DataFrame:
  """Run the augmented Dickey-Fuller test on one series.

  :return: single-row frame with columns ``adf_stat, p_value, used_lag, n_obs``,
      one column per critical-value key returned by ``adfuller``, and ``icbest``
  """
  adf_stat, p_value, used_lag, n_obs, crit_vals, icbest = adfuller(data, maxlag=maxlag, regression=regression, autolag=autolag)

  record = {"adf_stat": adf_stat, "p_value": p_value, "used_lag": used_lag, "n_obs": n_obs}
  record.update(crit_vals)
  record["icbest"] = icbest
  return pd.DataFrame([record])


In [None]:
weather_group = train_df.groupby(by="일시").agg({"강수량": "mean", "기온": "mean", "습도": "mean", "풍속": "mean", "일조": "mean", "일사": "mean"})
result = adf_with_pandas(weather_group, weather_cols)
result

Unnamed: 0,adf_stat,p_value,used_lag,n_obs,crit_vals_1,crit_vals_5,crit_vals_10,icbest
기온,-2.637607,0.085471,26,2013,-3.433603,-2.862977,-2.567535,220.519779
강수량,-6.38296,0.0,24,2015,-3.433599,-2.862975,-2.567534,3634.068403
풍속,-6.631133,0.0,26,2013,-3.433603,-2.862977,-2.567535,178.229902
습도,-4.035502,0.001236,26,2013,-3.433603,-2.862977,-2.567535,-2198.761971
일조,-4.969936,2.5e-05,25,2014,-3.433601,-2.862976,-2.567535,-3862.746298
일사,-5.468801,2e-06,26,2013,-3.433603,-2.862977,-2.567535,-2886.077596


In [403]:
merged_df.groupby(by=["일시", "건물유형"]).agg({"전력소비량": "mean"}).reset_index(level=1, drop=False).hvplot.line(
    x="일시",
    y="전력소비량",
    by="건물유형",
    width=1200,
    height=400,
    title="건물유형별 전력소비량 변화",
    grid=True,
    legend="bottom_right")

In [None]:
building_group = merged_df.groupby(by=["일시", "건물유형"]).agg({"전력소비량": "mean"}).reset_index(level=1, drop=False)

result = pd.concat(
  [
    adf_with_pandas(building_group[building_group["건물유형"] == building_type], ["전력소비량"]).rename(index={"전력소비량": f"{building_type}_전력소비량"})
    for building_type in building_group["건물유형"].unique()], axis=0)

In [423]:
result

Unnamed: 0,adf_stat,p_value,used_lag,n_obs,crit_vals_1,crit_vals_5,crit_vals_10,icbest
IDC(전화국)_전력소비량,-3.456912,0.009165,26,2013,-3.433603,-2.862977,-2.567535,25102.28659
건물기타_전력소비량,-2.392704,0.143806,25,2014,-3.433601,-2.862976,-2.567535,21917.433559
공공_전력소비량,-5.195624,9e-06,26,2013,-3.433603,-2.862977,-2.567535,22567.60995
백화점_전력소비량,-3.593401,0.005888,25,2014,-3.433601,-2.862976,-2.567535,23188.342421
병원_전력소비량,-4.602248,0.000128,26,2013,-3.433603,-2.862977,-2.567535,24640.486763
상용_전력소비량,-5.783248,1e-06,26,2013,-3.433603,-2.862977,-2.567535,21543.504004
아파트_전력소비량,-1.23927,0.656479,26,2013,-3.433603,-2.862977,-2.567535,19005.65728
연구소_전력소비량,-5.449607,3e-06,26,2013,-3.433603,-2.862977,-2.567535,23733.162342
학교_전력소비량,-5.238497,7e-06,26,2013,-3.433603,-2.862977,-2.567535,23706.239544
호텔_전력소비량,-1.712248,0.424813,26,2013,-3.433603,-2.862977,-2.567535,23625.646944


In [445]:
result = pd.concat(
  [
    adf_with_pandas(train_df[train_df["건물번호"] == building_id].sort_values("일시"), ["전력소비량"]).rename(index={"전력소비량": f"{building_id}_전력소비량"})
    for building_id in train_df["건물번호"].unique()], axis=0)

In [446]:
result

Unnamed: 0,adf_stat,p_value,used_lag,n_obs,crit_vals_1,crit_vals_5,crit_vals_10,icbest
1_전력소비량,-4.984962,0.000024,26,2013,-3.433603,-2.862977,-2.567535,30640.446286
2_전력소비량,-7.205416,0.0,26,2013,-3.433603,-2.862977,-2.567535,27351.915022
3_전력소비량,-5.310995,0.000005,26,2013,-3.433603,-2.862977,-2.567535,31357.769183
4_전력소비량,-3.198262,0.020086,26,2013,-3.433603,-2.862977,-2.567535,21632.736422
5_전력소비량,-5.303486,0.000005,26,2013,-3.433603,-2.862977,-2.567535,25118.892728
...,...,...,...,...,...,...,...,...
96_전력소비량,-2.262918,0.184207,25,2014,-3.433601,-2.862976,-2.567535,21741.399895
97_전력소비량,-3.209234,0.019456,26,2013,-3.433603,-2.862977,-2.567535,24593.410898
98_전력소비량,-1.891349,0.336142,24,2015,-3.433599,-2.862975,-2.567534,25243.43021
99_전력소비량,-2.794649,0.059042,26,2013,-3.433603,-2.862977,-2.567535,20491.864504


In [None]:
from sklearn.preprocessing import minmax_scale

adf(minmax_scale(train_df[train_df["건물번호"] == 1].sort_values("일시")["전력소비량"]))

Unnamed: 0,adf_stat,p_value,used_lag,n_obs,1%,5%,10%,icbest
0,-4.984962,2.4e-05,26,2013,-3.433603,-2.862977,-2.567535,-4533.177123
