### DataModule

In [None]:
import pandas as pd
import os
import os.path as path
from random import choice
import numpy as np
import torch
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import MLFlowLogger
import torch.optim as optim
import mlflow
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from yacs.config import CfgNode as CN
from typing import Any, Optional, Union, List
from datetime import datetime
import warnings

# Suppress any warning whose message matches "does not have many workers"
# (presumably the Lightning DataLoader num_workers hint — confirm).
warnings.filterwarnings("ignore", ".*does not have many workers.*")

# Location of the source CSV; only the path is defined here, nothing is read yet.
DATA_PATH = "../data/custom/tmp_dataset_with_interest_col_modified.csv"

### Dataset

In [None]:
# Dataset - Single Series
class TSSingleDataset(Dataset):
    """Sliding-window dataset over one multivariate time series.

    Sample ``idx`` is the pair ``(x, y)`` where ``x`` holds ``input_steps``
    consecutive scaled rows starting at ``idx`` and ``y`` holds the
    ``output_steps`` rows that immediately follow. Both keep every column of
    ``x_cols`` (``y`` is not restricted to a single target column).

    Args:
        data: A ``pd.DataFrame``, or anything ``pd.DataFrame(...)`` accepts.
            Rows are used in the order given (assumed chronological — the
            class itself does not sort).
        x_cols: Column labels selected as features.
        input_steps: Number of rows in each input window.
        output_steps: Number of rows in each target window.
        scaler: Object exposing ``transform``; used as-is when given.
            When ``None`` a ``StandardScaler`` is fitted on ``data[x_cols]``.
    """

    def __init__(self, data, x_cols, input_steps, output_steps, scaler=None):
        self.dataframe = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
        self.input_steps = input_steps
        self.output_steps = output_steps

        # Reuse the caller's scaler (e.g. the one fitted on the training split)
        # or fit a fresh one on this data.
        if scaler is not None:
            self.scaler = scaler
        else:
            self.scaler = StandardScaler()
            self.scaler.fit(self.dataframe[x_cols])

        # Scaled features as a float32 tensor of shape (n_rows, n_features).
        self.data = torch.tensor(
            self.scaler.transform(self.dataframe[x_cols]),
            dtype=torch.float32,
        )
        self.n_features = self.data.shape[1]  # feature-dim

    def __len__(self):
        # Number of complete (input + output) windows. A series shorter than
        # one window has no samples; clamp at zero because ``len()`` raises
        # ``ValueError`` when ``__len__`` returns a negative number.
        n_windows = len(self.data) - self.input_steps - self.output_steps + 1
        return max(0, n_windows)

    def __getitem__(self, idx):
        # x : (input_steps, n_features), y : (output_steps, n_features)
        split = idx + self.input_steps
        x = self.data[idx:split, :]
        y = self.data[split : split + self.output_steps, :]
        return x, y

In [None]:
# Dataset - Multi Series
class TSMultiDataset(Dataset):
    """Sliding-window dataset over several equally long time series.

    The flat sample index is mapped to ``(series, offset)``; each sample is
    ``(x, y)`` with ``x`` the ``input_steps`` scaled rows starting at
    ``offset`` and ``y`` the ``output_steps`` rows that follow, both taken
    from the same series and keeping every column of ``x_cols``.

    Args:
        data: One ``pd.DataFrame`` or a list of them. The window count is
            derived from the first frame and the frames are stacked into one
            array, so all frames are expected to have the same number of rows.
        x_cols: Column labels selected as features.
        input_steps: Number of rows in each input window.
        output_steps: Number of rows in each target window.
        scaler: Object exposing ``transform``; used as-is when given.
            When ``None`` a ``StandardScaler`` is fitted on the concatenation
            of all frames.
    """

    def __init__(
        self,
        data: Union[List[pd.DataFrame], pd.DataFrame],
        x_cols,
        input_steps,
        output_steps,
        scaler=None,
    ):
        self.input_steps = input_steps
        self.output_steps = output_steps
        self.df_list = [data] if isinstance(data, pd.DataFrame) else data
        self.df_combined = pd.concat(self.df_list, axis=0)

        # Windows available per series. Clamp at zero: a series shorter than
        # one full window would otherwise yield a negative dataset length,
        # which makes ``len()`` raise ``ValueError``.
        self.series_length = max(
            0, self.df_list[0].shape[0] - self.input_steps - self.output_steps + 1
        )

        # Reuse the caller's scaler or fit one across all series together.
        if scaler is not None:
            self.scaler = scaler
        else:
            self.scaler = StandardScaler()
            self.scaler.fit(self.df_combined[x_cols])

        # dim : (N_series, N_timesteps, N_features)
        self.data = torch.tensor(
            np.asarray([self.scaler.transform(df[x_cols]) for df in self.df_list]),
            dtype=torch.float32,
        )
        self.n_features = self.data.shape[2]

    def __len__(self):
        return self.series_length * len(self.df_list)

    def __getitem__(self, idx):
        # Flat index -> (which series, window start inside that series).
        idx_1, idx_2 = divmod(idx, self.series_length)
        split = idx_2 + self.input_steps

        # shape : X - (input_steps, N_features), Y - (output_steps, N_features)
        x = self.data[idx_1, idx_2:split, :]
        y = self.data[idx_1, split : split + self.output_steps, :]
        return x, y

### LightningDataModule

In [None]:
# DataModule - Single Series
class TSSingleDataModule(pl.LightningDataModule):
    """Lightning data module for the time series of a single ``EMD_CD``.

    ``prepare_data`` reads the CSV, drops rows containing missing values and
    keeps the rows of ``emd_cd`` ordered by ``STD_YM``. ``setup`` splits them
    without shuffling into train / validation / test windowed datasets that
    all share the scaler fitted on the training split.

    Args:
        data_path: Path of the CSV file to read.
        emd_cd: Value matched against the ``EMD_CD`` column.
        input_steps: Rows per input window.
        output_steps: Rows per target window.
        test_size: Number of trailing rows reserved for the test split.
        val_size: Number of rows (just before the test rows) reserved for
            validation.
        batch_size: Batch size of every dataloader.
        x_cols: Feature columns. When empty or ``None`` every column from the
            third one onwards is used (assumes the first two columns are the
            ``EMD_CD`` / ``STD_YM`` keys — confirm against the CSV).
    """

    def __init__(
        self,
        data_path: str,
        emd_cd: str,
        input_steps: int,
        output_steps: int,
        test_size: int = 12,
        val_size: int = 12,
        batch_size: int = 8,
        x_cols: list = None,
    ) -> None:
        super().__init__()
        self.data_path = data_path
        self.input_steps = input_steps
        self.output_steps = output_steps
        self.emd_cd = emd_cd
        self.x_cols = x_cols
        self.test_size = test_size
        self.val_size = val_size
        self.batch_size = batch_size

    def prepare_data(self) -> None:
        """Load the CSV and select/order the rows and feature columns."""
        df = pd.read_csv(self.data_path, low_memory=False)
        df.dropna(how="any", inplace=True)
        self.dataframe = df.loc[df["EMD_CD"] == self.emd_cd].sort_values(by="STD_YM")
        # Same default rule as TSMultiDataModule: no explicit columns means
        # "everything after the two leading key columns".
        if not self.x_cols:
            self.x_cols = list(df.columns[2:])  # exclude index columns

        # Y (vacancy_rate) column must be at the front
        if self.x_cols[0] != "vacancy_rate":
            self.x_cols.insert(0, self.x_cols.pop(self.x_cols.index("vacancy_rate")))

    def setup(self, stage: str = None) -> None:
        """Build the train / validation / test datasets.

        With ``shuffle=False`` the held-out part is the tail of the frame, so
        the split is (everything else, ``val_size`` rows, ``test_size`` rows)
        in row order.
        """
        train, test = train_test_split(
            self.dataframe,
            test_size=self.test_size,
            shuffle=False,
        )
        # The validation split is sized by val_size (not test_size), so the
        # constructor argument is actually honoured.
        train, val = train_test_split(train, test_size=self.val_size, shuffle=False)

        # scaler is set from train-set only
        self.train = TSSingleDataset(
            train, self.x_cols, self.input_steps, self.output_steps
        )
        self.scaler = self.train.scaler
        self.n_features = self.train.n_features
        self.validation = TSSingleDataset(
            val, self.x_cols, self.input_steps, self.output_steps, scaler=self.scaler
        )
        self.test = TSSingleDataset(
            test, self.x_cols, self.input_steps, self.output_steps, scaler=self.scaler
        )

    def train_dataloader(self):
        """Shuffled loader over the training windows."""
        return DataLoader(self.train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """In-order loader over the validation windows."""
        return DataLoader(self.validation, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        """In-order loader over the test windows."""
        return DataLoader(self.test, batch_size=self.batch_size, shuffle=False)

In [None]:
# DataModule - Multi Series
class TSMultiDataModule(pl.LightningDataModule):
    """Lightning data module that serves every ``EMD_CD`` series of a CSV.

    ``prepare_data`` builds one frame per distinct ``EMD_CD`` (ordered by
    ``STD_YM``); ``setup`` splits each frame without shuffling and wraps the
    parts in ``TSMultiDataset`` objects that share the training scaler.

    Args:
        data_path: Path of the CSV file to read.
        input_steps: Rows per input window.
        output_steps: Rows per target window.
        test_size: Trailing rows of each series reserved for testing.
        val_size: Rows just before the test rows reserved for validation.
        batch_size: Batch size of every dataloader.
        x_cols: Feature columns; when empty or ``None`` every column from the
            third one onwards of the first per-EMD frame is used.
    """

    def __init__(
        self,
        data_path: str,
        input_steps: int,
        output_steps: int,
        test_size: int = 12,
        val_size: int = 12,
        batch_size: int = 8,
        x_cols: list = None,
    ) -> None:
        super().__init__()
        # source / feature selection
        self.data_path = data_path
        self.x_cols = x_cols
        # windowing
        self.input_steps = input_steps
        self.output_steps = output_steps
        # split sizes and batching
        self.test_size = test_size
        self.val_size = val_size
        self.batch_size = batch_size

    def prepare_data(self) -> None:
        """Read the CSV and build the per-EMD frame list and feature list."""
        frame = (
            pd.read_csv(self.data_path, low_memory=False)
            .dropna(how="any")
            .set_index("EMD_CD")
        )

        per_emd = []
        for emd in frame.index.unique():
            series = frame.loc[emd].reset_index(drop=False)
            per_emd.append(series.sort_values(by="STD_YM"))
        self.df_list = per_emd

        if not self.x_cols:
            # exclude index columns
            self.x_cols = list(self.df_list[0].columns[2:])

        # Y (vacancy_rate) column must be at the front
        if self.x_cols[0] != "vacancy_rate":
            target_pos = self.x_cols.index("vacancy_rate")
            self.x_cols.insert(0, self.x_cols.pop(target_pos))

    def setup(self, stage: str = None) -> None:
        """Split every series into (rest, val_size, test_size) row blocks."""
        trains, vals, tests = [], [], []
        for series in self.df_list:
            head, test_part = train_test_split(
                series, test_size=self.test_size, shuffle=False
            )
            train_part, val_part = train_test_split(
                head, test_size=self.val_size, shuffle=False
            )
            trains.append(train_part)
            vals.append(val_part)
            tests.append(test_part)

        window = dict(
            x_cols=self.x_cols,
            input_steps=self.input_steps,
            output_steps=self.output_steps,
        )

        # scaler is set from train-set only
        self.train = TSMultiDataset(trains, **window)
        self.scaler = self.train.scaler
        self.n_features = self.train.n_features
        self.validation = TSMultiDataset(vals, scaler=self.scaler, **window)
        self.test = TSMultiDataset(tests, scaler=self.scaler, **window)

    def _loader(self, dataset, shuffle):
        # Single place where DataLoader options are decided.
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def train_dataloader(self):
        """Shuffled loader over the training windows."""
        return self._loader(self.train, True)

    def val_dataloader(self):
        """In-order loader over the validation windows."""
        return self._loader(self.validation, False)

    def test_dataloader(self):
        """In-order loader over the test windows."""
        return self._loader(self.test, False)

### LSTM (Simple)

In [None]:
# Model
class LSTMSimple(nn.Module):
    """LSTM encoder followed by one linear layer that emits all future steps.

    The linear head maps the LSTM output at the last time step to a flat
    vector of ``output_steps * input_size`` values.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden width.
        num_layers: Number of stacked LSTM layers.
        output_steps: Number of future steps predicted at once.
        bidirectional: Whether the LSTM runs in both directions.
        scaler: Stored on the module unchanged; not used in ``forward``.
    """

    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers,
        output_steps,
        bidirectional=False,
        scaler=None,
    ):
        super().__init__()
        self.scaler = scaler
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.output_steps = output_steps
        # input_size == N_FEATURES
        self.output_size = output_steps * input_size

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=0,
            bidirectional=bidirectional,
            batch_first=True,
        )
        # A bidirectional LSTM concatenates both directions, doubling the width.
        head_in = hidden_size * 2 if bidirectional else hidden_size
        self.fc = nn.Linear(head_in, self.output_size)

    def forward(self, x):
        """Return a ``(batch, output_steps * input_size)`` prediction for ``x``.

        ``x`` is batch-first: ``(batch, time, input_size)``.
        """
        directions = 2 if self.bidirectional else 1
        state_shape = [self.num_layers * directions, x.size(0), self.hidden_size]

        # Explicit zero initial hidden / cell states on the input's device.
        h_0 = torch.zeros(state_shape, device=x.device)
        c_0 = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (h_0, c_0))
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

In [None]:
# Lightning module
from typing import Any


class LSTMSimpleLightningModule(pl.LightningModule):
    """Lightning wrapper that trains an ``LSTMSimple`` with an MSE objective.

    Construct it either from a ready ``model`` or from a ``cfg`` mapping with
    the keys ``input_size``, ``output_steps``, ``hidden_size``,
    ``num_layers`` and optionally ``bidirectional``; in the latter case
    ``scaler`` is required and handed to the new model.
    """

    def __init__(self, model=None, cfg=None, scaler=None):
        super().__init__()
        assert not (model is None and cfg is None)

        # init by either model or CfgNode
        if model is None:
            assert scaler is not None, "Dataset Scaler must be provided with CfgNode"
            model = LSTMSimple(
                input_size=cfg["input_size"],
                output_steps=cfg["output_steps"],
                hidden_size=cfg["hidden_size"],
                num_layers=cfg["num_layers"],
                bidirectional=cfg.get("bidirectional", False),
                scaler=scaler,
            )
        self.model = model

        self.scaler = self.model.scaler
        self.criterion = nn.MSELoss()
        # (prediction, flattened target) pairs collected by test_step
        self.test_predictions = []

    def forward(self, x):
        return self.model(x)

    def _flat_loss(self, batch):
        # Flatten the target to (batch, -1) so it lines up with the model's
        # flat output, then score the prediction with the criterion.
        x, y = batch
        target = y.view(y.size(0), -1)
        prediction = self(x)
        return self.criterion(prediction, target), prediction, target

    def training_step(self, batch, batch_idx):
        loss, _, _ = self._flat_loss(batch)
        self.log_dict({"train_mse_loss": loss}, on_epoch=True)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss, _, _ = self._flat_loss(batch)
        self.log_dict({"val_mse_loss": loss}, on_epoch=True, on_step=False)
        return {"loss": loss}

    def test_step(self, batch, batch_idx):
        loss, prediction, target = self._flat_loss(batch)
        self.test_predictions.append((prediction, target))
        self.log_dict({"test_mse_loss": loss}, on_step=True)
        return {"loss": loss}

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x, _ = batch
        flat = self(x)
        # Reshape the flat output as time series (B, output_steps, input_size)
        return flat.view(x.size(0), self.model.output_steps, self.model.input_size)

    def configure_optimizers(self):
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            mode="min",
            patience=2,
            min_lr=1e-5,
        )
        # The plateau scheduler is driven by the logged validation loss.
        self.scheduler = {"scheduler": plateau, "monitor": "val_mse_loss"}
        return {"optimizer": self.optimizer, "lr_scheduler": self.scheduler}

### MLFlow Run

**configs**
1. dataset : `data_path, input_steps, output_steps, test_size, val_size, x_cols`
2. model-type : `model_type == enum(['lstm','gru','cnn'])`
3. model-lstm : `num_layers, hidden_size, bidirectional(bool)`
4. model-gru : 
5. model-cnn :

In [None]:
from yacs.config import CfgNode as CN

# Root configuration node; settings are grouped by concern below.
cfg = CN()

# --- model selection ---
cfg.MODEL_TYPE = "lstm"

# --- dataset and windowing ---
# NOTE(review): this path starts with "./" while the DATA_PATH constant near
# the imports starts with "../" — confirm the intended working directory.
cfg.DATA_PATH = "./data/custom/tmp_dataset_with_interest_col_modified.csv"
cfg.X_COLS = None
cfg.INPUT_STEPS = 3
cfg.OUTPUT_STEPS = 1
cfg.VAL_SIZE = 12
cfg.TEST_SIZE = 12

# --- training loop ---
cfg.BATCH_SIZE = 8
cfg.N_EPOCHS = 50

# --- LSTM hyper-parameters ---
cfg.LSTM = CN()
cfg.LSTM.HIDDEN_SIZE = 32
cfg.LSTM.NUM_LAYERS = 1
cfg.LSTM.BIDIRECTIONAL = False

# --- MLflow tracking ---
cfg.MLFLOW_TRACKING_URI = "databricks"
cfg.DATABRICKS_WORKSPACE = "/Users/cwwojin@gmail.com"
# Experiment name carries the upper-cased model type.
cfg.EXPERIMENT_NAME = "Compas_" + cfg.MODEL_TYPE.upper()

In [None]:
# Multi-series data module driven entirely by the config node.
data_module = TSMultiDataModule(
    cfg.DATA_PATH,
    cfg.INPUT_STEPS,
    cfg.OUTPUT_STEPS,
    test_size=cfg.TEST_SIZE,
    val_size=cfg.VAL_SIZE,
    batch_size=cfg.BATCH_SIZE,
    x_cols=cfg.X_COLS,
)
# Run both stages by hand: n_features and scaler are only set by setup(),
# and the model constructor below reads them.
data_module.prepare_data()
data_module.setup()

# Lightning module built from a plain dict of model settings plus the
# scaler fitted on the training split.
l_model = LSTMSimpleLightningModule(
    cfg={
        "input_size": data_module.n_features,
        "output_steps": cfg.OUTPUT_STEPS,
        "num_layers": cfg.LSTM.NUM_LAYERS,
        "hidden_size": cfg.LSTM.HIDDEN_SIZE,
        "bidirectional": cfg.LSTM.BIDIRECTIONAL,
    },
    scaler=data_module.scaler,
)

In [None]:
# Train with MLflow
mlflow.login()
# Run-name suffix: date, then hour-minute-second. "%M" / "%S" are minutes and
# seconds; lower-case "%m" is the month and "%s" is a non-portable directive,
# so neither belongs in the time part.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Trainer that logs metrics (and the model, via log_model=True) to the MLflow
# experiment located under the configured workspace path.
trainer = pl.Trainer(
    max_epochs=cfg.N_EPOCHS,
    devices="auto",
    logger=MLFlowLogger(
        experiment_name=f"{cfg.DATABRICKS_WORKSPACE}/{cfg.EXPERIMENT_NAME}",
        run_name=f"run_{timestamp}",
        tracking_uri=cfg.MLFLOW_TRACKING_URI,
        log_model=True,
    ),
    check_val_every_n_epoch=1,
)

In [None]:
# Fit l_model with the loaders supplied by data_module.
trainer.fit(
    model=l_model,
    datamodule=data_module,
)

In [None]:
# Evaluate l_model on data_module's test loader; test_step logs
# "test_mse_loss" and appends (prediction, target) pairs to
# l_model.test_predictions.
trainer.test(
    model=l_model,
    datamodule=data_module,
)

### Export to TorchScript
- Export model -> `.pt` file of the scripted model
- Export data scaler -> `.pkl` file using scikit-learn

In [None]:
# # export lightningmodule
# l_model.to_torchscript("./models/lightning_module.pt")

# # import & check forward
# imported_model = torch.jit.load("./models/lightning_module.pt")
# imported_model(torch.Tensor(np.random.randn(8, 3, 24))).shape

In [None]:
# # export pytorch nn.Module
# scripted_model = torch.jit.script(l_model.model)
# torch.jit.save(scripted_model, "./models/torch_module.pt")

# # import & check forward
# imported_model = torch.jit.load("./models/torch_module.pt")
# imported_model(torch.Tensor(np.random.randn(8, 3, 24))).shape

### inference & graph

In [None]:
import pandas as pd
import os.path as path
from compas.inference import ForecastModel

# Forecaster loaded from a saved run directory.
model = ForecastModel(
    model_path="../.saved_models/gru_custom_GRU_uni_in_6_out_1_2024-07-22_09:36:32"
)

# Observations without missing values, and the EMD codes they contain.
df = pd.read_csv("../data/custom/dataset_v1.1.csv").dropna()
emd_target = list(df["EMD_CD"].unique())

# EMD code -> EMD name lookup (used for plot titles).
df_emd = pd.read_csv("../data/custom/data_01.pnu_gid_emd_map.csv", low_memory=False)
emd_map = dict(zip(df_emd["EMD_CD"], df_emd["EMD_NM"]))

# Hold out the last 12 rows of every EMD series, forecast 12 steps from the
# remaining rows, and keep (ground truth, prediction) of vacancy_rate.
results = {}
for emd_cd in emd_target:
    df_sample = df[df["EMD_CD"] == emd_cd].sort_values(by="STD_YM")
    history, holdout = df_sample.iloc[:-12], df_sample.iloc[-12:]
    df_gt = holdout.set_index("STD_YM")
    df_out = model.forecast(history, steps=12)
    results[emd_cd] = (df_gt["vacancy_rate"], df_out["vacancy_rate"])

In [None]:
import matplotlib.pyplot as plt

# Font used for the (non-ASCII) EMD names in the titles; "AppleGothic" is
# presumably only available on macOS — confirm on other platforms.
plt.rc("font", family="AppleGothic")

# Two panels per row. Keep at least the original 5 rows, but grow the grid
# when there are more than 10 series so that indexing `ax` cannot overflow.
n_cols = 2
n_rows = max(5, (len(results) + n_cols - 1) // n_cols)
fig, ax = plt.subplots(
    n_rows,
    n_cols,
    figsize=(36, 24 * n_rows / 5),  # same height per row as the 5-row layout
    squeeze=False,  # always a 2-D array of axes
)
for i, (k, v) in enumerate(results.items()):
    panel = ax[i // n_cols, i % n_cols]
    # v is (ground truth, prediction); draw both on the same panel.
    for series, label in zip(v, ("gt", "pred")):
        series.plot(
            ax=panel,
            # ylim=(0, 0.6),
            title=emd_map[k],
            legend=True,
            label=label,
            ylabel="vacancy rate",
        )
# fig.savefig("fig.png", dpi=300)

### inference - Sejong City

In [None]:
import pandas as pd
import os.path as path
from compas.inference import ForecastModel

# Forecaster loaded from a saved run directory (ForecastModel comes from
# compas.inference).
model = ForecastModel("../.saved_models/GRU_uni_in_3_out_1_2024-07-22_13:44:18")
# Unlike the earlier evaluation cell, rows with missing values are kept here.
df = pd.read_csv("../data/custom/dataset_v1.1.csv")
# Distinct EMD_CD values found in the data.
emd_target = list(df["EMD_CD"].unique())

In [None]:
# 1. City-level inference: collapse all EMDs into one series per STD_YM.
# Aggregation rule per column (key order defines the output column order).
city_agg = {
    "vacancy_rate": "sum",
    "move_pop": "sum",
    "area_pop": "sum",
    "service_type_count": "max",
    **dict.fromkeys(
        [
            "biz_opens",
            "biz_closures",
            "bld_tot_area",
            "bld_area_small",
            "bld_area_midlarge",
            "bld_area_complex",
        ],
        "sum",
    ),
    **dict.fromkeys(["maxgrid_lat", "maxgrid_lon"], "mean"),
    **dict.fromkeys(
        [
            "call_rate",
            "novel_balance_COFIX",
            "bld_loan_complex",
            "novel_trade_COFIX",
            "avg_comp_stock",
            "balance_COFIX",
            "avg_treasury_10yrs",
            "bld_loan_small",
            "avg_treasury_5yrs",
            "CPI",
            "bld_loan_midlarge",
            "avg_treasury_3yrs",
            "CD_91",
            "standard_interest",
        ],
        "first",
    ),
}

df_sejong = df.copy()
# Weight the rate by building area so that summing and dividing by the summed
# area afterwards yields an area-weighted average rate.
df_sejong["vacancy_rate"] = df_sejong["vacancy_rate"].mul(df_sejong["bld_tot_area"])
df_sejong = df_sejong.groupby("STD_YM").agg(city_agg)
df_sejong = df_sejong.reset_index().sort_values("STD_YM")
df_sejong["vacancy_rate"] = df_sejong["vacancy_rate"].div(df_sejong["bld_tot_area"])
city_forecast = model.forecast(df_sejong, steps=19)
df_sejong_out = city_forecast[["vacancy_rate"]]

# 2. Per-EMD inference, then the same area-weighted combination of forecasts.
results = []
for emd_cd in emd_target:
    df_sample = df[df["EMD_CD"] == emd_cd].sort_values(by="STD_YM")
    df_out = model.forecast(df_sample, steps=19)
    results.append(df_out[["vacancy_rate", "bld_tot_area"]])

df_out = pd.concat(results, axis=0).reset_index()
df_out = df_out.rename(columns={"index": "STD_YM"})
df_out["vacancy_rate"] = df_out["vacancy_rate"].mul(df_out["bld_tot_area"])
df_out = df_out.groupby("STD_YM").sum()
df_out["vacancy_rate"] = df_out["vacancy_rate"].div(df_out["bld_tot_area"])
df_out = df_out[["vacancy_rate"]]

# Both forecasts side by side, written to disk.
emd_wise = df_out.rename(columns={"vacancy_rate": "vac_EMD"})
city_wise = df_sejong_out.rename(columns={"vacancy_rate": "vac_Sejong"})
result_df = pd.concat([emd_wise, city_wise], axis=1)
result_df.to_csv("./inference_result.csv")

In [None]:
import plotly.express as px


# Observed city-level rate, followed by each forecast variant.
observed = df_sejong.set_index("STD_YM")["vacancy_rate"]
emd_curve = pd.concat([observed, result_df["vac_EMD"]]).rename("EMD-wise")
city_curve = pd.concat([observed, result_df["vac_Sejong"]]).rename("City-wise")

fig = px.line(
    pd.concat([emd_curve, city_curve], axis=1),
    title="Vacancy Rate Prediction",
    labels={"index": "Time", "value": "Vacancy Rate"},
)
# Dashed vertical marker at 2024-05 (presumably where the forecast starts — confirm).
fig.add_vline(x="2024-05", line_dash="dash", line_color="green")
fig.show()

### Feature Importance - Permutation importance
- feature importance = **loss difference when shuffling a single feature / column**

In [None]:
import pandas as pd
import os.path as path
import plotly.express as px

# Directory of the saved run whose feature-importance table is plotted.
DATA_PATH = "../data/.saved_models/LSTM_uni_in_3_out_1_2024-07-23_06:45:07"

importance_csv = path.join(DATA_PATH, "feature_importance.csv")
df = pd.read_csv(importance_csv)

# One bar per feature, bar height = its importance value.
fig = px.bar(
    df,
    x="feature_name",
    y="feature_importance",
    title="Feature Importance (Relative)",
)
fig.show()