### inference & graph

In [None]:
import pandas as pd
import os.path as path
from compas.inference import ForecastModel

# Load the trained forecaster from its saved-model directory.
model = ForecastModel(
    model_path="../.saved_models/gru_custom_GRU_uni_in_6_out_1_2024-07-22_09:36:32"
)

# Source dataset; any row containing a missing value is discarded.
df = pd.read_csv("../data/custom/dataset_v1.1.csv").dropna()
emd_target = list(df["EMD_CD"].unique())

# Lookup table: EMD code -> EMD name.
df_emd = pd.read_csv("../data/custom/data_01.pnu_gid_emd_map.csv", low_memory=False)
emd_map = dict(zip(df_emd["EMD_CD"], df_emd["EMD_NM"]))

# Backtest per EMD: hide the last 12 months, forecast them from the earlier
# rows, and keep (ground truth, prediction) for the vacancy rate.
holdout_months = 12
results = {}
for emd_cd in emd_target:
    df_sample = df[df["EMD_CD"] == emd_cd].sort_values(by="STD_YM")
    # Held-out tail, indexed by STD_YM.
    df_gt = df_sample.iloc[-holdout_months:].set_index("STD_YM")
    # Forecast the same number of steps from everything before the tail.
    df_out = model.forecast(df_sample.iloc[:-holdout_months], steps=holdout_months)
    results[emd_cd] = (df_gt["vacancy_rate"], df_out["vacancy_rate"])

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): "AppleGothic" is presumably chosen so the EMD names used as
# titles render correctly; confirm the font exists on the machine running this.
plt.rc("font", family="AppleGothic")

# Two panels per row. The original 5x2 grid on a 36x24 canvas is kept as the
# minimum; with more than ten entries in `results` the grid (and the figure
# height) grows instead of failing with an IndexError on `ax[...]`.
n_cols = 2
n_rows = max(5, (len(results) + n_cols - 1) // n_cols)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(36, 24 * n_rows / 5))
for i, (k, v) in enumerate(results.items()):
    panel = ax[i // n_cols, i % n_cols]
    # v is (ground truth, prediction); draw both on the same panel.
    # Pass ylim=(0, 0.6) here to force a common y-range across panels.
    for series, label in zip(v, ("gt", "pred")):
        series.plot(
            ax=panel,
            title=emd_map[k],
            legend=True,
            label=label,
            ylabel="vacancy rate",
        )
# fig.savefig("fig.png", dpi=300)

### inference - Sejong City

In [None]:
import pandas as pd
import os.path as path
import plotly.express as px

df = pd.read_csv("../data/dataset_v1.2.csv")
emd_target = list(df["EMD_CD"].unique())

# 1. Sejong-city total dataset: collapse all rows that share a STD_YM into one.
# How each column is combined within a STD_YM group. The key order is kept
# exactly as is because it fixes the column order of the aggregated frame.
sejong_agg = {
    "vacancy_rate": "sum",
    "move_pop": "sum",
    "area_pop": "sum",
    "service_type_count": "max",
    "biz_opens": "sum",
    "biz_closures": "sum",
    "bld_tot_area": "sum",
    "bld_area_small": "sum",
    "bld_area_midlarge": "sum",
    "bld_area_complex": "sum",
    "maxgrid_lat": "mean",
    "maxgrid_lon": "mean",
    "call_rate": "first",
    "novel_balance_COFIX": "first",
    "bld_loan_complex": "first",
    "novel_trade_COFIX": "first",
    "avg_comp_stock": "first",
    "balance_COFIX": "first",
    "avg_treasury_10yrs": "first",
    "bld_loan_small": "first",
    "avg_treasury_5yrs": "first",
    "CPI": "first",
    "bld_loan_midlarge": "first",
    "avg_treasury_3yrs": "first",
    "CD_91": "first",
    "standard_interest": "first",
}

df_sejong = df.copy()
# Weight each rate by its building area so that the rates can be summed ...
df_sejong["vacancy_rate"] = df_sejong["bld_tot_area"] * df_sejong["vacancy_rate"]
df_sejong = df_sejong.groupby("STD_YM").agg(sejong_agg)
df_sejong = df_sejong.reset_index().sort_values("STD_YM")
# ... then divide by the summed area to obtain the area-weighted rate.
df_sejong["vacancy_rate"] = df_sejong["vacancy_rate"] / df_sejong["bld_tot_area"]

In [None]:
# Line chart of the aggregated vacancy rate over STD_YM.
# (Optional chart title: 'Vacancy Rate by Month'.)
fig = px.line(data_frame=df_sejong, x="STD_YM", y="vacancy_rate")
fig.show()

In [None]:
import pandas as pd
import os.path as path
from compas.inference import ForecastModel
import plotly.express as px

# Trained forecaster loaded from its saved-model directory.
model = ForecastModel("../data/.saved_models/GRU_03_emd_top3_union_GRU_uni_in_3_out_1_2024-07-26_06:24:46")
df = pd.read_csv("../data/dataset_v1.2.csv")
emd_target = list(df["EMD_CD"].unique())

# 1. Sejong-city total dataset: collapse all rows that share a STD_YM into one
#    and forecast 19 steps from that single city-wide series.
# The key order of the spec is kept exactly as is because it fixes the column
# order of the frame handed to the model.
sejong_agg = {
    "vacancy_rate": "sum",
    "move_pop": "sum",
    "area_pop": "sum",
    "service_type_count": "max",
    "biz_opens": "sum",
    "biz_closures": "sum",
    "bld_tot_area": "sum",
    "bld_area_small": "sum",
    "bld_area_midlarge": "sum",
    "bld_area_complex": "sum",
    "maxgrid_lat": "mean",
    "maxgrid_lon": "mean",
    "call_rate": "first",
    "novel_balance_COFIX": "first",
    "bld_loan_complex": "first",
    "novel_trade_COFIX": "first",
    "avg_comp_stock": "first",
    "balance_COFIX": "first",
    "avg_treasury_10yrs": "first",
    "bld_loan_small": "first",
    "avg_treasury_5yrs": "first",
    "CPI": "first",
    "bld_loan_midlarge": "first",
    "avg_treasury_3yrs": "first",
    "CD_91": "first",
    "standard_interest": "first",
}

df_sejong = df.copy()
# Area-weight the rate, sum per STD_YM, then divide by the summed area.
df_sejong["vacancy_rate"] = df_sejong["bld_tot_area"] * df_sejong["vacancy_rate"]
df_sejong = df_sejong.groupby("STD_YM").agg(sejong_agg)
df_sejong = df_sejong.reset_index().sort_values("STD_YM")
df_sejong["vacancy_rate"] = df_sejong["vacancy_rate"] / df_sejong["bld_tot_area"]
df_sejong_out = model.forecast(df_sejong, steps=19)[["vacancy_rate"]]

# 2. Inference per EMD, combined afterwards with the same area weighting.
results = []
for emd_cd in emd_target:
    df_sample = df[df["EMD_CD"] == emd_cd].sort_values(by="STD_YM")
    df_out = model.forecast(df_sample, steps=19)
    results.append(df_out[["vacancy_rate", "bld_tot_area"]])

# Stack the per-EMD forecasts; the former index becomes the STD_YM column.
df_out = pd.concat(results, axis=0).reset_index()
df_out = df_out.rename(columns={"index": "STD_YM"})
df_out["vacancy_rate"] = df_out["bld_tot_area"] * df_out["vacancy_rate"]
df_out = df_out.groupby("STD_YM").agg("sum")
df_out["vacancy_rate"] = df_out["vacancy_rate"] / df_out["bld_tot_area"]
df_out = df_out[["vacancy_rate"]]

# Both forecast variants side by side.
result_df = pd.concat(
    [
        df_out.rename(columns={"vacancy_rate": "vac_EMD"}),
        df_sejong_out.rename(columns={"vacancy_rate": "vac_Sejong"}),
    ],
    axis=1,
)
result_df.to_csv("./inference_result.csv")

# Observed city-wide series followed by the EMD-wise forecast; the same series
# is written to disk and plotted.
observed = df_sejong.set_index("STD_YM")["vacancy_rate"]
emd_wise = pd.concat([observed, result_df["vac_EMD"]]).rename("EMD-wise")
emd_wise.to_csv("./inference_result_combined.csv")

# To compare against the city-wide forecast, append the analogous series built
# from result_df['vac_Sejong'] (renamed 'Sejong-total') to this list.
plot_frame = pd.concat([emd_wise], axis=1)
fig = px.line(
    plot_frame,
    title="Vacancy Rate Prediction",
    labels={"index": "Time", "value": "Vacancy Rate"},
)
# NOTE(review): the dashed line at '2024-05' presumably marks where the
# forecast starts - confirm against the last STD_YM in the dataset.
fig.add_vline(x="2024-05", line_dash="dash", line_color="green")
fig.update_layout(showlegend=False)
fig.show()

### Feature Importance - Permutation importance
- Feature importance is measured as the **difference in loss when a single feature (column) is shuffled**.

In [None]:
import pandas as pd
import plotly.express as px

# Per-feature losses keyed by feature name; the row labelled 'base' supplies
# the reference loss that every other row is compared against.
df_l = pd.read_csv("./feature_importance_exclude1.csv")
df_l = df_l.set_index("feature_name")
baseline = df_l.loc["base", "loss"]

# Importance = loss minus the reference loss, largest first.
# (Dividing by the column sum afterwards would turn these into shares.)
df_l["loss"] = df_l["loss"] - baseline
df_l = df_l.sort_values(by="loss", ascending=False)
df_l = df_l.rename(columns={"loss": "feature_importance"})

axis_labels = {
    "feature_name": "Feature Name",
    "value": "loss difference (mse)",
}
fig = px.bar(
    df_l,
    title="LSTM Feature Importance - exclude 1 column",
    labels=axis_labels,
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
import pandas as pd
import plotly.express as px

# Feature-importance table stored next to the GRU checkpoint, keyed by feature
# name and ordered from most to least important.
df_l2 = pd.read_csv("../data/.saved_models/GRU_03_emd_top3_union_GRU_uni_in_3_out_1_2024-07-26_08:44:23/feature_importance.csv")
df_l2 = df_l2.set_index("feature_name")
df_l2 = df_l2.sort_values(by="feature_importance", ascending=False)

fig = px.bar(df_l2, title="GRU Feature Importance - noise 1 column")
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Features with a strictly positive importance. The comparison is `>` rather
# than `>=` because the 'base' reference row is exactly 0 after the baseline
# subtraction and would otherwise be listed as if it were a feature.
list(df_l.loc[df_l['feature_importance'] > 0].index)

In [None]:
# Names of the GRU features whose importance is not negative.
list(df_l2[df_l2["feature_importance"] >= 0].index)

### Feature Selection - Visualization

In [None]:
import plotly.express as px
import pandas as pd

# Feature selection by LSTM feature importance: features with an importance
# score > 0 (14 out of 26).
features_ex_importance = [
    "move_pop",
    "maxgrid_lon",
    "bld_area_complex",
    "novel_balance_COFIX",
    "CPI",
    "maxgrid_lat",
    "bld_loan_midlarge",
    "standard_interest",
    "CD_91",
    "avg_treasury_10yrs",
    "biz_opens",
    "bld_tot_area",
    "balance_COFIX",
    "avg_comp_stock",
]
# Second 14-feature subset, labelled "DA" in the chart below.
# NOTE(review): the abbreviation is not expanded anywhere in view - confirm.
features_DA = [
    "avg_comp_stock",
    "bld_loan_complex",
    "area_pop",
    "bld_loan_midlarge",
    "novel_balance_COFIX",
    "CPI",
    "bld_area_midlarge",
    "avg_treasury_10yrs",
    "avg_treasury_5yrs",
    "service_type_count",
    "avg_treasury_3yrs",
    "novel_trade_COFIX",
    "balance_COFIX",
    "bld_area_small",
]

# Test losses entered by hand for the three feature sets.
lstm_base_loss = 0.1515
lstm_feature_subset_ex_loss = 0.0694
lstm_feature_subset_da_loss = 0.0541

# One bar per feature set, in this order.
loss_by_feature_set = {
    "All features (26)": lstm_base_loss,
    "Select features - LSTM (14)": lstm_feature_subset_ex_loss,
    "Select features - DA (14)": lstm_feature_subset_da_loss,
}
loss_table = pd.DataFrame(
    list(loss_by_feature_set.values()),
    index=list(loss_by_feature_set.keys()),
    columns=["Test Loss (MSE)"],
)

fig = px.bar(
    loss_table,
    labels={"index": "", "value": "test_mse_loss"},
    title="Feature Selection - Test Loss Comparison",
)
fig.update_layout(showlegend=False)
fig.show()

### Feature Inspection - Inference on augmented data

In [None]:
import pandas as pd
