In [2]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    model_ready_data_paths,
    model_output_data_paths,
)

from jre_utils.visualize import plot_time_series
from jre_utils.process import get_cumulative_growth, get_cumulative_growth_from_base


# Session-wide display setup: suppress every warning category and let pandas
# show all columns of a frame instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [3]:
# Asset type used to pick the dataset vs. the asset type the frames are
# filtered down to for this analysis.
asset_type = "all"
investment_asset_type = "building"

# Short metric key -> name of the corresponding metric column.
metrics = dict(median="unit_price_median")

# Columns that identify one series, and one yearly observation of a series.
granularity_columns = ["area", "area_code", "asset_type"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric_key = "median"
metric = metrics[metric_key]

# Names derived from the selected metric.
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"

In [4]:
# Year boundaries for the analysis.
start_year = 2006
eval_start_year = 2020  # eval_years = [2020, 2021, 2022]
eval_end_year = 2022

# Pieces of the keys used to look up the input dataset and the model output.
dataset_key = "transactions"
years_ahead = 2
test_year = 2022

dataset_name = "_".join(
    ["sequence", dataset_key, asset_type, metric_key, str(years_ahead)]
)
output_dataset_name = f"{dataset_name}_{test_year}"

model_ready_data_path = model_ready_data_paths[dataset_name]
model_output_data_path = model_output_data_paths[output_dataset_name]

output_df = pd.read_csv(model_output_data_path)
core_df = pd.read_csv(model_ready_data_path)

# Keep only the asset type under study; the core data is additionally cut off
# at the first year of interest.
output_df = output_df[output_df["asset_type"] == investment_asset_type]
core_df = core_df[
    (core_df["asset_type"] == investment_asset_type)
    & (core_df["year"] >= start_year)
]

In [5]:
# What do I want to see with this tooling? What argument do I want to make?

# The model outputs normalized returns for each area for each eval year.
# I want to pick one target year, e.g. 2022.
# I want to sort the relative returns for that year.
# I want to compare several areas.
# I want to see the cumulative factors for each area; this will help me identify trends.

# Once this is done, write a narrative for the teammates.

In [6]:
# Rank the model output for one year, then narrow the ranking to one prefecture.
target_year = 2022
prefecture_code = 13  # (Tokyo, 13), (Hokkaido, 1)

is_target_year = output_df["year"] == target_year
results_df = output_df[is_target_year].sort_values(
    by=["predicted_normalized_return"], ascending=False
)

# Integer-dividing the area code by 1000 drops its last three digits; what is
# left is compared against the prefecture code.
area_prefecture = results_df["area_code"].astype(int) // 1000
prefectural_results_df = results_df[area_prefecture == prefecture_code]

print(len(prefectural_results_df))
prefectural_results_df.head(10)

53


Unnamed: 0,year,area_code,asset_type,predicted_normalized_return,yearly_price_growth,unit_price_median_pct_chg,unit_price_median_pct_chg_normalized_yearly
1602,2022,13361,building,0.188501,-0.575,-0.510256,-0.96033
2135,2022,13107,building,0.154623,0.268456,0.239714,0.265125
955,2022,13105,building,0.150699,0.207273,0.509091,0.705289
269,2022,13108,building,0.12171,0.037621,0.163265,0.140208
1912,2022,13103,building,0.118786,-0.03,0.202429,0.204202
1841,2022,13102,building,0.106212,0.530752,0.22807,0.246099
660,2022,13106,building,0.103457,0.363636,0.202105,0.203672
1590,2022,13104,building,0.103145,-0.030769,0.093852,0.026786
1682,2022,13116,building,0.096974,0.046402,0.198946,0.19851
843,2022,13109,building,0.092934,0.027473,0.147541,0.114514


In [7]:
# Last 10 rows of the prefecture table, i.e. the lowest predicted normalized
# returns given the descending sort applied when the table was built.
prefectural_results_df.tail(10)

Unnamed: 0,year,area_code,asset_type,predicted_normalized_return,yearly_price_growth,unit_price_median_pct_chg,unit_price_median_pct_chg_normalized_yearly
97,2022,13228,building,-0.096933,0.03956,0.091165,0.022396
187,2022,13205,building,-0.105954,-0.038462,0.064815,-0.020661
688,2022,13303,building,-0.108889,0.4848,0.568,0.801547
1663,2022,13224,building,-0.113233,0.241558,0.080382,0.004777
1954,2022,13215,building,-0.14701,0.005663,0.015873,-0.100632
1863,2022,13222,building,-0.159731,0.029514,0.092752,0.024988
347,2022,13227,building,-0.235453,0.121094,0.091129,0.022337
1836,2022,13218,building,-0.313857,-0.022523,-0.008069,-0.139754
1626,2022,13305,building,-0.360879,0.106257,-0.19394,-0.443469
2309,2022,13401,building,-0.55166,-0.241806,-0.586338,-1.08465


In [13]:
# Compare a handful of areas: the cumulative series of the price metric first,
# then one cumulative series per explanatory factor.
area_codes = [13103, 1100, 12238]
area_df = core_df[core_df["area_code"].isin(area_codes)].sort_values(by="year", ascending=True)

# Keyword arguments shared by every chart drawn in this cell.
plot_options = dict(
    # visible="legendonly",
    width=1200,
    height=600,
    # highlight=True,
    # highlight_range=(2020, 2022),
)

metric_growth_column = f"cumulative_{metric}_growth"
area_df[metric_growth_column] = get_cumulative_growth_from_base(area_df.copy(), metric)

plot_time_series(
    area_df,
    metric_growth_column,
    group_by_columns,
    granularity_columns,
    f"{metric_growth_column} over time",
    **plot_options,
)

# Factors accumulated with get_cumulative_growth.
growth_factors = [
    "taxable_income_growth",
    "total_tax_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
]

# Factors accumulated with get_cumulative_growth_from_base.
base_factors = [
    "taxpayer_count",
    "count",
    "population",
    "taxable_income"
]

# Output column for each factor. Base factors keep the
# "cumulative_<factor>_growth" name. Growth factors keep "cumulative_<factor>"
# unless that name is already taken by a base factor: "taxable_income_growth"
# and "taxable_income" would otherwise both write to
# "cumulative_taxable_income_growth", so the second loop would overwrite the
# first and the same chart would be drawn twice.
base_factor_columns = {
    factor: f"cumulative_{factor}_growth" for factor in base_factors
}
taken_columns = set(base_factor_columns.values())
growth_factor_columns = {
    factor: (
        f"cumulative_{factor}_from_growth_factor"
        if f"cumulative_{factor}" in taken_columns
        else f"cumulative_{factor}"
    )
    for factor in growth_factors
}

cumulative_factors = list(growth_factor_columns.values()) + list(
    base_factor_columns.values()
)

# Zeros in the factor columns are turned into NaN before accumulating.
area_df[growth_factors] = area_df[growth_factors].replace({0: np.nan})
area_df[base_factors] = area_df[base_factors].replace({0: np.nan})

for factor, column in growth_factor_columns.items():
    area_df[column] = get_cumulative_growth(area_df.copy(), factor)

for factor, column in base_factor_columns.items():
    area_df[column] = get_cumulative_growth_from_base(area_df.copy(), factor)

for cumulative_factor in cumulative_factors:
    plot_time_series(
        area_df,
        cumulative_factor,
        group_by_columns,
        granularity_columns,
        f"{cumulative_factor} over time",
        **plot_options,
    )



In [9]:
# Use population as the error-scaling parameter instead.

# Autocorrelation over each time series.
# Cross-series lead-lag analysis for pairs of time series.

# Forward-looking smoothing for the prediction column.
# Backward-looking smoothing for the factor column.

# Time series of all transactions.


In [12]:
# Chart the cumulative taxable-income series on its own.
# area_df has no "cumulative_taxable_income" column: the cumulative series for
# taxable income is stored as "cumulative_taxable_income_growth". The title is
# built from the plotted column so it no longer describes the price metric.
taxable_income_column = "cumulative_taxable_income_growth"

plot_time_series(
    area_df,
    taxable_income_column,
    group_by_columns,
    granularity_columns,
    f"{taxable_income_column} over time",
    # visible="legendonly",
    width=1200,
    height=600,
    # highlight=True,
    # highlight_range=(2020, 2022)
)

In [11]:
# area_df