In [5]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    model_ready_data_paths,
    model_output_data_paths,
)

from jre_utils.visualize import plot_time_series
from jre_utils.process import get_cumulative_growth, get_cumulative_growth_from_base


# Notebook-wide display setup.
# Silence every warning category for the rest of the session. This is a
# blanket filter, so genuine deprecation/data warnings are hidden as well.
warnings.simplefilter("ignore")
# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

In [6]:
# Asset type baked into the dataset name vs. the asset type actually analysed
# (the loaded frames are filtered down to `investment_asset_type`).
asset_type = "all"
investment_asset_type = "building"

# Short metric key -> name of the column that holds that metric.
metrics = dict(median="unit_price_median")

# Columns identifying one series, and one (series, year) observation.
granularity_columns = ["area", "area_code", "asset_type"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric used throughout the notebook.
metric_key = "median"
metric = metrics[metric_key]

# Derived column names: percentage change of the metric, and its
# "normalized yearly" variant.
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"

In [8]:
# Year boundaries used by the analysis.
start_year = 2006
eval_start_year = 2020  # eval_years = [2020, 2021, 2022]
eval_end_year = 2022
test_year = 2022

# Dataset identifiers; both path mappings are keyed by these names.
dataset_key = "transactions"
years_ahead = 2
dataset_name = f"sequence_{dataset_key}_{asset_type}_{metric_key}_{years_ahead}"
output_dataset_name = f"{dataset_name}_{test_year}"

model_ready_data_path = model_ready_data_paths[dataset_name]
model_output_data_path = model_output_data_paths[output_dataset_name]

# Load each CSV and keep only the rows for the asset type under analysis.
output_df = pd.read_csv(model_output_data_path).loc[
    lambda frame: frame["asset_type"] == investment_asset_type
]
core_df = pd.read_csv(model_ready_data_path).loc[
    lambda frame: frame["asset_type"] == investment_asset_type
]

In [9]:
# What do I want to see with tooling? What is the argument I want to make?

# The model outputs normalized returns for each area for each eval year.
# I want to pull out one target year, e.g. 2022.
# I want to sort the relative returns for that year.
# I want to compare several areas.
# I want to see the cumulative factors for each area; this will help me identify trends.

# Once this is done, write up a narrative for teammates.

In [16]:
# Rank the model output for one year, then narrow it to a single prefecture.
target_year = 2022
prefecture_code = 11  # other options: (Tokyo, 13), (Hokkaido, 1)

# All areas for the target year, highest predicted normalized return first.
results_df = (
    output_df.loc[lambda frame: frame["year"] == target_year]
    .sort_values("predicted_normalized_return", ascending=False)
)

# Integer-dividing the area code by 1000 leaves its leading digits, which are
# compared against the prefecture code (e.g. 11442 -> 11).
prefectural_results_df = results_df.loc[
    lambda frame: frame["area_code"].astype(int).floordiv(1000).eq(prefecture_code)
]

# (Display results_df instead to see every prefecture.)
# Explicit print: only the last expression of a cell is rendered automatically.
print(len(prefectural_results_df))
prefectural_results_df.head(10)

57


Unnamed: 0,year,area_code,asset_type,predicted_normalized_return,yearly_price_growth,unit_price_median_pct_chg,unit_price_median_pct_chg_normalized_yearly
2486,2022,11442,building,0.279503,1.072376,0.543488,0.761495
2594,2022,11361,building,0.273942,0.05,-0.283295,-0.589475
2367,2022,11327,building,0.261939,0.451154,0.740789,1.083886
2245,2022,11207,building,0.220698,-0.604839,-0.416377,-0.806932
46,2022,11341,building,0.197714,0.096491,0.061026,-0.026853
1640,2022,11346,building,0.07361,0.501375,1.014888,1.531766
370,2022,11245,building,0.072751,0.088235,0.106735,0.047837
2453,2022,11242,building,0.065982,-0.165928,0.181481,0.169973
676,2022,11229,building,0.065681,-0.066947,-0.030594,-0.17656
1743,2022,11326,building,0.05587,0.278333,0.226523,0.243572


In [17]:
# Ten lowest predicted normalized returns in the selected prefecture
# (the frame is sorted in descending order, so these are its last rows).
prefectural_results_df.iloc[-10:]

Unnamed: 0,year,area_code,asset_type,predicted_normalized_return,yearly_price_growth,unit_price_median_pct_chg,unit_price_median_pct_chg_normalized_yearly
1617,2022,11233,building,-0.202221,-0.066667,-0.026087,-0.169195
1908,2022,11211,building,-0.210324,-0.263957,-0.286858,-0.595297
2218,2022,11215,building,-0.212037,0.067883,0.039062,-0.06274
402,2022,11246,building,-0.222981,0.153073,0.05614,-0.034835
697,2022,11232,building,-0.234049,-0.088312,-0.094156,-0.28042
1372,2022,11301,building,-0.241644,0.105263,-0.230263,-0.502821
1860,2022,11238,building,-0.286753,0.08396,-0.019231,-0.157992
131,2022,11206,building,-0.315135,-0.002829,-0.029982,-0.17556
2176,2022,11342,building,-0.608337,-0.164396,-0.506313,-0.953888
188,2022,11349,building,-0.705051,-0.690909,-0.75507,-1.360359


In [119]:
# Compare a handful of areas: cumulative growth of the price metric first,
# then the cumulative series of each explanatory factor.
area_codes = [26209, 26206, 26201]
area_df = (
    core_df.loc[lambda frame: frame["area_code"].isin(area_codes)]
    .sort_values("year")
)

# Options shared by every chart in this cell.
shared_plot_options = dict(
    # visible="legendonly",
    width=1200,
    height=600,
    highlight=True,
)

# --- Price metric ---------------------------------------------------------
price_growth_column = f"cumulative_{metric}_growth"
# The helper receives a copy, so `area_df` itself is only changed by the
# column assignment below.
area_df[price_growth_column] = get_cumulative_growth_from_base(area_df.copy(), metric)

plot_time_series(
    area_df,
    price_growth_column,
    group_by_columns,
    granularity_columns,
    f"{price_growth_column} over time",
    highlight_range=(2020, 2022),
    **shared_plot_options,
)

# --- Explanatory factors --------------------------------------------------
# Factors passed to `get_cumulative_growth`.
growth_factors = [
    "taxable_income_growth",
    "total_tax_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
]

# Factors passed to `get_cumulative_growth_from_base`.
base_factors = ["taxpayer_count", "count"]

# Exact zeros become NaN in every factor column before accumulation.
# NOTE(review): presumably zeros stand for missing data here - confirm.
factor_columns = growth_factors + base_factors
area_df[factor_columns] = area_df[factor_columns].replace({0: np.nan})

# Build the cumulative columns, recording their names in plotting order
# (growth factors first, then base factors).
cumulative_factors = []

for factor in growth_factors:
    cumulative_column = f"cumulative_{factor}"
    area_df[cumulative_column] = get_cumulative_growth(area_df.copy(), factor)
    cumulative_factors.append(cumulative_column)

for factor in base_factors:
    cumulative_column = f"cumulative_{factor}_growth"
    area_df[cumulative_column] = get_cumulative_growth_from_base(area_df.copy(), factor)
    cumulative_factors.append(cumulative_column)

# One chart per cumulative factor. Note the highlighted window here
# (2015-2020) differs from the one used for the price chart (2020-2022).
for cumulative_factor in cumulative_factors:
    plot_time_series(
        area_df,
        cumulative_factor,
        group_by_columns,
        granularity_columns,
        f"{cumulative_factor} over time",
        highlight_range=(2015, 2020),
        **shared_plot_options,
    )



In [None]:
# use population as the error scaling parameter instead

# autocorrelation over time series
# cross time series lead lag for pairs

# Forward looking smoothing for prediction column
# backward looking smoothing for factor column

# time series of all transactions (TODO: finish this note)
