In [44]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    model_ready_data_paths,
    model_output_data_paths,
)

from jre_utils.visualize import plot_time_series
from jre_utils.process import get_cumulative_growth, get_cumulative_growth_from_base


# Notebook-wide display settings.
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [45]:
# Maps a human-readable metric key to the corresponding price column name.
# Each of the four base statistics exists in a raw and a "_smoothed" variant;
# the raw variants come first, then the smoothed ones, in the same order.
_price_statistics = {
    "weighted_mean": "wmean",
    "weighted_median": "wmedian",
    "mean": "mean",
    "median": "median",
}
metrics = {
    f"{key}{variant}": f"unit_price_{statistic}{variant}"
    for variant in ("", "_smoothed")
    for key, statistic in _price_statistics.items()
}

# Columns identifying one area, and one area-year observation respectively.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric selection: pick the raw statistic, then derive its smoothed twin.
metric_key_unsmoothed = "median"
metric_unsmoothed = metrics[metric_key_unsmoothed]

metric_key = f"{metric_key_unsmoothed}_smoothed"
metric = metrics[metric_key]

# Column names derived from the selected (smoothed) metric.
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"
normalized_metric_pct_chg = f"normalized_{metric}_pct_chg"

In [46]:
# Year settings. Within this cell only `eval_start_year` is consumed (it is
# part of the model-output dataset name); eval years are 2020, 2021 and 2022.
start_year, eval_start_year, eval_end_year = 2006, 2020, 2022

# Components of the dataset name; `metric_key` comes from the metric
# configuration cell.
dataset_key, years_ahead = "transactions", 2

# The names below are the lookup keys into the jre_utils path registries.
dataset_name = f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
output_dataset_name = f"{dataset_name}_{eval_start_year}"

model_ready_data_path = model_ready_data_paths[dataset_name]
model_output_data_path = model_output_data_paths[output_dataset_name]

# Load the model output and the model-ready input data as DataFrames.
output_df = pd.read_csv(model_output_data_path)
core_df = pd.read_csv(model_ready_data_path)

In [47]:
# What do I want to see with this tooling? What is the argument I want to make?

# The model outputs normalized returns for each area for each eval year.
# I want to pick one target year, e.g. 2022.
# I want to sort the relative returns for that year.
# I want to compare several areas.
# I want to see the cumulative factors for each area; this will help me identify trends.

# Once this is done, write a narrative for my teammates.

In [56]:
target_year = 2022
prefecture_code = 10  # other examples: Tokyo = 13, Hokkaido = 1

# Keep only the target year's rows and rank them from the highest to the
# lowest predicted normalized return.
is_target_year = output_df["year"] == target_year
results_df = output_df.loc[is_target_year].sort_values(
    by=["predicted_normalized_return"], ascending=False
)

# Integer-dividing the area code by 1000 strips its last three digits; the
# remainder is compared against the chosen prefecture code.
area_prefecture = results_df["area_code"].astype(int) // 1000
prefectural_results_df = results_df.loc[area_prefecture == prefecture_code]

# Number of areas in the prefecture, followed by the top of the ranking.
print(len(prefectural_results_df))
prefectural_results_df.head(20)

27


Unnamed: 0,year,area_code,predicted_normalized_return,unit_price_median_smoothed_pct_chg,normalized_unit_price_median_smoothed_pct_chg
753,2022,10525,2.970461,1.113216,3.809652
207,2022,10344,2.18285,0.57638,1.944931
769,2022,10209,1.241252,0.490647,1.647133
871,2022,10207,1.008328,0.142881,0.439152
605,2022,10206,0.647301,0.159415,0.496586
707,2022,10449,0.416388,0.111407,0.329826
1208,2022,10384,0.346386,0.087479,0.246714
1323,2022,10211,0.180039,0.003996,-0.04327
592,2022,10205,0.105639,-0.019911,-0.126312
18,2022,10202,0.036622,-0.057195,-0.25582


In [62]:
# Bottom of the ranking: the frame is sorted by predicted_normalized_return in
# descending order, so the last ten rows hold the lowest predicted returns.
prefectural_results_df.tail(10)

Unnamed: 0,year,area_code,predicted_normalized_return,unit_price_median_smoothed_pct_chg,normalized_unit_price_median_smoothed_pct_chg
706,2022,10345,-0.685054,-0.365424,-1.326466
1,2022,10212,-0.759265,-0.236664,-0.879212
338,2022,10424,-0.768067,-0.218538,-0.816249
1171,2022,10208,-0.821253,-0.178683,-0.677812
236,2022,10521,-0.82366,-0.243811,-0.904037
98,2022,10425,-0.872423,-0.084673,-0.351263
1067,2022,10203,-0.884829,-0.230304,-0.85712
68,2022,10421,-0.915542,-0.198235,-0.745726
807,2022,10464,-1.03472,-0.350219,-1.273649
124,2022,10382,-1.31126,-0.322114,-1.176026


In [63]:
# Areas to compare, restricted to their rows in the model-ready data and
# ordered chronologically.
area_codes = [10525, 10344, 10209, 10421, 10464, 10382]
in_selected_areas = core_df["area_code"].isin(area_codes)
area_df = core_df.loc[in_selected_areas].sort_values(by="year", ascending=True)

# Keyword arguments shared by every chart drawn in this cell.
# (visible="legendonly" was kept as a disabled option in the original calls.)
chart_kwargs = dict(width=1200, height=600, highlight=True)

# --- Cumulative growth of the selected price metric ---
price_growth_column = f"cumulative_{metric}_growth"
area_df[price_growth_column] = get_cumulative_growth_from_base(
    area_df.copy(), metric
)

plot_time_series(
    area_df,
    price_growth_column,
    group_by_columns,
    granularity_columns,
    f"{price_growth_column} over time",
    highlight_range=(2020, 2022),
    **chart_kwargs,
)

# --- Cumulative view of the explanatory factors ---
# Columns accumulated with get_cumulative_growth.
growth_factors = [
    "taxable_income_growth",
    "total_tax_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
]

# Columns accumulated with get_cumulative_growth_from_base.
base_factors = ["taxpayer_count"]

# Exact zeros are turned into NaN before accumulating.
# NOTE(review): presumably zeros stand for missing values here - confirm.
for factor_group in (growth_factors, base_factors):
    area_df[factor_group] = area_df[factor_group].replace({0: np.nan})

# Build each cumulative column and remember its name for plotting; the
# growth-factor columns come first, then the base-factor ones.
cumulative_factors = []

for factor in growth_factors:
    column = f"cumulative_{factor}"
    area_df[column] = get_cumulative_growth(area_df.copy(), factor)
    cumulative_factors.append(column)

for factor in base_factors:
    column = f"cumulative_{factor}_growth"
    area_df[column] = get_cumulative_growth_from_base(area_df.copy(), factor)
    cumulative_factors.append(column)

# One chart per cumulative factor, highlighting 2015-2020.
for cumulative_factor in cumulative_factors:
    plot_time_series(
        area_df,
        cumulative_factor,
        group_by_columns,
        granularity_columns,
        f"{cumulative_factor} over time",
        highlight_range=(2015, 2020),
        **chart_kwargs,
    )

