In [1]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    model_ready_data_paths,
    model_output_data_paths,
)

from jre_utils.visualize import plot_time_series
from jre_utils.process import get_cumulative_growth, get_cumulative_growth_from_base


# Suppress all Python warnings so they do not clutter the cell outputs.
# NOTE(review): this is a blanket filter - it also hides warnings that could
# point at real problems (e.g. pandas deprecations); consider narrowing it.
warnings.filterwarnings("ignore")
# Remove the column cap so wide DataFrames are displayed with every column.
pd.set_option("display.max_columns", None)

In [2]:
# Asset class this notebook analyses; it is part of the dataset name below.
asset_type = "building"

# Human-readable metric key -> column name for the unsmoothed price metrics.
_base_metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

# Full lookup: the unsmoothed metrics first, then a "_smoothed" twin of each
# one (same order), obtained by suffixing both the key and the column name.
metrics = {
    **_base_metrics,
    **{
        f"{key}_smoothed": f"{column}_smoothed"
        for key, column in _base_metrics.items()
    },
}

# Columns identifying an area, and the per-area-per-year grouping key.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Pick the metric once; everything below is derived from this choice.
metric_key_unsmoothed = "median"
metric_unsmoothed = metrics[metric_key_unsmoothed]

metric_key = f"{metric_key_unsmoothed}_smoothed"
metric = metrics[metric_key]

# Names of the columns derived from the chosen (smoothed) metric.
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"
normalized_metric_pct_chg = f"normalized_{metric}_pct_chg"

In [3]:
# Year boundaries used by this analysis.
start_year = 2006
eval_start_year = 2020  # eval_years = [2020, 2021, 2022]
eval_end_year = 2022

# Components of the dataset identifier.
dataset_key = "transactions"
years_ahead = 2

# Identifier of the model-ready dataset, e.g. "sequence_<key>_<asset>_<metric>_<n>".
# Variant without the asset type, kept for reference:
#   f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
dataset_name = "_".join(
    ["sequence", dataset_key, asset_type, metric_key, str(years_ahead)]
)
# The model output is additionally keyed by the first evaluation year.
output_dataset_name = "_".join([dataset_name, str(eval_start_year)])

# Resolve both identifiers to file paths through the project lookup tables.
model_ready_data_path = model_ready_data_paths[dataset_name]
model_output_data_path = model_output_data_paths[output_dataset_name]

# output_df: model output; core_df: the model-ready input data.
output_df = pd.read_csv(model_output_data_path)
core_df = pd.read_csv(model_ready_data_path)

In [4]:
# What do I want the tooling to show? What argument do I want to make?

# The model outputs normalized returns for each area for each eval year.
# I want to pull out one target year, e.g. 2022.
# I want to sort the relative returns for that year.
# I want to compare several areas.
# I want to see the cumulative factors for each area; this will help me identify trends.

# Once this is done, write a narrative for the teammates.

In [5]:
# Evaluation year and prefecture to inspect.
target_year = 2022
prefecture_code = 26  # other examples: (Tokyo, 13), (Hokkaido, 1)

# Keep only the target year's rows, best predicted return first.
is_target_year = output_df["year"] == target_year
results_df = output_df.loc[is_target_year].sort_values(
    by=["predicted_normalized_return"],
    ascending=False,
)

# An area belongs to the prefecture when its area code, integer-divided by
# 1000, equals the prefecture code.
in_prefecture = results_df["area_code"].astype(int) // 1000 == prefecture_code
prefectural_results_df = results_df.loc[in_prefecture]

# Number of areas found for the prefecture, then its 10 top-ranked areas.
print(len(prefectural_results_df))
prefectural_results_df.head(10)

19


Unnamed: 0,year,area_code,predicted_normalized_return,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed_pct_chg_normalized_yearly
778,2022,26344,2.697197,0.899478,3.501919
579,2022,26407,0.844264,0.393327,1.488076
993,2022,26213,0.669491,0.19097,0.682949
733,2022,26209,0.515084,0.185804,0.662396
747,2022,26366,0.359659,0.05339,0.135554
876,2022,26210,0.242936,0.179051,0.635526
90,2022,26208,0.040632,-0.006962,-0.104572
880,2022,26100,-0.0288,0.020296,0.003879
1021,2022,26207,-0.044638,0.062,0.169808
559,2022,26206,-0.06044,-0.03932,-0.233317


In [6]:
# Last 10 rows of the prefecture's results. The frame is sorted by
# predicted_normalized_return in descending order, so these are the areas
# with the lowest predicted returns.
prefectural_results_df.tail(10)

Unnamed: 0,year,area_code,predicted_normalized_return,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed_pct_chg_normalized_yearly
559,2022,26206,-0.06044,-0.03932,-0.233317
830,2022,26204,-0.317391,-0.03132,-0.201488
691,2022,26212,-0.348565,-0.224156,-0.968729
954,2022,26211,-0.46157,-0.098752,-0.469781
171,2022,26201,-0.658193,-0.190131,-0.833355
735,2022,26203,-0.662315,-0.202208,-0.881405
1101,2022,26303,-0.695637,-0.270154,-1.151744
634,2022,26202,-0.868562,-0.331114,-1.394289
356,2022,26205,-1.040541,-0.288202,-1.223554
745,2022,26214,-1.041658,-0.328045,-1.382077


In [119]:
# Areas to compare side by side, with their rows ordered chronologically.
area_codes = [26209, 26206, 26201]
area_df = core_df.loc[core_df["area_code"].isin(area_codes)].sort_values(
    by="year", ascending=True
)

# Layout options shared by every chart in this cell.
plot_options = dict(
    # visible="legendonly",  # optional, currently disabled
    width=1200,
    height=600,
    highlight=True,
)

# --- Cumulative growth of the price metric -------------------------------
price_growth_column = f"cumulative_{metric}_growth"
area_df[price_growth_column] = get_cumulative_growth_from_base(
    area_df.copy(), metric
)

plot_time_series(
    area_df,
    price_growth_column,
    group_by_columns,
    granularity_columns,
    f"{price_growth_column} over time",
    highlight_range=(2020, 2022),
    **plot_options,
)

# --- Cumulative factors ---------------------------------------------------
# Factor columns accumulated with get_cumulative_growth.
growth_factors = [
    "taxable_income_growth",
    "total_tax_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
]

# Factor columns accumulated with get_cumulative_growth_from_base.
base_factors = [
    "taxpayer_count",
    "count",
]

# (source columns, output-name suffix, accumulation helper) per factor family.
cumulative_specs = [
    (growth_factors, "", get_cumulative_growth),
    (base_factors, "_growth", get_cumulative_growth_from_base),
]

# Output column names, growth factors first and then base factors.
cumulative_factors = [
    f"cumulative_{name}{suffix}"
    for names, suffix, _ in cumulative_specs
    for name in names
]

# Turn exact zeros in the factor columns into NaN before accumulating.
factor_columns = growth_factors + base_factors
area_df[factor_columns] = area_df[factor_columns].replace({0: np.nan})

# Compute every cumulative column first; each helper call gets its own copy
# of the frame.
for names, suffix, accumulate in cumulative_specs:
    for name in names:
        area_df[f"cumulative_{name}{suffix}"] = accumulate(area_df.copy(), name)

# One chart per cumulative factor.
for cumulative_factor in cumulative_factors:
    plot_time_series(
        area_df,
        cumulative_factor,
        group_by_columns,
        granularity_columns,
        f"{cumulative_factor} over time",
        highlight_range=(2015, 2020),
        **plot_options,
    )



In [None]:
# Use population as the error scaling parameter instead

# Autocorrelation over time series
# Cross time series lead-lag for pairs

# Forward-looking smoothing for prediction column
# Backward-looking smoothing for factor column

# Time series of all transactions (original note was cut off at "transa" - TODO: complete)
