In [1]:
import warnings
import math

import numpy as np
import pandas as pd
import statsmodels.api as sm

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)
from jre_utils.process import get_most_active_municipalities
from jre_utils.visualize import plot_time_series


warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
asset_type = "building"
years_ahead = 2

metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
    "weighted_mean_smoothed": "unit_price_wmean_smoothed",
    "weighted_median_smoothed": "unit_price_wmedian_smoothed",
    "mean_smoothed": "unit_price_mean_smoothed",
    "median_smoothed": "unit_price_median_smoothed",
}

dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]

metric_key = "weighted_median_smoothed"
metric = metrics[metric_key]
metric_pct_chg = metric + "_pct_chg"
upcoming_metric = "upcoming_" + metric
upcoming_metric_pct_chg = "upcoming_" + metric_pct_chg


In [3]:
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]
lfs_revenue_path = factor_data_paths["processed"]["lfs_revenue_breakdown"]["municipality"]


In [4]:
def years_since_crisis(year):
    year_ranges = {
        (1960, 1973): 1960,
        (1973, 1990): 1973,
        (1990, 1997): 1990,
        (1997, 2008): 1997,
        (2008, 2019): 2008,
        (2019, math.inf): 2019,
    }

    for range_start, range_end in year_ranges:
        if range_start <= year < range_end:
            return year - year_ranges[(range_start, range_end)]

In [5]:
df = pd.read_csv(core_path)
df = get_most_active_municipalities(df, 1500)

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df= pd.read_csv(lfs_revenue_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
    .merge(lfs_revenue_df, on=group_by_columns, how="left")
)

df["migrations_is_available"] = df["net_migration_ratio"].notnull().astype(int)
df["taxable_income_is_available"] = df["taxable_income"].notnull().astype(int)
df["total_tax_is_available"] = df["total_tax"].notnull().astype(int)

# Might go back and undo the new dwellings filling for unknown municipalities.
df["dwellings_is_available"] = df["new_dwellings"].notnull().astype(int)

# Years since crisis (Yn - 2008) or (Yn - 2020)
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

df = df.fillna(0)
# df[df["area_code"] == "13103"]

In [6]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()
df[upcoming_metric_pct_chg] = df[metric_pct_chg].shift(-years_ahead)


# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 16845 to 14370
Data columns (total 32 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   year                                          25315 non-null  int64  
 1   area_code                                     25315 non-null  int64  
 2   area                                          25315 non-null  object 
 3   unit_price_wmean                              25315 non-null  float64
 4   unit_price_wmedian                            25315 non-null  float64
 5   unit_price_mean                               25315 non-null  float64
 6   unit_price_median                             25315 non-null  float64
 7   total_traded_area                             25315 non-null  float64
 8   count                                         25315 non-null  float64
 9   unit_price_wmean_smoothed                     25315 non-null  

In [8]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["area_code", "area"]

X_basic_columns = [
    "year",
    "years_since_crisis",
    "population",
    "count",
    "total_traded_area",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
    "total_tax",
    "migrations_is_available",
    "taxable_income_is_available",
    "total_tax_is_available",
    "dwellings_is_available",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    "total_tax_growth",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [9]:
lag = 3

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
net_migration_ratio_lag2
net_migration_ratio_lag3
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
taxable_income_growth_lag2
taxable_income_growth_lag3
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
new_dwellings_ratio_lag2
new_dwellings_ratio_lag3
total_tax_growth_ma3
total_tax_growth_cumu3
total_tax_growth_lag1
total_tax_growth_lag2
total_tax_growth_lag3
unit_price_wmedian_smoothed_pct_chg_ma3
unit_price_wmedian_smoothed_pct_chg_cumu3
unit_price_wmedian_smoothed_pct_chg_lag1
unit_price_wmedian_smoothed_pct_chg_lag2
unit_price_wmedian_smoothed_pct_chg_lag3


In [10]:
X_engineered_columns

['net_migration_ratio_ma3',
 'net_migration_ratio_cumu3',
 'net_migration_ratio_lag1',
 'net_migration_ratio_lag2',
 'net_migration_ratio_lag3',
 'taxable_income_growth_ma3',
 'taxable_income_growth_cumu3',
 'taxable_income_growth_lag1',
 'taxable_income_growth_lag2',
 'taxable_income_growth_lag3',
 'new_dwellings_ratio_ma3',
 'new_dwellings_ratio_cumu3',
 'new_dwellings_ratio_lag1',
 'new_dwellings_ratio_lag2',
 'new_dwellings_ratio_lag3',
 'total_tax_growth_ma3',
 'total_tax_growth_cumu3',
 'total_tax_growth_lag1',
 'total_tax_growth_lag2',
 'total_tax_growth_lag3',
 'unit_price_wmedian_smoothed_pct_chg_ma3',
 'unit_price_wmedian_smoothed_pct_chg_cumu3',
 'unit_price_wmedian_smoothed_pct_chg_lag1',
 'unit_price_wmedian_smoothed_pct_chg_lag2',
 'unit_price_wmedian_smoothed_pct_chg_lag3']

In [11]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  22314
Filtered Size:  21866


Unnamed: 0,year,years_since_crisis,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,total_tax_growth,unit_price_wmedian_smoothed_pct_chg,upcoming_unit_price_wmedian_smoothed_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_lag2,net_migration_ratio_lag3,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_lag2,taxable_income_growth_lag3,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_lag2,new_dwellings_ratio_lag3,total_tax_growth_ma3,total_tax_growth_cumu3,total_tax_growth_lag1,total_tax_growth_lag2,total_tax_growth_lag3,unit_price_wmedian_smoothed_pct_chg_ma3,unit_price_wmedian_smoothed_pct_chg_cumu3,unit_price_wmedian_smoothed_pct_chg_lag1,unit_price_wmedian_smoothed_pct_chg_lag2,unit_price_wmedian_smoothed_pct_chg_lag3
count,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0,21866.0
mean,2014.947636,4.288439,82361.31,77.068325,20351.772844,2689.792077,0.001465,12790290.0,0.92637,0.929342,0.936477,0.522043,-0.003231,0.002034,0.006986,0.001713,-0.003931,-0.003673,0.996069,-0.0035,0.990661,-0.003203,-0.003022,-0.00278,0.00111,1.004559,0.001965,0.000467,0.000148,0.007439,1.02083,0.00698,0.006561,0.006168,0.001979,1.00451,0.001472,0.001795,0.002292,-0.005656,1.002623,-0.004727,-0.005757,-0.006601
std,4.455369,3.205699,204363.9,197.825811,39519.473671,919.570406,0.031712,39954960.0,0.261174,0.256258,0.243907,0.499525,0.006809,0.041851,0.008303,0.060776,0.234637,0.155952,0.234637,0.006261,0.017255,0.00682,0.00657,0.006351,0.026592,0.073027,0.042011,0.040734,0.039887,0.008256,0.023491,0.008304,0.008307,0.008312,0.03786,0.099489,0.059893,0.05904,0.058311,0.198086,0.608649,0.215708,0.227701,0.219355
min,2006.0,0.0,0.0,1.0,65.0,0.0,-0.694693,0.0,0.0,0.0,0.0,0.0,-0.203379,-0.682739,0.0,-0.677251,-6.788158,-0.425624,-5.788158,-0.16034,0.590548,-0.203379,-0.156526,-0.123926,-0.682739,0.19941,-0.682739,-0.682739,-0.682739,0.0,1.0,0.0,0.0,0.0,-0.417418,0.297333,-0.677251,-0.677251,-0.677251,-6.788158,-20.523133,-6.788158,-6.788158,-6.788158
25%,2011.0,1.0,10902.0,8.0,3800.0,2511.769504,-0.007991,1356956.0,1.0,1.0,1.0,0.0,-0.006803,-0.010656,0.0,-0.015693,-0.09743,-0.094884,0.90257,-0.006924,0.981258,-0.006752,-0.006456,-0.006092,-0.013976,0.965181,-0.010663,-0.010109,-0.008736,0.0,1.0,0.0,0.0,0.0,-0.011734,0.967402,-0.015617,-0.013218,-0.010687,-0.078058,0.794874,-0.090598,-0.084026,-0.075563
50%,2015.0,4.0,29602.5,24.0,9137.5,2756.438086,0.00066,3732882.0,1.0,1.0,1.0,1.0,-0.002923,0.002733,0.004427,0.0,-0.013925,-0.011157,0.986075,-0.003532,0.99062,-0.002865,-0.0023,-0.001641,0.003892,1.009673,0.002686,0.0,0.0,0.006127,1.01583,0.004383,0.0,0.0,0.000131,1.0,0.0,0.0,0.0,-0.013226,0.953679,-0.00491,0.0,0.0
75%,2019.0,7.0,72958.75,69.0,21050.0,3062.696312,0.011344,10353360.0,1.0,1.0,1.0,1.0,0.0,0.018864,0.012832,0.015161,0.069332,0.071761,1.069332,-0.000125,0.999652,0.0,0.0,0.0,0.015665,1.044916,0.018777,0.015727,0.014456,0.013312,1.037449,0.012828,0.012403,0.012058,0.011404,1.031364,0.014785,0.013905,0.013006,0.04832,1.127499,0.057197,0.045939,0.036593
max,2023.0,10.0,3811873.0,4358.0,666150.0,12667.02,1.026488,846456000.0,1.0,1.0,1.0,1.0,0.043446,1.06604,0.12027,4.759148,14.172713,0.645045,15.172713,0.038018,1.114415,0.043446,0.043446,0.099253,0.366478,2.188546,1.06604,1.06604,1.06604,0.092722,1.303819,0.12027,0.12027,0.12027,1.58671,5.757765,4.759148,4.759148,4.759148,14.172713,52.081772,14.172713,14.172713,14.172713


In [12]:
filtered_df.to_csv(model_ready_data_paths[f"xgb_{dataset_key}_{metric_key}_{years_ahead}"], index=False)

In [13]:
area_code = 13102
test_df = filtered_df[filtered_df["area_code"] == area_code]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,unit_price_wmedian_smoothed_pct_chg,upcoming_unit_price_wmedian_smoothed_pct_chg
2030,2006,0.124996,0.024744
2031,2007,0.234751,-0.18247
2032,2008,0.024744,-0.163039
2033,2009,-0.18247,-0.113967
2034,2010,-0.163039,0.019283
2035,2011,-0.113967,0.027242
2036,2012,0.019283,0.120985
2037,2013,0.027242,0.100856
2038,2014,0.120985,0.163395
2039,2015,0.100856,0.169525


In [14]:
test_year = 2020
presentation_df = filtered_df[filtered_df["area_code"].astype(int) // 1000 == 13]
presentation_df = presentation_df[presentation_df["year"] == test_year]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = presentation_df[upcoming_metric_pct_chg]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = [upcoming_metric_pct_chg, "area_code"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

Unnamed: 0,area,count,dwellings_is_available,migrations_is_available,multiplier,net_migration_ratio,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_lag2,net_migration_ratio_lag3,net_migration_ratio_ma3,new_dwellings_ratio,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_lag2,new_dwellings_ratio_lag3,new_dwellings_ratio_ma3,population,taxable_income_growth,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_lag2,taxable_income_growth_lag3,taxable_income_growth_ma3,taxable_income_is_available,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,total_tax_growth_cumu3,total_tax_growth_lag1,total_tax_growth_lag2,total_tax_growth_lag3,total_tax_growth_ma3,total_tax_is_available,total_traded_area,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed_pct_chg_cumu3,unit_price_wmedian_smoothed_pct_chg_lag1,unit_price_wmedian_smoothed_pct_chg_lag2,unit_price_wmedian_smoothed_pct_chg_lag3,unit_price_wmedian_smoothed_pct_chg_ma3,year,years_since_crisis,upcoming_return
0,Tokyo-to Minato-ku,73.0,1,1,0.988556,-0.005551,1.008307,0.006153,0.007734,0.008911,0.002779,0.015563,1.073654,0.033005,0.023423,0.020929,0.023997,260486.0,-0.032463,1.105685,0.10635,0.03293,0.033991,0.035606,1,11631.584149,-0.044258,82850638.0,-0.038556,1.07958,0.099437,0.021316,-0.00108,0.027399,1,12845.0,-0.011444,1.086123,0.018747,0.078478,0.177739,0.028594,2020,1,0.442619
1,Tokyo-to Akiruno-shi,96.0,1,1,0.977978,-0.000416,1.004792,0.00225,0.002954,-0.00052,0.001596,0.01046,1.040067,0.011881,0.017215,0.015996,0.013185,79292.0,0.012907,1.028056,0.004807,0.0101,0.025264,0.009271,1,3195.260238,0.008573,10884230.0,0.005471,1.011073,0.008361,-0.002766,0.00874,0.003689,1,20440.0,-0.022022,0.884666,-0.000143,-0.095285,-0.124926,-0.03915,2020,1,0.329238
2,Tokyo-to Bunkyo-ku,134.0,1,1,1.021174,0.002641,1.029446,0.014546,0.012014,0.009442,0.009734,0.020945,1.051028,0.016256,0.012998,0.015185,0.016733,240069.0,0.006374,1.084229,0.047456,0.028551,0.063543,0.02746,1,6211.041633,-0.004183,36317776.0,0.04067,1.104599,0.036618,0.023936,0.027259,0.033742,1,16550.0,0.021174,1.056498,0.019797,0.014508,0.022589,0.018493,2020,1,0.301527
3,Tokyo-to Musashino-shi,109.0,1,1,0.987623,0.003064,1.013247,0.002035,0.0081,0.002766,0.0044,0.009621,1.039539,0.015687,0.013731,0.012887,0.013013,150149.0,0.027259,1.072916,0.012522,0.031528,0.022822,0.02377,1,5404.433989,0.015092,41823423.0,0.003416,1.04496,0.021257,0.019727,-0.00835,0.0148,1,16080.0,-0.012377,1.046947,0.019618,0.039672,0.023545,0.015638,2020,1,0.284839
4,Tokyo-to Higashikurume-shi,167.0,1,1,1.025203,0.004745,1.010135,0.002766,0.00259,0.000908,0.003367,0.011848,1.037082,0.012624,0.012161,0.017312,0.012211,115271.0,0.000657,1.047904,0.007785,0.039126,0.030597,0.015856,1,3609.52831,-0.005584,17258660.0,0.002599,1.027727,0.009301,0.015616,0.006895,0.009172,1,23360.0,0.025203,1.043184,0.008233,0.009229,0.040609,0.014222,2020,1,0.244565
5,Tokyo-to Chiyoda-ku,41.0,1,1,1.032263,0.014607,1.076727,0.028108,0.032212,0.017522,0.024976,0.026555,1.070092,0.018523,0.023453,0.035125,0.022844,66680.0,-0.028845,1.201759,0.138419,0.086993,0.061411,0.065522,1,10056.536152,-0.070201,20573851.0,-0.049652,1.078063,0.092478,0.038361,0.034839,0.027063,1,5665.0,0.032263,1.111822,0.031791,0.043886,0.037167,0.03598,2020,1,0.223484
6,Tokyo-to Kiyose-shi,123.0,1,1,1.062516,0.008884,1.015802,0.002829,0.004016,0.00744,0.005243,0.011618,1.044285,0.022237,0.009836,0.015672,0.014564,76208.0,0.035732,1.065855,0.001379,0.027666,0.033335,0.021592,1,3489.786668,0.023182,9906116.0,0.023653,1.042,0.008567,0.009277,0.011912,0.013832,1,15260.0,0.062516,0.979063,-0.023914,-0.055966,0.011258,-0.005788,2020,1,0.21939
7,Tokyo-to Mizuho-machi,33.0,0,1,0.97039,0.001259,0.988654,-0.007002,-0.005626,-0.003685,-0.00379,0.0,1.0,0.0,0.0,0.0,0.0,31765.0,0.010155,1.024885,-0.006034,0.020742,-0.016155,0.008287,1,3125.22634,0.021603,6739676.0,0.001405,1.025385,-0.007396,0.031576,-0.011838,0.008528,1,5645.0,-0.02961,0.85767,-0.046324,-0.073228,0.051705,-0.049721,2020,1,0.200586
8,Tokyo-to Shinagawa-ku,227.0,1,1,0.973605,0.009151,1.036949,0.013579,0.01378,0.008937,0.01217,0.021512,1.0627,0.017279,0.022651,0.024668,0.020481,422488.0,0.041911,1.176251,0.062792,0.062236,-0.003401,0.055646,1,5052.42683,0.016574,52996147.0,0.033609,1.139403,0.055724,0.044169,-0.009402,0.044501,1,24805.0,-0.026395,1.049654,-0.004422,0.082899,0.077984,0.017361,2020,1,0.188603
9,Tokyo-to Higashiyamato-shi,89.0,1,1,0.921485,0.002849,1.001321,-0.000905,-0.000619,-0.003238,0.000442,0.01323,1.049067,0.007318,0.027847,0.011181,0.016132,83901.0,0.002557,1.029729,0.009987,0.016947,-0.025248,0.00983,1,3502.529097,-0.003245,12830100.0,-0.001848,1.012511,0.004974,0.009365,-0.008921,0.004164,1,13670.0,-0.078515,0.872883,-0.056759,0.004258,0.05779,-0.043672,2020,1,0.163043


In [None]:
# sort by median and weighted median to see the difference in ordering.
# Try to find the best metric to use