In [40]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

from jre_utils.process import (
    get_most_active_municipalities
)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [55]:
asset_type = "building"

metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric = metrics["median"]
metric_pct_chg = metric + "_pct_chg"
upcoming_metric = "upcoming_" + metric
upcoming_metric_pct_chg = "upcoming_" + metric_pct_chg

In [42]:
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]


In [43]:
df = pd.read_csv(core_path)

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
)

In [44]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()

# set up target variables
df[upcoming_metric_pct_chg] = df.groupby(granularity_columns)[metric_pct_chg].shift(-1)

# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27728 entries, 18191 to 15439
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   year                                27728 non-null  int64  
 1   area_code                           27728 non-null  int64  
 2   area                                27728 non-null  object 
 3   unit_price_wmean                    27728 non-null  float64
 4   unit_price_wmedian                  27728 non-null  float64
 5   unit_price_mean                     27728 non-null  float64
 6   unit_price_median                   27728 non-null  float64
 7   total_traded_area                   27728 non-null  float64
 8   count                               27728 non-null  float64
 9   population                          25740 non-null  float64
 10  net_migration_ratio                 25740 non-null  float64
 11  taxable_income                      24504 

In [46]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

df = get_most_active_municipalities(df, n=1500)
# df = df[df["count"] > 5]

In [47]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["area_code", "area"]

X_basic_columns = [
    "year",
    "population",
    "count",
    "total_traded_area",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [48]:
lag = 1

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
unit_price_median_pct_chg_ma3
unit_price_median_pct_chg_cumu3
unit_price_median_pct_chg_lag1


In [49]:
X_engineered_columns

['net_migration_ratio_ma3',
 'net_migration_ratio_cumu3',
 'net_migration_ratio_lag1',
 'taxable_income_growth_ma3',
 'taxable_income_growth_cumu3',
 'taxable_income_growth_lag1',
 'new_dwellings_ratio_ma3',
 'new_dwellings_ratio_cumu3',
 'new_dwellings_ratio_lag1',
 'unit_price_median_pct_chg_ma3',
 'unit_price_median_pct_chg_cumu3',
 'unit_price_median_pct_chg_lag1']

In [50]:
df

Unnamed: 0,area_code,area,year,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,unit_price_median_pct_chg_ma3,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1
18034,23441,Aichi-ken Agui-cho,2010,25695.0,41.0,8265.0,3213.665886,-0.077565,0.021522,-0.089172,0.000000,0.062112,0.000000,1.062112,0.021522,1.021522,0.000000,-0.089172,0.910828,0.000000,0.000000,1.000000,0.000000,0.062112,1.062112,0.000000
17982,23441,Aichi-ken Agui-cho,2011,26248.0,40.0,11000.0,3245.342254,0.009857,0.020763,0.023952,0.000000,0.000000,0.026316,1.000000,0.021143,1.042732,0.021522,-0.032610,0.932644,-0.089172,0.000000,1.000000,0.000000,0.031056,1.062112,0.062112
17929,23441,Aichi-ken Agui-cho,2012,26793.0,45.0,9585.0,3270.662717,0.007802,0.012503,0.032618,0.000000,0.026316,-0.037695,1.026316,0.018263,1.055770,0.020763,-0.010867,0.963065,0.023952,0.000000,1.000000,0.000000,0.029476,1.090062,0.000000
17877,23441,Aichi-ken Agui-cho,2013,27128.0,44.0,8920.0,3266.188636,-0.001368,0.014376,0.013382,0.000000,-0.037695,-0.004430,0.962305,0.015881,1.048385,0.012503,0.023317,1.071500,0.032618,0.000000,1.000000,0.000000,-0.003793,0.987629,0.026316
17824,23441,Aichi-ken Agui-cho,2014,27518.0,50.0,11620.0,3326.902623,0.018589,0.015953,0.031320,0.000000,-0.004430,-0.119682,0.995570,0.014278,1.043444,0.014376,0.025773,1.079210,0.013382,0.000000,1.000000,0.000000,-0.005270,0.983254,-0.037695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15585,19205,Yamanashi-ken Yamanashi-shi,2017,33957.0,15.0,5130.0,2772.886669,0.002362,-0.004947,-0.000603,0.009326,0.970149,-0.439361,1.970149,-0.004990,0.985102,-0.003317,0.002522,1.007565,0.007778,0.009868,1.029898,0.010223,0.175164,1.156021,-0.088145
15560,19205,Yamanashi-ken Yamanashi-shi,2018,33789.0,14.0,7400.0,2832.607559,0.021537,-0.006363,0.028138,0.007798,-0.439361,0.407036,0.560639,-0.004876,0.985442,-0.004947,0.011771,1.035509,-0.000603,0.009116,1.027596,0.009326,0.147548,1.007183,0.970149
15535,19205,Yamanashi-ken Yamanashi-shi,2019,33574.0,22.0,8785.0,2861.296683,0.010128,-0.004140,0.019458,0.008447,0.407036,-0.628406,1.407036,-0.005150,0.984628,-0.006363,0.015664,1.047511,0.028138,0.008524,1.025789,0.007798,0.312608,1.554131,-0.439361
15510,19205,Yamanashi-ken Yamanashi-shi,2020,33435.0,16.0,9255.0,2870.243213,0.003127,-0.000927,0.004950,0.008121,-0.628406,0.628207,0.371594,-0.003810,0.988606,-0.004140,0.017515,1.053331,0.019458,0.008122,1.024564,0.008447,-0.220244,0.293128,0.407036


In [51]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  20270
Filtered Size:  19864


Unnamed: 0,year,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,unit_price_median_pct_chg_ma3,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1
count,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0,19864.0
mean,2014.540425,89718.83,80.878574,21342.288814,2891.448711,0.001864,-0.003469,0.002374,0.007466,0.152912,0.099946,1.152912,-0.00366,0.990357,-0.003277,0.000299,1.003455,0.000516,0.007664,1.021386,0.007015,0.156239,1.122571,0.151476
std,4.056016,212744.7,204.030015,40406.805343,565.40079,0.031455,0.006911,0.042392,0.008315,1.097823,0.589221,1.097823,0.006235,0.017022,0.006602,0.027876,0.074157,0.041237,0.008343,0.023829,0.008351,0.681592,1.041585,1.170662
min,2006.0,1479.0,1.0,65.0,1989.133483,-0.694693,-0.203379,-0.682739,0.0,-0.989961,-0.821628,0.010039,-0.16034,0.590548,-0.156526,-0.682739,0.19941,-0.682739,0.0,1.0,0.0,-0.958065,0.013768,-0.991268
25%,2011.0,14184.75,9.0,4215.0,2563.370989,-0.008824,-0.007142,-0.012071,0.0,-0.202033,-0.198948,0.797967,-0.007204,0.980757,-0.006869,-0.01566,0.961352,-0.011699,0.0,1.0,0.0,-0.050768,0.748267,-0.183315
50%,2015.0,32994.5,25.0,9815.0,2786.840705,0.002292,-0.003494,0.005192,0.006218,-0.006145,-0.004437,0.993855,-0.003814,0.990034,-0.003004,0.003288,1.008007,0.001488,0.006648,1.01677,0.004392,0.027265,0.96829,0.0
75%,2018.0,78740.25,73.0,21931.25,3087.482184,0.012662,5.9e-05,0.020427,0.013245,0.20723,0.208791,1.20723,-0.000336,0.999092,0.0,0.015838,1.045605,0.017077,0.013699,1.03846,0.012892,0.163622,1.196598,0.177092
max,2021.0,3811873.0,4358.0,666150.0,12667.02,0.915431,0.043446,0.882113,0.12027,55.809857,4.115385,56.809857,0.038018,1.114415,0.043446,0.367086,2.188546,0.882113,0.092722,1.303819,0.12027,31.333333,33.238636,55.809857


In [52]:
filtered_df.to_csv(model_ready_data_paths[f"xgb_{dataset_key}"], index=False)

In [53]:
area_code = 13102
test_df = filtered_df[filtered_df["area_code"] == area_code]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg
3275,2006,-0.020641,0.394034
3219,2007,0.394034,-0.133005
3162,2008,-0.133005,-0.253898
3104,2009,-0.253898,-0.124004
3047,2010,-0.124004,-0.129796
2992,2011,-0.129796,0.060761
2935,2012,0.060761,0.064558
2876,2013,0.064558,0.112918
2818,2014,0.112918,0.197544
2761,2015,0.197544,0.262745


In [54]:
test_year = 2021
presentation_df = filtered_df[filtered_df["area_code"].astype(int) // 1000 == 13]
presentation_df = presentation_df[presentation_df["year"] == test_year]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = presentation_df[upcoming_metric_pct_chg]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = [upcoming_metric_pct_chg, "area_code"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

Unnamed: 0,area,count,multiplier,net_migration_ratio,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_ma3,new_dwellings_ratio,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_ma3,population,taxable_income_growth,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_ma3,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_traded_area,unit_price_median_pct_chg,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1,unit_price_median_pct_chg_ma3,year,upcoming_return
0,Tokyo-to Mizuho-machi,41.0,1.125,0.000378,0.994623,0.001259,-0.001789,0.0,1.0,0.0,0.0,31777.0,0.014835,1.018954,0.010155,0.006318,3153.717552,0.009117,11685.0,0.125,0.99,0.08642,0.00714,2021,0.444444
1,Tokyo-to Chuo-ku,80.0,0.877696,0.001523,1.031616,0.005651,0.010478,0.013507,1.055344,0.023048,0.018125,169437.0,0.059288,1.199215,0.021457,0.06302,7124.559729,0.041049,10815.0,-0.122304,0.976136,-0.011111,-0.002921,2021,0.434898
2,Tokyo-to Taito-ku,126.0,0.925,0.007016,1.032885,0.010457,0.010849,0.038018,1.103853,0.031256,0.033489,212938.0,0.027814,1.141904,0.051647,0.045301,4460.378159,0.009682,13840.0,-0.075,0.958636,-0.04,-0.011818,2021,0.320192
3,Tokyo-to Chiyoda-ku,48.0,0.95569,-0.000976,1.042108,0.014607,0.013913,0.017712,1.064089,0.026555,0.02093,66615.0,-0.006847,1.098012,-0.028845,0.034243,9851.789377,-0.02036,7520.0,-0.04431,1.092685,0.13069,0.032525,2021,0.295499
4,Tokyo-to Akishima-shi,89.0,1.0,0.006859,1.014638,0.003932,0.004857,0.015792,1.045022,0.018064,0.014792,114736.0,0.027309,1.069553,0.019449,0.022672,3458.239137,0.021327,14095.0,0.0,1.008333,0.012303,0.002794,2021,0.272727
5,Tokyo-to Sumida-ku,190.0,0.97734,0.003724,1.018306,0.004076,0.00607,0.029357,1.096477,0.033261,0.031178,273102.0,0.043297,1.136474,0.042262,0.043566,4050.592444,0.02933,24450.0,-0.02266,0.993333,-0.036115,-0.001443,2021,0.253468
6,Tokyo-to Higashiyamato-shi,99.0,1.055072,0.003409,1.005357,0.002849,0.001784,0.011779,1.032668,0.01323,0.010776,84188.0,0.007013,1.019671,0.002557,0.006519,3518.860246,0.004663,17435.0,0.055072,0.929952,-0.017094,-0.021761,2021,0.227381
7,Tokyo-to Tama-shi,92.0,0.880041,-0.001663,1.00164,0.000932,0.000548,0.01135,1.037714,0.013411,0.012417,146707.0,0.014449,1.043833,0.00971,0.01441,3767.066592,0.014367,16955.0,-0.119959,0.973263,0.091954,-0.005069,2021,0.225165
8,Tokyo-to Kita-ku,252.0,0.908209,0.000132,1.008312,0.002607,0.002765,0.018369,1.065525,0.023591,0.021384,355260.0,0.033838,1.102407,0.027999,0.03304,3896.760335,0.031869,25670.0,-0.091791,1.003571,0.109266,0.004543,2021,0.217873
9,Tokyo-to Bunkyo-ku,139.0,1.212958,0.00146,1.018711,0.002641,0.006216,0.017166,1.055352,0.020945,0.018123,240420.0,0.00913,1.063756,0.006374,0.020986,6241.258847,0.004865,16205.0,0.212958,1.132576,-0.097052,0.049999,2021,0.215217


In [None]:
# sort by median and weighted median to see the difference in ordering.
# Try to find the best metric to use