In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

from jre_utils.process import (
    get_most_active_municipalities
)

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
asset_type = "building"

metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric_key = "median"
metric = metrics[metric_key]
metric_pct_chg = metric + "_pct_chg"
upcoming_metric = "upcoming_" + metric
upcoming_metric_pct_chg = "upcoming_" + metric_pct_chg

In [3]:
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]
lfs_revenue_path = factor_data_paths["processed"]["lfs_revenue_breakdown"]["municipality"]

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df= pd.read_csv(lfs_revenue_path)


In [4]:
datasets = {name: pd.read_csv(path) for name, path in dataset_paths.items()}
for name, df in datasets.items():
    datasets[name] = (
        datasets[name]
        .merge(population_df, on=group_by_columns, how="left")
        .merge(migration_df, on=group_by_columns, how="left")
        .merge(taxable_income_df, on=group_by_columns, how="left")
        .merge(new_dwellings_df, on=group_by_columns, how="left")
        .merge(lfs_revenue_df, on=group_by_columns, how="left")
    )

    datasets[name]["migrations_is_available"] = (
        datasets[name]["net_migration_ratio"].notnull().astype(int)
    )
    datasets[name]["taxable_income_is_available"] = (
        datasets[name]["taxable_income"].notnull().astype(int)
    )
    datasets[name]["total_tax_is_available"] = (
        datasets[name]["total_tax"].notnull().astype(int)
    )

    # Might go back and undo the new dwellings filling for unknown municipalities.
    datasets[name]["dwellings_is_available"] = (
        datasets[name]["new_dwellings"].notnull().astype(int)
    )

    datasets[name] = datasets[name].fillna(0)
    datasets[name] = datasets["df"].sort_values(by=group_by_columns, ascending=True)
    datasets[name][metric_pct_chg] = (
        datasets["df"].groupby(granularity_columns)[metric].pct_change()
    )
    datasets[name][upcoming_metric_pct_chg] = datasets[name].groupby(granularity_columns)[metric_pct_chg].shift(-1)


KeyError: 'name'

In [44]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()

# set up target variables
df[upcoming_metric_pct_chg] = df.groupby(granularity_columns)[metric_pct_chg].shift(-1)

# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27728 entries, 18191 to 15439
Data columns (total 27 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   year                                27728 non-null  int64  
 1   area_code                           27728 non-null  int64  
 2   area                                27728 non-null  object 
 3   unit_price_wmean                    27728 non-null  float64
 4   unit_price_wmedian                  27728 non-null  float64
 5   unit_price_mean                     27728 non-null  float64
 6   unit_price_median                   27728 non-null  float64
 7   total_traded_area                   27728 non-null  float64
 8   count                               27728 non-null  float64
 9   population                          27728 non-null  float64
 10  net_migration_ratio                 27728 non-null  float64
 11  taxable_income                      27728 

In [46]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

# df = get_most_active_municipalities(df, n=1500)
# df = df[df["count"] > 5]

In [47]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["area_code", "area"]

X_basic_columns = [
    "year",
    "population",
    "count",
    "total_traded_area",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
    "total_tax",
    "migrations_is_available",
    "taxable_income_is_available",
    "total_tax_is_available",
    "dwellings_is_available",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    "total_tax_growth",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [48]:
lag = 1

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
total_tax_growth_ma3
total_tax_growth_cumu3
total_tax_growth_lag1
unit_price_median_pct_chg_ma3
unit_price_median_pct_chg_cumu3
unit_price_median_pct_chg_lag1


In [49]:
X_engineered_columns

['net_migration_ratio_ma3',
 'net_migration_ratio_cumu3',
 'net_migration_ratio_lag1',
 'taxable_income_growth_ma3',
 'taxable_income_growth_cumu3',
 'taxable_income_growth_lag1',
 'new_dwellings_ratio_ma3',
 'new_dwellings_ratio_cumu3',
 'new_dwellings_ratio_lag1',
 'total_tax_growth_ma3',
 'total_tax_growth_cumu3',
 'total_tax_growth_lag1',
 'unit_price_median_pct_chg_ma3',
 'unit_price_median_pct_chg_cumu3',
 'unit_price_median_pct_chg_lag1']

In [50]:
df

Unnamed: 0,area_code,area,year,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,total_tax_growth,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,total_tax_growth_ma3,total_tax_growth_cumu3,total_tax_growth_lag1,unit_price_median_pct_chg_ma3,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1
18140,23441,Aichi-ken Agui-cho,2008,0.0,14.0,2905.0,3549.241030,0.003820,3957657.0,0,1,1,0,0.000000,0.003993,0.000000,0.016734,0.798786,0.237847,1.798786,0.000000,1.000000,0.000000,0.003993,1.003993,0.000000,0.000000,1.000000,0.000000,0.016734,1.016734,0.000000,0.798786,1.798786,0.000000
18086,23441,Aichi-ken Agui-cho,2009,0.0,31.0,6100.0,3483.893426,-0.018412,3776007.0,0,1,1,0,0.000000,-0.015716,0.000000,-0.045898,0.237847,0.062112,1.237847,0.000000,1.000000,0.000000,-0.005861,0.988214,0.003993,0.000000,1.000000,0.000000,-0.014582,0.970067,0.016734,0.518317,2.226623,0.798786
18034,23441,Aichi-ken Agui-cho,2010,25695.0,41.0,8265.0,3213.665886,-0.077565,3646545.0,1,1,1,0,0.021522,-0.089172,0.000000,-0.034285,0.062112,0.000000,1.062112,0.007174,1.021522,0.000000,-0.033632,0.900093,-0.015716,0.000000,1.000000,0.000000,-0.021150,0.936808,-0.045898,0.366248,2.364922,0.237847
17982,23441,Aichi-ken Agui-cho,2011,26248.0,40.0,11000.0,3245.342254,0.009857,3801941.0,1,1,1,0,0.020763,0.023952,0.000000,0.042615,0.000000,0.026316,1.000000,0.014095,1.042732,0.021522,-0.026979,0.917987,-0.089172,0.000000,1.000000,0.000000,-0.012523,0.960654,-0.034285,0.099986,1.314732,0.062112
17929,23441,Aichi-ken Agui-cho,2012,26793.0,45.0,9585.0,3270.662717,0.007802,3842470.0,1,1,1,0,0.012503,0.032618,0.000000,0.010660,0.026316,-0.037695,1.026316,0.018263,1.055770,0.020763,-0.010867,0.963065,0.023952,0.000000,1.000000,0.000000,0.006330,1.017601,0.042615,0.029476,1.090062,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15560,19205,Yamanashi-ken Yamanashi-shi,2018,33789.0,14.0,7400.0,2832.607559,0.021537,3995097.0,1,1,1,1,-0.006363,0.028138,0.007798,0.008340,-0.439361,0.407036,0.560639,-0.004876,0.985442,-0.004947,0.011771,1.035509,-0.000603,0.009116,1.027596,0.009326,0.006241,1.018796,-0.001229,0.147548,1.007183,0.970149
15535,19205,Yamanashi-ken Yamanashi-shi,2019,33574.0,22.0,8785.0,2861.296683,0.010128,4083057.0,1,1,1,1,-0.004140,0.019458,0.008447,0.022017,0.407036,-0.628406,1.407036,-0.005150,0.984628,-0.006363,0.015664,1.047511,0.028138,0.008524,1.025789,0.007798,0.009709,1.029275,0.008340,0.312608,1.554131,-0.439361
15510,19205,Yamanashi-ken Yamanashi-shi,2020,33435.0,16.0,9255.0,2870.243213,0.003127,4119255.0,1,1,1,1,-0.000927,0.004950,0.008121,0.008865,-0.628406,0.628207,0.371594,-0.003810,0.988606,-0.004140,0.017515,1.053331,0.019458,0.008122,1.024564,0.008447,0.013074,1.039677,0.022017,-0.220244,0.293128,0.407036
15485,19205,Yamanashi-ken Yamanashi-shi,2021,33357.0,30.0,13315.0,2956.122862,0.029921,4065382.0,1,1,1,1,-0.002338,0.022114,0.011216,-0.013078,0.628207,0.083744,1.628207,-0.002469,0.992610,-0.000927,0.015507,1.047160,0.004950,0.009261,1.028039,0.008121,0.005935,1.017593,0.008865,0.135612,0.851301,-0.628406


In [51]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  24261
Filtered Size:  23775


Unnamed: 0,year,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,total_tax_growth,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,total_tax_growth_ma3,total_tax_growth_cumu3,total_tax_growth_lag1,unit_price_median_pct_chg_ma3,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1
count,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0,23775.0
mean,2014.930431,81419.53,74.392681,19751.101157,2690.665748,0.001995,12693980.0,0.987213,0.937119,0.997476,0.48387,-0.003735,0.002121,0.006455,0.003724,0.230019,0.149561,1.230019,-0.003881,0.989676,-0.003488,0.000949,1.004536,0.001896,0.006875,1.019255,0.006453,0.002641,1.006732,0.001607,0.242274,1.195414,0.237063
std,4.313935,202730.8,195.03403,39135.489253,882.162071,0.032451,39930310.0,0.112355,0.242754,0.050174,0.49975,0.007824,0.043424,0.008183,0.064709,1.747214,0.792869,1.747214,0.006816,0.018581,0.007308,0.028087,0.075366,0.043381,0.008166,0.023219,0.008184,0.038691,0.101676,0.060852,1.103677,1.56976,1.898766
min,2006.0,0.0,1.0,60.0,0.0,-0.694693,0.0,0.0,0.0,0.0,0.0,-0.313065,-0.682739,0.0,-0.677251,-0.989961,-0.87823,0.010039,-0.224323,0.461571,-0.203379,-0.682739,0.19941,-0.682739,0.0,1.0,0.0,-0.417418,0.297333,-0.677251,-0.958065,0.003056,-0.991268
25%,2011.0,10671.5,7.0,3280.0,2498.573185,-0.008284,1333330.0,1.0,1.0,1.0,0.0,-0.007382,-0.011612,0.0,-0.016724,-0.220629,-0.221964,0.779371,-0.007379,0.980046,-0.007106,-0.014373,0.964108,-0.011389,0.0,1.0,0.0,-0.011478,0.968346,-0.015917,-0.049768,0.729085,-0.197157
50%,2015.0,29079.0,22.0,8690.0,2739.682182,0.000984,3688276.0,1.0,1.0,1.0,0.0,-0.003544,0.0028,0.0,0.001463,-0.0073,-0.006847,0.9927,-0.003861,0.989802,-0.00305,0.003711,1.009193,0.002148,0.003959,1.010201,0.0,0.000943,1.001804,0.0,0.033039,0.967029,0.0
75%,2019.0,71822.0,66.0,20470.0,3044.601308,0.01217,10186400.0,1.0,1.0,1.0,1.0,0.0,0.019456,0.012246,0.018862,0.228974,0.232972,1.228974,-0.000271,0.999237,0.0,0.015727,1.045047,0.01894,0.012701,1.035692,0.012246,0.0123,1.033584,0.014985,0.202814,1.21641,0.194873
max,2022.0,3811873.0,4358.0,666150.0,12667.02,0.915431,867276500.0,1.0,1.0,1.0,1.0,0.144218,0.882113,0.12027,4.759148,85.124402,6.439015,86.124402,0.050664,1.15313,0.144218,0.367086,2.188546,0.882113,0.092722,1.303819,0.12027,1.58671,5.757765,4.759148,35.441026,82.490909,85.124402


In [52]:
filtered_df.to_csv(model_ready_data_paths[f"xgb_{dataset_key}_{metric_key}"], index=False)

In [33]:
area_code = 13102
test_df = filtered_df[filtered_df["area_code"] == area_code]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,unit_price_median_pct_chg,upcoming_unit_price_median_pct_chg
1937,1998,-0.099057,-0.041885
3340,1999,-0.041885,-0.180328
4744,2000,-0.180328,-0.08
6152,2001,-0.08,-0.094203
7562,2002,-0.094203,-0.04
8978,2003,-0.04,-0.025
10401,2004,-0.025,0.042735
11868,2005,0.042735,0.110656
15182,2007,0.239852,0.10119
16883,2008,0.10119,-0.140541


In [34]:
test_year = 2021
presentation_df = filtered_df[filtered_df["area_code"].astype(int) // 1000 == 13]
presentation_df = presentation_df[presentation_df["year"] == test_year]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = presentation_df[upcoming_metric_pct_chg]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = [upcoming_metric_pct_chg, "area_code"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

Unnamed: 0,area,count,multiplier,net_migration_ratio,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_ma3,new_dwellings_ratio,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_ma3,population,taxable_income_growth,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_ma3,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,total_tax_growth_cumu3,total_tax_growth_lag1,total_tax_growth_ma3,total_traded_area,unit_price_median_pct_chg,unit_price_median_pct_chg_cumu3,unit_price_median_pct_chg_lag1,unit_price_median_pct_chg_ma3,year,upcoming_return
0,Tokyo-to Ota-ku,44.0,1.004926,-0.003056,1.004302,0.000624,0.00144,0.017234,1.056461,0.018414,0.018477,745802.0,0.028964,1.084767,0.00968,0.02759,4445.706246,0.027352,78354599.0,-0.002647,1.042299,0.014051,0.013996,8822.0,0.004926,1.03764,0.007944,0.012429,2021,0.052941
1,Tokyo-to Ome-shi,22.0,1.004372,0.001204,1.000508,0.000554,0.00017,0.010802,1.033237,0.009244,0.01096,133696.0,0.00185,1.002637,-0.006842,0.000896,3211.912817,0.012785,19400365.0,-0.016218,0.977769,-0.000774,-0.007445,8937.0,0.004372,0.960795,-0.029178,-0.013148,2021,0.050054
2,Tokyo-to Suginami-ku,39.0,1.009934,-0.003036,1.004873,0.000986,0.00163,0.016826,1.047301,0.014842,0.015525,589319.0,0.023602,1.034645,0.001256,0.01146,4781.483851,0.018231,67412097.0,-0.001468,1.020151,0.01123,0.006689,10351.0,0.009934,1.077739,0.011725,0.025476,2021,0.032787
3,Tokyo-to Kita-ku,23.0,1.007233,0.000132,1.008312,0.002607,0.002765,0.018369,1.065525,0.023591,0.021384,355260.0,0.033838,1.102407,0.027999,0.03304,3896.760335,0.031869,31140267.0,0.008502,1.059483,0.021805,0.01948,8487.0,0.007233,1.114,0.027881,0.037038,2021,0.032316
4,Tokyo-to Toshima-ku,22.0,1.0,-0.004005,1.002777,0.000683,0.000933,0.017367,1.05266,0.017342,0.017254,300396.0,0.007164,1.092806,0.038492,0.030157,4658.391767,0.011596,34531351.0,-0.008449,1.044673,0.021897,0.014815,4102.0,0.0,1.077969,0.016183,0.025662,2021,0.031851
5,Tokyo-to Kokubunji-shi,11.0,0.992701,0.007914,1.029195,0.011475,0.009639,0.015096,1.040786,0.013148,0.013415,130273.0,0.023048,1.090576,0.011204,0.029483,4443.018749,0.007849,23566389.0,-0.012456,1.02221,-1e-05,0.007549,1671.0,-0.007299,0.996337,-0.007246,-0.001186,2021,0.029412
6,Tokyo-to Arakawa-ku,14.0,1.005137,0.001295,1.010223,0.002092,0.003399,0.022159,1.06124,0.019652,0.020011,217757.0,0.022875,1.09766,0.034157,0.031567,3878.342045,0.023083,18562761.0,0.006797,1.057473,0.023755,0.018838,3620.0,0.005137,1.099251,0.017422,0.032488,2021,0.028961
7,Tokyo-to Bunkyo-ku,18.0,1.0,0.00146,1.018711,0.002641,0.006216,0.017166,1.055352,0.020945,0.018123,240420.0,0.00913,1.063756,0.006374,0.020986,6241.258847,0.004865,36512105.0,0.005351,1.08455,0.04067,0.027546,3741.0,0.0,1.094595,0.021008,0.031027,2021,0.028807
8,Tokyo-to Edogawa-ku,41.0,1.004219,-0.004794,0.99193,-0.003543,-0.002695,0.012581,1.038768,0.012228,0.012759,694602.0,0.019488,1.063746,0.024909,0.020817,3781.857679,0.019496,56493352.0,-0.005515,1.031682,0.025207,0.010529,8321.0,0.004219,1.059347,0.023022,0.019466,2021,0.028011
9,Tokyo-to Kunitachi-shi,7.0,1.011236,0.004132,1.010743,0.002917,0.003568,0.013218,1.037429,0.016074,0.01233,77450.0,-0.00081,1.037835,0.03417,0.012573,4476.53235,-0.012001,15033745.0,-0.026964,0.997609,0.022382,-0.000591,1240.0,0.011236,1.028571,-0.002801,0.009478,2021,0.027778


In [None]:
# sort by median and weighted median to see the difference in ordering.
# Try to find the best metric to use