In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from jre_utils.datapath import factor_data_paths, get_derived_csv_path, model_ready_data_paths
from jre_utils.config import asset_types, statistics, area_levels, period_cols
from jre_utils.process import get_most_active_municipalities

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
period = "yearly"
area_level = "municipality"
asset_type = "building"
statistic = "median"

granularity_columns = area_levels[area_level]["columns"]
group_by_columns = granularity_columns + [period_cols[period]]
display_columns = list(statistics[statistic].keys())

metric = asset_types[asset_type]["metric"]
metric_pct_chg = asset_types[asset_type]["metric_pct_chg"]
upcoming_metric_pct_chg = "Upcoming" + metric_pct_chg

core_path = get_derived_csv_path(period, area_level, asset_type, statistic)
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]

In [3]:
df = pd.read_csv(core_path)

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
)

In [4]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()

# set up target variables
df[upcoming_metric_pct_chg] = df.groupby(granularity_columns)[metric_pct_chg].shift(-1)

# time box
start_year = 2005
end_year = 2023

df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30339 entries, 19363 to 17393
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Prefecture                          30339 non-null  object 
 1   Municipality                        30339 non-null  object 
 2   year                                30339 non-null  int64  
 3   TradePricePerArea                   30339 non-null  float64
 4   UnitPrice                           0 non-null      float64
 5   Count                               30339 non-null  int64  
 6   population                          24235 non-null  float64
 7   net_migration_ratio                 24235 non-null  float64
 8   taxable_income                      23202 non-null  float64
 9   taxpayer_count                      23202 non-null  float64
 10  taxable_income_per_taxpayer         23202 non-null  float64
 11  taxable_income_growth               23202 

In [6]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

# df = get_most_active_municipalities(df, n=800)
# df = df[df["Count"] >= 10]

In [7]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["Prefecture", "Municipality"]

X_basic_columns = [
    "year",
    "population",
    "Count",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [8]:
lag = 1

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
TradePricePctChg_ma3
TradePricePctChg_cumu3
TradePricePctChg_lag1


In [9]:
df

Unnamed: 0,Prefecture,Municipality,year,population,Count,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,TradePricePctChg,UpcomingTradePricePctChg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,TradePricePctChg_ma3,TradePricePctChg_cumu3,TradePricePctChg_lag1
19383,Aichi,Aisai,2010,61910.0,51,2982.171416,-0.066436,0.001454,-0.095355,0.014016,-0.282297,0.000000,0.717703,0.001454,1.001454,0.000000,-0.095355,0.904645,0.000000,0.014016,1.014016,0.000000,-0.282297,0.717703,0.000000
19384,Aichi,Aisai,2011,62000.0,43,2977.500230,-0.001566,-0.003113,-0.011972,0.012682,0.000000,0.100000,1.000000,-0.000830,0.998336,0.001454,-0.053663,0.893815,-0.095355,0.013349,1.026876,0.014016,-0.141148,0.717703,-0.282297
19385,Aichi,Aisai,2012,61807.0,42,2968.945417,-0.002873,0.000712,0.001476,0.014550,0.100000,-0.079637,1.100000,-0.000316,0.999047,-0.003113,-0.035284,0.895134,-0.011972,0.013749,1.041817,0.012682,-0.060766,0.789474,0.000000
19386,Aichi,Aisai,2013,61851.0,36,2960.406212,-0.002876,-0.002280,0.000891,0.011411,-0.079637,-0.163846,0.920363,-0.001560,0.995323,0.000712,-0.003202,0.990368,0.001476,0.012881,1.039141,0.014550,0.006788,1.012399,0.100000
19387,Aichi,Aisai,2014,61710.0,46,2996.514481,0.012197,-0.001912,0.004293,0.010975,-0.163846,0.004109,0.836154,-0.001160,0.996521,-0.002280,0.002220,1.006671,0.000891,0.012312,1.037389,0.011411,-0.047828,0.846522,-0.079637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17387,Yamanashi,Yamanashi,2017,33957.0,15,2772.886669,0.002362,-0.004947,-0.000603,0.009326,0.970149,-0.439361,1.970149,-0.004990,0.985102,-0.003317,0.002522,1.007565,0.007778,0.009868,1.029898,0.010223,0.175164,1.156021,-0.088145
17388,Yamanashi,Yamanashi,2018,33789.0,14,2832.607559,0.021537,-0.006363,0.028138,0.007798,-0.439361,0.407036,0.560639,-0.004876,0.985442,-0.004947,0.011771,1.035509,-0.000603,0.009116,1.027596,0.009326,0.147548,1.007183,0.970149
17389,Yamanashi,Yamanashi,2019,33574.0,22,2861.296683,0.010128,-0.004140,0.019458,0.008447,0.407036,-0.628406,1.407036,-0.005150,0.984628,-0.006363,0.015664,1.047511,0.028138,0.008524,1.025789,0.007798,0.312608,1.554131,-0.439361
17390,Yamanashi,Yamanashi,2020,33435.0,16,2870.243213,0.003127,-0.000927,0.004950,0.008121,-0.628406,0.628207,0.371594,-0.003810,0.988606,-0.004140,0.017515,1.053331,0.019458,0.008122,1.024564,0.008447,-0.220244,0.293128,0.407036


In [10]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  10701
Filtered Size:  10487


Unnamed: 0,year,population,Count,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,TradePricePctChg,UpcomingTradePricePctChg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,TradePricePctChg_ma3,TradePricePctChg_cumu3,TradePricePctChg_lag1
count,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0
mean,2014.462001,118085.3,103.806618,3022.698681,0.001225,-0.002433,0.003387,0.013322,0.036578,0.025494,1.036578,-0.002612,0.993211,-0.002299,0.001617,1.006916,0.001669,0.013696,1.038157,0.012513,0.038656,1.025911,0.035848
std,4.075304,137623.2,118.645472,668.935295,0.022551,0.005506,0.033416,0.006797,0.473326,0.287359,0.473326,0.005271,0.014421,0.005398,0.024317,0.065955,0.032262,0.006509,0.019467,0.007483,0.418574,0.665836,0.468031
min,2006.0,2961.0,1.0,2053.240341,-0.294513,-0.058973,-0.398456,0.0,-0.886667,-0.614716,0.113333,-0.027463,0.920712,-0.058973,-0.201696,0.483597,-0.398456,0.000762,1.00098,0.0,-0.886667,0.02933,-0.973874
25%,2011.0,41394.5,29.0,2647.120051,-0.007598,-0.005702,-0.008249,0.00886,-0.138136,-0.134442,0.861864,-0.005823,0.984274,-0.005501,-0.012679,0.968971,-0.008042,0.009287,1.024581,0.00794,-0.049417,0.81727,-0.125091
50%,2015.0,68603.0,62.0,2893.738632,0.002385,-0.002634,0.006771,0.012416,-0.005405,-0.003605,0.994595,-0.00288,0.992599,-0.002177,0.005167,1.013121,0.003411,0.012912,1.036014,0.012021,0.008819,0.973214,0.0
75%,2018.0,133438.5,126.0,3218.537565,0.011454,0.000399,0.019931,0.016489,0.128411,0.128733,1.128411,6.4e-05,1.000151,7.7e-05,0.016362,1.047582,0.017098,0.016897,1.048347,0.016294,0.076782,1.133027,0.111764
max,2021.0,1540632.0,886.0,12667.02,0.409227,0.043446,0.440856,0.12027,31.333333,1.45925,32.333333,0.038018,1.105401,0.043446,0.221486,1.809814,0.440856,0.091613,1.299733,0.095865,31.333333,32.666667,31.333333


In [11]:
filtered_df.to_csv(model_ready_data_paths["basic"], index=False)

In [39]:
test_prefecture, test_municipality = "Tokyo", "Minato"
test_df = filtered_df[(filtered_df["Prefecture"] == test_prefecture) & (filtered_df["Municipality"] == test_municipality)]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,TradePricePctChg,UpcomingTradePricePctChg
3076,2006,0.467742,0.027027
3077,2007,0.027027,0.010341
3078,2008,0.010341,-0.413008
3079,2009,-0.413008,-0.071642
3080,2010,-0.071642,0.052466
3081,2011,0.052466,-0.041267
3082,2012,-0.041267,0.09541
3083,2013,0.09541,0.053292
3084,2014,0.053292,0.16
3085,2015,0.16,0.115481


In [62]:
test_year = 2021
test_prefecture = "Tokyo"
presentation_df = df[(df["Prefecture"] == test_prefecture) & (df["year"] == test_year)]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = tokyo_df["UpcomingTradePricePctChg"]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = ["UpcomingTradePricePctChg", "year", "Prefecture"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

Unnamed: 0,Count,Municipality,TradePricePctChg,TradePricePctChg_cumu3,TradePricePctChg_lag1,TradePricePctChg_ma3,multiplier,net_migration_ratio,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_ma3,new_dwellings_ratio,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_ma3,population,taxable_income_growth,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_ma3,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,upcoming_return
0,80,Chuo,-0.122304,0.976136,-0.011111,-0.002921,0.877696,0.001523,1.031616,0.005651,0.010478,0.013507,1.055344,0.023048,0.018125,169437.0,0.059288,1.199215,0.021457,0.06302,7124.559729,0.041049,
1,126,Taito,-0.075,0.958636,-0.04,-0.011818,0.925,0.007016,1.032885,0.010457,0.010849,0.038018,1.103853,0.031256,0.033489,212938.0,0.027814,1.141904,0.051647,0.045301,4460.378159,0.009682,
2,48,Chiyoda,-0.04431,1.092685,0.13069,0.032525,0.95569,-0.000976,1.042108,0.014607,0.013913,0.017712,1.064089,0.026555,0.02093,66615.0,-0.006847,1.098012,-0.028845,0.034243,9851.789377,-0.02036,
3,89,Akishima,0.0,1.008333,0.012303,0.002794,1.0,0.006859,1.014638,0.003932,0.004857,0.015792,1.045022,0.018064,0.014792,114736.0,0.027309,1.069553,0.019449,0.022672,3458.239137,0.021327,
4,190,Sumida,-0.02266,0.993333,-0.036115,-0.001443,0.97734,0.003724,1.018306,0.004076,0.00607,0.029357,1.096477,0.033261,0.031178,273102.0,0.043297,1.136474,0.042262,0.043566,4050.592444,0.02933,
5,99,Higashiyamato,0.055072,0.929952,-0.017094,-0.021761,1.055072,0.003409,1.005357,0.002849,0.001784,0.011779,1.032668,0.01323,0.010776,84188.0,0.007013,1.019671,0.002557,0.006519,3518.860246,0.004663,
6,92,Tama,-0.119959,0.973263,0.091954,-0.005069,0.880041,-0.001663,1.00164,0.000932,0.000548,0.01135,1.037714,0.013411,0.012417,146707.0,0.014449,1.043833,0.00971,0.01441,3767.066592,0.014367,
7,252,Kita,-0.091791,1.003571,0.109266,0.004543,0.908209,0.000132,1.008312,0.002607,0.002765,0.018369,1.065525,0.023591,0.021384,355260.0,0.033838,1.102407,0.027999,0.03304,3896.760335,0.031869,
8,139,Bunkyo,0.212958,1.132576,-0.097052,0.049999,1.212958,0.00146,1.018711,0.002641,0.006216,0.017166,1.055352,0.020945,0.018123,240420.0,0.00913,1.063756,0.006374,0.020986,6241.258847,0.004865,
9,73,Komae,-0.041667,1.00079,0.090909,0.002173,0.958333,0.000165,1.009325,0.000849,0.003106,0.009401,1.034383,0.011445,0.011333,84786.0,0.038202,1.082165,-0.011307,0.027054,4051.712908,0.033988,
