In [109]:
import warnings
import math

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import statsmodels.api as sm

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)
from jre_utils.process import get_most_active_municipalities
from jre_utils.visualize import plot_time_series


warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [110]:
asset_type = "building"

metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric_key = "weighted_median"
metric = metrics[metric_key]
smoothed_metric = f"{metric}_smoothed"
metric_pct_chg = smoothed_metric + "_pct_chg"
upcoming_metric = "upcoming_" + smoothed_metric
upcoming_metric_pct_chg = "upcoming_" + metric_pct_chg


In [111]:
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]
lfs_revenue_path = factor_data_paths["processed"]["lfs_revenue_breakdown"]["municipality"]


In [112]:
def years_since_crisis(year):
    year_ranges = {
        (1960, 1973): 1960,
        (1973, 1990): 1973,
        (1990, 1997): 1990,
        (1997, 2008): 1997,
        (2008, 2019): 2008,
        (2019, math.inf): 2019,
    }

    for range_start, range_end in year_ranges:
        if range_start <= year < range_end:
            return year - year_ranges[(range_start, range_end)]
        
def get_loess_parameters(
    area_code, area_to_diff_std, min_diff_std, max_diff_std, min_frac=0.3, max_frac=0.6
):
    diff_score = (area_to_diff_std[area_code] - min_diff_std) / (
        max_diff_std - min_diff_std
    )
    frac = min_frac + diff_score * (max_frac - min_frac)
    frac = min(frac, 0.8)
    # frac = max(frac, 0.2)
    return (frac, 3, diff_score)

In [113]:
core_df = pd.read_csv(core_path)
core_df = get_most_active_municipalities(core_df, 1500)

# Smooth each area_code with loess smoothing
area_to_diff_std = {}
area_to_loess_params = {}

for area_code in core_df["area_code"].unique():
    area_df = core_df[core_df["area_code"] == area_code].sort_values(by="year", ascending=True)
    area_to_diff_std[area_code] = area_df[metric].pct_change().std()

area_to_diff_std_series = pd.Series(area_to_diff_std)
min_diff_std = area_to_diff_std_series.quantile(0.05)
max_diff_std = area_to_diff_std_series.quantile(0.95)

for area_code in core_df["area_code"].unique():
    area_to_loess_params[area_code] = get_loess_parameters(
        area_code, area_to_diff_std, min_diff_std, max_diff_std
    )

smoothed_df = pd.DataFrame()
for area_code in core_df["area_code"].unique():
    area_df = core_df[core_df["area_code"] == area_code].sort_values(by="year", ascending=True)
    frac, it, _ = area_to_loess_params[area_code]
    area_df[smoothed_metric] = sm.nonparametric.lowess(
        area_df[metric], area_df.index, frac=frac, it=it, return_sorted=False
    )
    smoothed_df = pd.concat([smoothed_df, area_df])


In [114]:
df = smoothed_df.copy()

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df= pd.read_csv(lfs_revenue_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
    .merge(lfs_revenue_df, on=group_by_columns, how="left")
)

df["migrations_is_available"] = df["net_migration_ratio"].notnull().astype(int)
df["taxable_income_is_available"] = df["taxable_income"].notnull().astype(int)
df["total_tax_is_available"] = df["total_tax"].notnull().astype(int)

# Might go back and undo the new dwellings filling for unknown municipalities.
df["dwellings_is_available"] = df["new_dwellings"].notnull().astype(int)

# Years since crisis (Yn - 2008) or (Yn - 2020)
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

df = df.fillna(0)

In [115]:
# df

In [116]:
area_df = df[df["area_code"].isin([13102])][granularity_columns + ["year", smoothed_metric, metric]]
area_df[metric_pct_chg+"_old"] = area_df[smoothed_metric].pct_change(periods=2)
area_df[upcoming_metric_pct_chg+"_old"] = area_df[metric_pct_chg+"_old"].shift(-2)

area_df[upcoming_metric] = area_df.groupby(granularity_columns)[smoothed_metric].shift(-2)
area_df[upcoming_metric_pct_chg] = area_df[upcoming_metric] / area_df[smoothed_metric] - 1

# area_df
smoothed_metric
plot_time_series(
    area_df,
    metric,
    group_by_columns,
    granularity_columns,
    f"Unit Price over time",
    # visible="legendonly",
    width=1000,
    height=400,
    showlegend=False,
)

plot_time_series(
    area_df,
    smoothed_metric,
    group_by_columns,
    granularity_columns,
    f"Smoothed Unit Price (Robust LOWESS) over time",
    # visible="legendonly",
    width=1000,
    height=400,
    showlegend=False,
)

In [117]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[smoothed_metric].pct_change()

# set up target variables
years_ahead = 2
df[metric_pct_chg] = df[smoothed_metric].pct_change(periods=years_ahead)
df[upcoming_metric_pct_chg] = df[metric_pct_chg].shift(-years_ahead)


# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 16845 to 14370
Data columns (total 29 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   year                                          25315 non-null  int64  
 1   area_code                                     25315 non-null  int64  
 2   area                                          25315 non-null  object 
 3   unit_price_wmean                              25315 non-null  float64
 4   unit_price_wmedian                            25315 non-null  float64
 5   unit_price_mean                               25315 non-null  float64
 6   unit_price_median                             25315 non-null  float64
 7   total_traded_area                             25315 non-null  float64
 8   count                                         25315 non-null  float64
 9   unit_price_wmedian_smoothed                   25315 non-null  

In [119]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

# df = get_most_active_municipalities(df, n=1500)
# df = df[df["count"] > 5]

In [120]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["area_code", "area"]

X_basic_columns = [
    "year",
    "years_since_crisis",
    "population",
    "count",
    "total_traded_area",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
    "total_tax",
    "migrations_is_available",
    "taxable_income_is_available",
    "total_tax_is_available",
    "dwellings_is_available",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    "total_tax_growth",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [122]:
lag = 3

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
net_migration_ratio_lag2
net_migration_ratio_lag3
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
taxable_income_growth_lag2
taxable_income_growth_lag3
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
new_dwellings_ratio_lag2
new_dwellings_ratio_lag3
total_tax_growth_ma3
total_tax_growth_cumu3
total_tax_growth_lag1
total_tax_growth_lag2
total_tax_growth_lag3
unit_price_wmedian_smoothed_pct_chg_ma3
unit_price_wmedian_smoothed_pct_chg_cumu3
unit_price_wmedian_smoothed_pct_chg_lag1
unit_price_wmedian_smoothed_pct_chg_lag2
unit_price_wmedian_smoothed_pct_chg_lag3


In [123]:
X_engineered_columns

['net_migration_ratio_ma3',
 'net_migration_ratio_cumu3',
 'net_migration_ratio_lag1',
 'net_migration_ratio_lag2',
 'net_migration_ratio_lag3',
 'taxable_income_growth_ma3',
 'taxable_income_growth_cumu3',
 'taxable_income_growth_lag1',
 'taxable_income_growth_lag2',
 'taxable_income_growth_lag3',
 'new_dwellings_ratio_ma3',
 'new_dwellings_ratio_cumu3',
 'new_dwellings_ratio_lag1',
 'new_dwellings_ratio_lag2',
 'new_dwellings_ratio_lag3',
 'total_tax_growth_ma3',
 'total_tax_growth_cumu3',
 'total_tax_growth_lag1',
 'total_tax_growth_lag2',
 'total_tax_growth_lag3',
 'unit_price_wmedian_smoothed_pct_chg_ma3',
 'unit_price_wmedian_smoothed_pct_chg_cumu3',
 'unit_price_wmedian_smoothed_pct_chg_lag1',
 'unit_price_wmedian_smoothed_pct_chg_lag2',
 'unit_price_wmedian_smoothed_pct_chg_lag3']

In [124]:
df

Unnamed: 0,area_code,area,year,years_since_crisis,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,total_tax_growth,unit_price_wmedian_smoothed_pct_chg,upcoming_unit_price_wmedian_smoothed_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,total_tax_growth_ma3,total_tax_growth_cumu3,total_tax_growth_lag1,unit_price_wmedian_smoothed_pct_chg_ma3,unit_price_wmedian_smoothed_pct_chg_cumu3,unit_price_wmedian_smoothed_pct_chg_lag1,net_migration_ratio_lag2,net_migration_ratio_lag3,taxable_income_growth_lag2,taxable_income_growth_lag3,new_dwellings_ratio_lag2,new_dwellings_ratio_lag3,total_tax_growth_lag2,total_tax_growth_lag3,unit_price_wmedian_smoothed_pct_chg_lag2,unit_price_wmedian_smoothed_pct_chg_lag3
16847,23441,Aichi-ken Agui-cho,2009,1,0.0,31.0,6100.0,3483.893426,-0.018412,3776007.0,0,1,1,0,0.000000,-0.015716,0.000000,-0.045898,1.210082,0.153279,2.210082,0.000000,1.000000,0.000000,-0.015716,0.984284,0.000000,0.000000,1.000000,0.000000,-0.045898,0.954102,0.000000,1.210082,2.210082,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16848,23441,Aichi-ken Agui-cho,2010,2,25695.0,41.0,8265.0,3213.665886,-0.077565,3646545.0,1,1,1,0,0.021522,-0.089172,0.000000,-0.034285,0.528666,0.033528,1.528666,0.010761,1.021522,0.000000,-0.052444,0.896514,-0.015716,0.000000,1.000000,0.000000,-0.040092,0.921390,-0.045898,0.869374,3.378476,1.210082,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
16849,23441,Aichi-ken Agui-cho,2011,3,26248.0,40.0,11000.0,3245.342254,0.009857,3801941.0,1,1,1,0,0.020763,0.023952,0.000000,0.042615,0.153279,-0.018164,1.153279,0.014095,1.042732,0.021522,-0.026979,0.917987,-0.089172,0.000000,1.000000,0.000000,-0.012523,0.960654,-0.034285,0.630675,3.896325,0.528666,0.000000,0.000000,-0.015716,0.000000,0.000000,0.000000,-0.045898,0.000000,1.210082,0.000000
16850,23441,Aichi-ken Agui-cho,2012,4,26793.0,45.0,9585.0,3270.662717,0.007802,3842470.0,1,1,1,0,0.012503,0.032618,0.000000,0.010660,0.033528,-0.117464,1.033528,0.018263,1.055770,0.020763,-0.010867,0.963065,0.023952,0.000000,1.000000,0.000000,0.006330,1.017601,0.042615,0.238491,1.822087,0.153279,0.021522,0.000000,-0.089172,-0.015716,0.000000,0.000000,-0.034285,-0.045898,0.528666,1.210082
16851,23441,Aichi-ken Agui-cho,2013,5,27128.0,44.0,8920.0,3266.188636,-0.001368,4013515.0,1,1,1,0,0.014376,0.013382,0.000000,0.044514,-0.018164,-0.290334,0.981836,0.015881,1.048385,0.012503,0.023317,1.071500,0.032618,0.000000,1.000000,0.000000,0.032596,1.100635,0.010660,0.056214,1.170295,0.033528,0.020763,0.021522,0.023952,-0.089172,0.000000,0.000000,0.042615,-0.034285,0.153279,0.528666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14364,19205,Yamanashi-ken Yamanashi-shi,2017,9,33957.0,15.0,5130.0,2772.886669,0.002362,3962053.0,1,1,1,1,-0.004947,-0.000603,0.009326,-0.001229,-0.129377,-0.075969,0.870623,-0.004990,0.985102,-0.003317,0.002522,1.007565,0.007778,0.009868,1.029898,0.010223,0.002353,1.007011,0.011612,-0.214282,0.480433,-0.281578,-0.006706,-0.003515,0.000392,0.008882,0.010056,0.007205,-0.003324,0.001940,-0.231891,0.105323
14365,19205,Yamanashi-ken Yamanashi-shi,2018,10,33789.0,14.0,7400.0,2832.607559,0.021537,3995097.0,1,1,1,1,-0.006363,0.028138,0.007798,0.008340,-0.027645,-0.207023,0.972355,-0.004876,0.985442,-0.004947,0.011771,1.035509,-0.000603,0.009116,1.027596,0.009326,0.006241,1.018796,-0.001229,-0.146200,0.608183,-0.129377,-0.003317,-0.006706,0.007778,0.000392,0.010223,0.010056,0.011612,-0.003324,-0.281578,-0.231891
14366,19205,Yamanashi-ken Yamanashi-shi,2019,0,33574.0,22.0,8785.0,2861.296683,0.010128,4083057.0,1,1,1,1,-0.004140,0.019458,0.008447,0.022017,-0.075969,-0.244296,0.924031,-0.005150,0.984628,-0.006363,0.015664,1.047511,0.028138,0.008524,1.025789,0.007798,0.009709,1.029275,0.008340,-0.077663,0.782243,-0.027645,-0.004947,-0.003317,-0.000603,0.007778,0.009326,0.010223,-0.001229,0.011612,-0.129377,-0.281578
14367,19205,Yamanashi-ken Yamanashi-shi,2020,1,33435.0,16.0,9255.0,2870.243213,0.003127,4119255.0,1,1,1,1,-0.000927,0.004950,0.008121,0.008865,-0.207023,0.003643,0.792977,-0.003810,0.988606,-0.004140,0.017515,1.053331,0.019458,0.008122,1.024564,0.008447,0.013074,1.039677,0.022017,-0.103546,0.712479,-0.075969,-0.006363,-0.004947,0.028138,-0.000603,0.007798,0.009326,0.008340,-0.001229,-0.027645,-0.129377


In [125]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  25311
Filtered Size:  24803


Unnamed: 0,year,years_since_crisis,population,count,total_traded_area,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,total_tax_growth,unit_price_wmedian_smoothed_pct_chg,upcoming_unit_price_wmedian_smoothed_pct_chg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,total_tax_growth_ma3,total_tax_growth_cumu3,total_tax_growth_lag1,unit_price_wmedian_smoothed_pct_chg_ma3,unit_price_wmedian_smoothed_pct_chg_cumu3,unit_price_wmedian_smoothed_pct_chg_lag1,net_migration_ratio_lag2,net_migration_ratio_lag3,taxable_income_growth_lag2,taxable_income_growth_lag3,new_dwellings_ratio_lag2,new_dwellings_ratio_lag3,total_tax_growth_lag2,total_tax_growth_lag3,unit_price_wmedian_smoothed_pct_chg_lag2,unit_price_wmedian_smoothed_pct_chg_lag3
count,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0,24803.0
mean,2014.777124,4.45055,81660.83,73.58251,19541.8401,2601.677994,0.001261,12724250.0,0.933919,0.898359,0.95053,0.494416,-0.003411,0.002318,0.006777,0.00734,0.121444,0.080632,1.121444,-0.003626,0.990177,-0.003365,0.002368,1.006248,0.002074,0.007306,1.020478,0.006726,0.010164,1.019356,0.007167,0.223192,-0.5807,0.13846,-0.003177,-0.00299,0.001927,0.000636,0.006673,0.006303,0.00551,0.00575,0.135393,0.13259
std,4.860824,3.315701,202908.2,191.29586,38302.526044,1024.85916,0.038207,39780730.0,0.248428,0.302182,0.216851,0.499979,0.007264,0.046781,0.008509,0.067665,18.146482,0.588725,18.146482,0.006561,0.018068,0.007208,0.035993,0.075099,0.046463,0.008502,0.023897,0.008505,0.047666,0.108739,0.067232,12.602249,742.002108,18.462418,0.006798,0.006518,0.046202,0.0449,0.008503,0.008495,0.064641,0.063789,18.463326,18.463196
min,2005.0,0.0,0.0,1.0,65.0,0.0,-0.694693,0.0,0.0,0.0,0.0,0.0,-0.313065,-0.682739,0.0,-0.677251,-2822.085994,-0.778576,-2821.085994,-0.224323,0.461571,-0.313065,-0.362134,0.19941,-0.682739,0.0,1.0,0.0,-0.350207,0.297333,-0.677251,-1395.911943,-88193.09237,-2822.085994,-0.203379,-0.156526,-0.682739,-0.682739,0.0,0.0,-0.677251,-0.677251,-2822.085994,-2822.085994
25%,2011.0,2.0,10953.0,8.0,3590.0,2485.405879,-0.007943,1398806.0,1.0,1.0,1.0,0.0,-0.007024,-0.009848,0.0,-0.013998,-0.167407,-0.165532,0.832593,-0.007089,0.980643,-0.006924,-0.011636,0.969036,-0.009783,0.0,1.0,0.0,-0.007209,0.978157,-0.013609,-0.135344,0.632537,-0.154796,-0.006674,-0.006379,-0.009516,-0.008922,0.0,0.0,-0.012937,-0.010799,-0.143581,-0.134161
50%,2015.0,4.0,29414.0,22.0,8730.0,2740.861976,0.0,3749085.0,1.0,1.0,1.0,0.0,-0.003117,0.001607,0.0,0.001478,-0.021494,-0.022066,0.978506,-0.00358,0.990322,-0.002982,0.004871,1.012805,0.001136,0.004857,1.013213,0.0,0.004303,1.011605,0.001007,-0.007282,0.953552,-0.006445,-0.002496,-0.00193,0.000536,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2019.0,7.0,72338.5,66.0,20155.0,3050.614269,0.010624,10339260.0,1.0,1.0,1.0,1.0,0.0,0.018401,0.012625,0.020629,0.144648,0.139985,1.144648,-0.000133,0.999624,0.0,0.015893,1.044498,0.017909,0.013157,1.037165,0.012565,0.017956,1.049579,0.020083,0.153081,1.440343,0.124009,0.0,0.0,0.0175,0.01484,0.012509,0.012116,0.016634,0.01553,0.103946,0.084956
max,2023.0,10.0,3832957.0,4358.0,666150.0,12667.02,3.549172,867276500.0,1.0,1.0,1.0,1.0,0.099253,3.608838,0.12027,4.759148,280.550806,5.861926,281.550806,0.046926,1.145081,0.043452,3.608838,4.608838,3.608838,0.092722,1.303819,0.12027,3.344525,5.757765,4.759148,189.092505,11708.721894,536.716329,0.099253,0.099253,3.608838,3.608838,0.12027,0.12027,4.759148,4.759148,536.716329,536.716329


In [126]:
filtered_df.to_csv(model_ready_data_paths[f"xgb_{dataset_key}_{metric_key}_{years_ahead}"], index=False)

In [127]:
area_code = 13102
test_df = filtered_df[filtered_df["area_code"] == area_code]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,unit_price_wmedian_smoothed_pct_chg,upcoming_unit_price_wmedian_smoothed_pct_chg
2029,2005,3.486388,0.38909
2030,2006,3.744622,0.265304
2031,2007,0.38909,-0.162241
2032,2008,0.265304,-0.315759
2033,2009,-0.162241,-0.258425
2034,2010,-0.315759,-0.096882
2035,2011,-0.258425,0.047051
2036,2012,-0.096882,0.151523
2037,2013,0.047051,0.234043
2038,2014,0.151523,0.280731


In [128]:
test_year = 2020
presentation_df = filtered_df[filtered_df["area_code"].astype(int) // 1000 == 13]
presentation_df = presentation_df[presentation_df["year"] == test_year]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = presentation_df[upcoming_metric_pct_chg]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = [upcoming_metric_pct_chg, "area_code"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

Unnamed: 0,area,count,dwellings_is_available,migrations_is_available,multiplier,net_migration_ratio,net_migration_ratio_cumu3,net_migration_ratio_lag1,net_migration_ratio_lag2,net_migration_ratio_lag3,net_migration_ratio_ma3,new_dwellings_ratio,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,new_dwellings_ratio_lag2,new_dwellings_ratio_lag3,new_dwellings_ratio_ma3,population,taxable_income_growth,taxable_income_growth_cumu3,taxable_income_growth_lag1,taxable_income_growth_lag2,taxable_income_growth_lag3,taxable_income_growth_ma3,taxable_income_is_available,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,total_tax_growth_cumu3,total_tax_growth_lag1,total_tax_growth_lag2,total_tax_growth_lag3,total_tax_growth_ma3,total_tax_is_available,total_traded_area,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed_pct_chg_cumu3,unit_price_wmedian_smoothed_pct_chg_lag1,unit_price_wmedian_smoothed_pct_chg_lag2,unit_price_wmedian_smoothed_pct_chg_lag3,unit_price_wmedian_smoothed_pct_chg_ma3,year,years_since_crisis,upcoming_return
0,Tokyo-to Minato-ku,73.0,1,1,1.072795,-0.005551,1.008307,0.006153,0.007734,0.008911,0.002779,0.015563,1.073654,0.033005,0.023423,0.020929,0.023997,260486.0,-0.032463,1.105685,0.10635,0.03293,0.033991,0.035606,1,11631.584149,-0.044258,82850638.0,-0.038556,1.07958,0.099437,0.021316,-0.00108,0.027399,1,12845.0,0.072795,1.416297,0.116752,0.182173,0.263356,0.123907,2020,1,0.623022
1,Tokyo-to Mizuho-machi,33.0,0,1,0.925437,0.001259,0.988654,-0.007002,-0.005626,-0.003685,-0.00379,0.0,1.0,0.0,0.0,0.0,0.0,31765.0,0.010155,1.024885,-0.006034,0.020742,-0.016155,0.008287,1,3125.22634,0.021603,6739676.0,0.001405,1.025385,-0.007396,0.031576,-0.011838,0.008528,1,5645.0,-0.074563,0.797237,-0.11616,-0.025309,0.078372,-0.072011,2020,1,0.49268
2,Tokyo-to Bunkyo-ku,134.0,1,1,1.04139,0.002641,1.029446,0.014546,0.012014,0.009442,0.009734,0.020945,1.051028,0.016256,0.012998,0.015185,0.016733,240069.0,0.006374,1.084229,0.047456,0.028551,0.063543,0.02746,1,6211.041633,-0.004183,36317776.0,0.04067,1.104599,0.036618,0.023936,0.027259,0.033742,1,16550.0,0.04139,1.117735,0.034592,0.037424,0.061846,0.037802,2020,1,0.42748
3,Tokyo-to Kiyose-shi,123.0,1,1,1.037107,0.008884,1.015802,0.002829,0.004016,0.00744,0.005243,0.011618,1.044285,0.022237,0.009836,0.015672,0.014564,76208.0,0.035732,1.065855,0.001379,0.027666,0.033335,0.021592,1,3489.786668,0.023182,9906116.0,0.023653,1.042,0.008567,0.009277,0.011912,0.013832,1,15260.0,0.037107,0.912322,-0.078542,-0.045339,0.068711,-0.028925,2020,1,0.375764
4,Tokyo-to Musashino-shi,109.0,1,1,1.006998,0.003064,1.013247,0.002035,0.0081,0.002766,0.0044,0.009621,1.039539,0.015687,0.013731,0.012887,0.013013,150149.0,0.027259,1.072916,0.012522,0.031528,0.022822,0.02377,1,5404.433989,0.015092,41823423.0,0.003416,1.04496,0.021257,0.019727,-0.00835,0.0148,1,16080.0,0.006998,1.135967,0.060068,0.064151,0.010525,0.043739,2020,1,0.30932
5,Tokyo-to Chuo-ku,66.0,1,1,0.905293,0.005651,1.054597,0.02426,0.023834,0.033346,0.017915,0.023048,1.088115,0.017821,0.044978,0.023577,0.028616,169179.0,0.021457,1.217758,0.108315,0.075667,0.082041,0.06848,1,6843.638156,-0.008373,32478391.0,0.018681,1.171959,0.080343,0.06491,0.048118,0.054644,1,8965.0,-0.094707,1.422269,0.159925,0.354449,0.36062,0.139889,2020,1,0.308429
6,Tokyo-to Higashikurume-shi,167.0,1,1,1.033644,0.004745,1.010135,0.002766,0.00259,0.000908,0.003367,0.011848,1.037082,0.012624,0.012161,0.017312,0.012211,115271.0,0.000657,1.047904,0.007785,0.039126,0.030597,0.015856,1,3609.52831,-0.005584,17258660.0,0.002599,1.027727,0.009301,0.015616,0.006895,0.009172,1,23360.0,0.033644,1.104586,0.017539,0.050213,0.068115,0.033799,2020,1,0.289606
7,Tokyo-to Akiruno-shi,96.0,1,1,0.977839,-0.000416,1.004792,0.00225,0.002954,-0.00052,0.001596,0.01046,1.040067,0.011881,0.017215,0.015996,0.013185,79292.0,0.012907,1.028056,0.004807,0.0101,0.025264,0.009271,1,3195.260238,0.008573,10884230.0,0.005471,1.011073,0.008361,-0.002766,0.00874,0.003689,1,20440.0,-0.022161,0.700284,-0.095414,-0.208307,-0.183157,-0.108627,2020,1,0.284947
8,Tokyo-to Chiyoda-ku,41.0,1,1,1.06508,0.014607,1.076727,0.028108,0.032212,0.017522,0.024976,0.026555,1.070092,0.018523,0.023453,0.035125,0.022844,66680.0,-0.028845,1.201759,0.138419,0.086993,0.061411,0.065522,1,10056.536152,-0.070201,20573851.0,-0.049652,1.078063,0.092478,0.038361,0.034839,0.027063,1,5665.0,0.06508,1.242021,0.077072,0.082684,0.119927,0.074945,2020,1,0.269361
9,Tokyo-to Toshima-ku,186.0,1,1,1.035983,0.000683,1.011669,0.006121,0.004827,0.003843,0.003877,0.017342,1.053658,0.017052,0.018332,0.017449,0.017575,301599.0,0.038492,1.135479,0.044815,0.046494,0.04395,0.043267,1,4604.990425,0.024874,34825599.0,0.021897,1.082665,0.030998,0.027612,0.022194,0.026836,1,20180.0,0.035983,1.144983,0.017246,0.086477,0.123226,0.046569,2020,1,0.259137


In [None]:
# sort by median and weighted median to see the difference in ordering.
# Try to find the best metric to use