In [32]:
import warnings
import math

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import statsmodels.api as sm

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)
from jre_utils.process import get_most_active_municipalities
from jre_utils.visualize import plot_time_series


warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [40]:
asset_type = "building"
years_ahead = 2

metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric_key = "weighted_median"
metric = metrics[metric_key]
smoothed_metric = f"{metric}_smoothed"
metric_pct_chg = smoothed_metric + "_pct_chg"
upcoming_metric = "upcoming_" + smoothed_metric
upcoming_metric_pct_chg = "upcoming_" + metric_pct_chg


In [34]:
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]
lfs_revenue_path = factor_data_paths["processed"]["lfs_revenue_breakdown"]["municipality"]


In [35]:
def years_since_crisis(year):
    year_ranges = {
        (1960, 1973): 1960,
        (1973, 1990): 1973,
        (1990, 1997): 1990,
        (1997, 2008): 1997,
        (2008, 2019): 2008,
        (2019, math.inf): 2019,
    }

    for range_start, range_end in year_ranges:
        if range_start <= year < range_end:
            return year - year_ranges[(range_start, range_end)]
        
def get_loess_parameters(
    area_code, area_to_diff_std, min_diff_std, max_diff_std, min_frac=0.3, max_frac=0.6
):
    diff_score = (area_to_diff_std[area_code] - min_diff_std) / (
        max_diff_std - min_diff_std
    )
    frac = min_frac + diff_score * (max_frac - min_frac)
    frac = min(frac, 0.8)
    # frac = max(frac, 0.2)
    return (frac, 3, diff_score)

In [36]:
core_df = pd.read_csv(core_path)
core_df = get_most_active_municipalities(core_df, 1500)

# Smooth each area_code with loess smoothing
area_to_diff_std = {}
area_to_loess_params = {}

for area_code in core_df["area_code"].unique():
    area_df = core_df[core_df["area_code"] == area_code].sort_values(by="year", ascending=True)
    area_to_diff_std[area_code] = area_df[metric].pct_change().std()

area_to_diff_std_series = pd.Series(area_to_diff_std)
min_diff_std = area_to_diff_std_series.quantile(0.05)
max_diff_std = area_to_diff_std_series.quantile(0.95)

for area_code in core_df["area_code"].unique():
    area_to_loess_params[area_code] = get_loess_parameters(
        area_code, area_to_diff_std, min_diff_std, max_diff_std
    )

smoothed_df = pd.DataFrame()
for area_code in core_df["area_code"].unique():
    area_df = core_df[core_df["area_code"] == area_code].sort_values(by="year", ascending=True)
    frac, it, _ = area_to_loess_params[area_code]
    area_df[smoothed_metric] = sm.nonparametric.lowess(
        area_df[metric], area_df.index, frac=frac, it=it, return_sorted=False
    )
    smoothed_df = pd.concat([smoothed_df, area_df])


In [41]:
df = smoothed_df.copy()

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df= pd.read_csv(lfs_revenue_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
    .merge(lfs_revenue_df, on=group_by_columns, how="left")
)

In [42]:
df["migrations_is_available"] = df["net_migration_ratio"].notnull().astype(int)
df["taxable_income_is_available"] = df["taxable_income"].notnull().astype(int)
df["total_tax_is_available"] = df["total_tax"].notnull().astype(int)

# Might go back and undo the new dwellings filling for unknown municipalities.
df["dwellings_is_available"] = df["new_dwellings"].notnull().astype(int)

# Years since crisis (Yn - 2008) or (Yn - 2020)
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

df = df.fillna(0)

In [43]:
df

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,unit_price_wmedian_smoothed,population,net_migration_ratio,taxable_income,taxpayer_count,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,new_dwellings,existing_dwellings,new_dwellings_ratio,total_tax,total_tax_growth,migrations_is_available,taxable_income_is_available,total_tax_is_available,dwellings_is_available,years_since_crisis
0,2005,40100,Fukuoka-ken Kitakyushu-shi,183815.028902,186861.207898,194445.084485,202976.190476,865.0,4.0,84275.746045,983758.0,-0.006164,1.167200e+09,367645.0,3174.800792,0.016089,-0.024813,9469.0,409024.0,0.023150,158407071.0,0.042219,1,1,1,1,8
1,2006,40100,Fukuoka-ken Kitakyushu-shi,118952.714901,83333.333333,119147.214997,91304.347826,144020.0,491.0,84876.500250,977694.0,-0.006195,1.231664e+09,399893.0,3079.983198,0.055230,-0.029866,10364.0,418493.0,0.024765,159254747.0,0.005351,1,1,1,1,9
2,2007,40100,Fukuoka-ken Kitakyushu-shi,152365.103719,87500.000000,120436.099758,87980.769231,162940.0,528.0,84196.428592,971637.0,-0.006701,1.246597e+09,402496.0,3097.166849,0.012125,0.005579,7100.0,428857.0,0.016556,168622537.0,0.058823,1,1,1,1,10
3,2008,40100,Fukuoka-ken Kitakyushu-shi,135330.623702,80000.000000,128754.821073,85714.285714,170995.0,523.0,80990.177549,965126.0,-0.005586,1.255583e+09,403541.0,3111.414877,0.007209,0.004600,6812.0,435957.0,0.015625,167490841.0,-0.006711,1,1,1,1,0
4,2009,40100,Fukuoka-ken Kitakyushu-shi,101341.916775,75847.157502,101979.779110,78977.272727,172425.0,588.0,76323.457199,959735.0,-0.004858,1.242957e+09,402884.0,3085.149487,-0.010056,-0.008442,5257.0,442769.0,0.011873,160893827.0,-0.039387,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25310,2018,29402,Nara-ken Asuka-mura,63333.333333,50390.998594,79317.460317,51777.777778,1500.0,4.0,51094.525252,5162.0,-0.000194,6.308628e+06,2180.0,2893.866055,-0.023967,-0.030235,0.0,0.0,0.000000,419486.0,-0.039654,1,1,1,0,10
25311,2019,29402,Nara-ken Asuka-mura,67777.777778,15595.238095,120992.063492,17142.857143,180.0,3.0,43893.487864,5161.0,0.003488,6.493620e+06,2203.0,2947.625965,0.029324,0.018577,0.0,0.0,0.000000,429243.0,0.023259,1,1,1,0,0
25312,2020,29402,Nara-ken Asuka-mura,47058.823529,47058.823529,47058.823529,47058.823529,340.0,1.0,34467.266626,5179.0,-0.002510,6.559000e+06,2196.0,2986.794171,0.010068,0.013288,0.0,0.0,0.000000,427743.0,-0.003495,1,1,1,0,1
25313,2021,29402,Nara-ken Asuka-mura,40952.380952,25249.388293,86807.712215,52083.333333,1050.0,4.0,21786.827396,5142.0,-0.007196,6.660864e+06,2157.0,3088.022253,0.015530,0.033892,0.0,0.0,0.000000,421511.0,-0.014569,1,1,1,0,2


In [44]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[smoothed_metric].pct_change()

# set up target variables
df[metric_pct_chg] = df[smoothed_metric].pct_change(periods=years_ahead)
df[upcoming_metric_pct_chg] = df[metric_pct_chg].shift(-years_ahead)

# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [45]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

# df = get_most_active_municipalities(df, n=1500)
# df = df[df["count"] > 5]

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 16845 to 14370
Data columns (total 29 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   year                                          25315 non-null  int64  
 1   area_code                                     25315 non-null  int64  
 2   area                                          25315 non-null  object 
 3   unit_price_wmean                              25315 non-null  float64
 4   unit_price_wmedian                            25315 non-null  float64
 5   unit_price_mean                               25315 non-null  float64
 6   unit_price_median                             25315 non-null  float64
 7   total_traded_area                             25315 non-null  float64
 8   count                                         25315 non-null  float64
 9   unit_price_wmedian_smoothed                   25315 non-null  

In [47]:
df.columns

Index(['year', 'area_code', 'area', 'unit_price_wmean', 'unit_price_wmedian',
       'unit_price_mean', 'unit_price_median', 'total_traded_area', 'count',
       'unit_price_wmedian_smoothed', 'population', 'net_migration_ratio',
       'taxable_income', 'taxpayer_count', 'taxable_income_per_taxpayer',
       'taxable_income_growth', 'taxable_income_per_taxpayer_growth',
       'new_dwellings', 'existing_dwellings', 'new_dwellings_ratio',
       'total_tax', 'total_tax_growth', 'migrations_is_available',
       'taxable_income_is_available', 'total_tax_is_available',
       'dwellings_is_available', 'years_since_crisis',
       'unit_price_wmedian_smoothed_pct_chg',
       'upcoming_unit_price_wmedian_smoothed_pct_chg'],
      dtype='object')

In [48]:
columns = [
    metric_pct_chg,
    smoothed_metric,
    "year",
    "years_since_crisis",
    "count",
    "total_traded_area",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "total_tax",
    "total_tax_growth",
    "new_dwellings",
    "existing_dwellings",
    "net_migration_ratio",
    "new_dwellings_ratio",
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
    "area_code",  # id
]

# add ratios and growths if necessary

df = df[columns]
df = df.dropna()

In [49]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = metric_pct_chg
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  25313
Filtered Size:  24805


Unnamed: 0,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code
count,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0,24805.0
mean,0.080695,71564.22,2015.05773,4.418706,73.361338,19526.092522,80604.02,34594.61,116153400.0,2550.090973,0.002324,0.001462,12543700.0,0.007017,532.135053,32979.63,-0.00334,0.006614,0.927958,0.881395,0.485789,0.942431,21392.037089
std,0.588752,191836.5,4.857921,3.256426,190.60512,38310.512596,201105.0,90363.04,340459700.0,1074.34932,0.046477,0.03804,39322390.0,0.067123,1808.371033,98342.56,0.007181,0.008421,0.258563,0.32333,0.499808,0.232931,13843.755898
min,-0.778576,664.6742,2005.0,0.0,1.0,65.0,0.0,0.0,0.0,0.0,-0.682739,-0.694693,0.0,-0.677251,0.0,0.0,-0.313065,0.0,0.0,0.0,0.0,0.0,1100.0
25%,-0.16553,13912.93,2011.0,2.0,8.0,3585.0,10774.0,3636.0,9469633.0,2468.436212,-0.009195,-0.007376,1355661.0,-0.01381,0.0,0.0,-0.006916,0.0,1.0,1.0,0.0,1.0,10206.0
50%,-0.022048,27039.3,2015.0,4.0,22.0,8715.0,29033.0,11294.0,30570330.0,2727.377697,0.001116,0.0,3693996.0,0.001207,0.0,0.0,-0.003019,0.0,1.0,1.0,0.0,1.0,20482.0
75%,0.140026,66402.85,2019.0,7.0,65.0,20130.0,71261.0,29609.0,89044170.0,3038.988748,0.018024,0.010587,10213980.0,0.020194,386.0,28911.0,0.0,0.012409,1.0,1.0,1.0,1.0,32449.0
max,5.861926,6241887.0,2023.0,10.0,4358.0,666150.0,3832957.0,1906224.0,7965148000.0,12667.02,3.608838,3.549172,867276500.0,4.759148,39143.0,1916062.0,0.099253,0.12027,1.0,1.0,1.0,1.0,47362.0


In [50]:
filtered_df.to_csv(model_ready_data_paths[f"sequence_{dataset_key}_{metric_key}_{years_ahead}"], index=False)