In [52]:
import warnings

import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

from jre_utils.process import (
    get_most_active_municipalities
)

# Notebook-wide settings: silence every warning and let pandas render all columns of wide frames.
# NOTE(review): the blanket "ignore" also hides deprecation / SettingWithCopy warnings — confirm this is intended.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [54]:
# --- Run configuration -------------------------------------------------------
asset_type = "building"

# Short aggregate label -> unit-price column name.
metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

# Dataset key -> path returned by the jre_utils.datapath helpers.
transactions_path = get_derived_csv_path(asset_type)
lpa_path = get_derived_lpa_path()
plps_path = get_derived_plps_path()
dataset_paths = {
    "transactions": transactions_path,
    "lpa": lpa_path,
    "plps": plps_path,
}

# Grouping keys: one series per area, one row per (area, year).
granularity_columns = ["area_code", "area"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Selected price metric and the names of the columns derived from it.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"
upcoming_metric_pct_chg = f"upcoming_{metric_pct_chg}"


In [55]:
# Core dataset for this run; the key is reused later to name the exported file.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Municipality-level paths of the processed factor tables.
processed_factor_paths = factor_data_paths["processed"]
population_path, migration_path, taxable_income_path, new_dwellings_path = (
    processed_factor_paths[factor_name]["municipality"]
    for factor_name in ("population", "migration", "taxable_income", "new_dwellings")
)


In [56]:
# Read the core dataset first, then each factor table, in the same order as before.
df = pd.read_csv(core_path)

population_df, migration_df, taxable_income_df, new_dwellings_df = (
    pd.read_csv(factor_path)
    for factor_path in (
        population_path,
        migration_path,
        taxable_income_path,
        new_dwellings_path,
    )
)

# Left-join every factor table onto the core rows by (area_code, area, year):
# core rows without a matching factor row are kept and get NaN in the factor columns.
for factor_df in (population_df, migration_df, taxable_income_df, new_dwellings_df):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [57]:
# Preview the merged frame (pandas truncates the display to the first and last rows).
# In the output shown, the 2023 rows have NaN in every merged factor column.
df

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,population,net_migration_ratio,taxable_income,taxpayer_count,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,new_dwellings,existing_dwellings,new_dwellings_ratio
0,2023,40100,Fukuoka-ken Kitakyushu-shi,111020.840355,75850.000000,112618.534187,77171.215881,89010.0,278.0,,,,,,,,,,
1,2023,40130,Fukuoka-ken Fukuoka-shi,383051.876669,260000.000000,417016.083864,283870.967742,95515.0,365.0,,,,,,,,,,
2,2023,40202,Fukuoka-ken Omuta-shi,27522.713507,10515.650080,44566.062666,15381.818182,8255.0,26.0,,,,,,,,,,
3,2023,40203,Fukuoka-ken Kurume-shi,99893.243862,57081.815363,110045.308156,91304.347826,32785.0,99.0,,,,,,,,,,
4,2023,40204,Fukuoka-ken Nogata-shi,35340.751043,26605.692444,46028.433096,29411.764706,7190.0,25.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27723,2006,29205,Nara-ken Kashihara-shi,292307.692308,292307.692308,292307.692308,292307.692308,260.0,2.0,127413.0,-0.006703,169637011.0,48439.0,3502.075002,0.024555,-0.030735,1025.0,49344.0,0.020773
27724,2006,29207,Nara-ken Gojo-shi,44357.976654,44357.976654,45570.231959,45570.231959,2570.0,2.0,32653.0,-0.012924,38918213.0,12670.0,3071.682163,0.001536,-0.051189,107.0,13273.0,0.008061
27725,2006,29209,Nara-ken Ikoma-shi,87272.727273,87272.727273,87272.727273,87272.727273,110.0,1.0,115538.0,0.002839,217154351.0,49240.0,4410.120857,0.044310,-0.015477,1237.0,42925.0,0.028818
27726,2006,29212,Nara-ken Uda-shi,16216.216216,16216.216216,16216.216216,16216.216216,185.0,1.0,32836.0,-0.011207,44256793.0,13620.0,3249.397430,0.022078,-0.044485,,,


In [58]:
# Prepare the metric and the target variable in a single chain:
#   1. sort by area, then year, so each area's rows are in chronological order;
#   2. metric_pct_chg: row-over-row relative change of `metric` within each area;
#   3. upcoming_metric_pct_chg: the next row's metric_pct_chg within the same area (target).
# NOTE(review): pct_change/shift operate on adjacent rows, not adjacent calendar years, so if
# an area has missing years the change spans the gap — confirm that is acceptable.
df = df.sort_values(by=group_by_columns, ascending=True).assign(
    **{
        metric_pct_chg: lambda frame: (
            frame.groupby(granularity_columns)[metric].pct_change()
        ),
        upcoming_metric_pct_chg: lambda frame: (
            frame.groupby(granularity_columns)[metric_pct_chg].shift(-1)
        ),
    }
)

# Optional time box (currently disabled):
# start_year = 2005
# end_year = 2023
# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [60]:
# Restrict the frame to the most frequent (most active) municipalities via the project helper, with n=1500.
# NOTE(review): the helper's ranking criterion is not visible in this notebook — presumably
# transaction count; confirm in jre_utils.process.
# TODO: consider skipping this filter and instead building a custom cost function that takes
# `count` as a parameter (the lower the count, the lower the cost).

df = get_most_active_municipalities(df, n=1500)
# Alternative row-level filter, currently disabled:
# df = df[df["count"] > 5]

In [61]:
# Summarise dtypes and non-null counts after the municipality filter; in the output shown, the
# factor columns (e.g. population) have fewer non-null rows than the price columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 24017 to 3344
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   year                                 25315 non-null  int64  
 1   area_code                            25315 non-null  int64  
 2   area                                 25315 non-null  object 
 3   unit_price_wmean                     25315 non-null  float64
 4   unit_price_wmedian                   25315 non-null  float64
 5   unit_price_mean                      25315 non-null  float64
 6   unit_price_median                    25315 non-null  float64
 7   total_traded_area                    25315 non-null  float64
 8   count                                25315 non-null  float64
 9   population                           23454 non-null  float64
 10  net_migration_ratio                  23454 non-null  float64
 11  taxable_income                

In [62]:
# List the available column names, including the derived metric_pct_chg and
# upcoming_metric_pct_chg columns, before choosing the model inputs.
df.columns

Index(['year', 'area_code', 'area', 'unit_price_wmean', 'unit_price_wmedian',
       'unit_price_mean', 'unit_price_median', 'total_traded_area', 'count',
       'population', 'net_migration_ratio', 'taxable_income', 'taxpayer_count',
       'taxable_income_per_taxpayer', 'taxable_income_growth',
       'taxable_income_per_taxpayer_growth', 'new_dwellings',
       'existing_dwellings', 'new_dwellings_ratio',
       'unit_price_wmedian_pct_chg', 'upcoming_unit_price_wmedian_pct_chg'],
      dtype='object')

In [63]:
# Columns exported for modelling; the list order is the column order of the output.
# NOTE(review): upcoming_metric_pct_chg is not listed, so the target column computed earlier
# is dropped by this selection — confirm that is intended.
columns = [
    # price metric and its change
    metric_pct_chg,
    metric,
    # time and transaction activity
    "year",
    "count",
    "total_traded_area",
    # merged factor columns
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "new_dwellings",
    "existing_dwellings",
    "net_migration_ratio",
    "new_dwellings_ratio",
    # identifier
    "area_code",
]

# TODO: add further ratio / growth features here if necessary.

# Keep only the selected columns, then drop every row with a missing value in any of them.
df = df[columns].dropna()

In [64]:
# Trim extreme values of the pct-change metric: keep only rows whose value lies inside the
# [q, 1 - q] quantile band, inclusive on both ends. Both bounds are computed on the
# unfiltered frame.
print("Initial Size: ", df.shape[0])

q = 0.01  # fraction trimmed from each tail
filter_col = metric_pct_chg
lower_bound = df[filter_col].quantile(q)
upper_bound = df[filter_col].quantile(1 - q)

# Boolean indexing already returns a new frame, so no explicit copy is needed.
# Series.between is inclusive on both ends by default, matching the original >= / <= test.
filtered_df = df[df[filter_col].between(lower_bound, upper_bound)]

print("Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  20273
Filtered Size:  19867


Unnamed: 0,unit_price_wmedian_pct_chg,unit_price_wmedian,year,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,area_code
count,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0,19867.0
mean,0.162809,71841.89,2014.542508,80.840389,21322.759853,89669.27,40257.98,135147900.0,2891.265707,0.002563,0.00202,615.433533,41263.11,-0.003468,0.007458,21664.7895
std,0.803471,181878.9,4.048787,204.025601,40411.00553,212742.6,96944.83,365489700.0,565.59008,0.043341,0.032696,1933.59001,106028.2,0.006954,0.008316,13825.299025
min,-0.867237,79.69444,2006.0,1.0,65.0,1479.0,487.0,1127517.0,1989.133483,-0.682739,-0.694693,0.0,0.0,-0.203379,0.0,1100.0
25%,-0.271031,13608.46,2011.0,9.0,4170.0,14088.5,5932.5,15297110.0,2562.328467,-0.011969,-0.0088,0.0,6170.0,-0.007138,0.0,10424.0
50%,-0.013493,28241.55,2015.0,25.0,9805.0,32991.0,14232.0,38991460.0,2786.925732,0.005236,0.002293,94.0,14473.0,-0.003496,0.006201,20583.0
75%,0.308145,68220.2,2018.0,73.0,21922.5,78717.0,35326.0,107771300.0,3087.96901,0.020447,0.012697,483.5,34376.0,6.5e-05,0.01324,33207.0
max,5.739935,4417898.0,2021.0,4358.0,666150.0,3811873.0,1906224.0,7965148000.0,12667.02,1.06604,1.026488,41746.0,1916062.0,0.099253,0.12027,47362.0


In [65]:
# Write the outlier-filtered frame to the model-ready location for this dataset key,
# without the pandas index column.
output_path = model_ready_data_paths[f"sequence_{dataset_key}"]
filtered_df.to_csv(output_path, index=False)