In [39]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
)
from jre_utils.config import asset_types

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas/numpy warnings raised
# by later cells — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [40]:
# --- Run configuration ---
# These three values are combined into the output dataset name.
asset_type = "combined"
dataset_key = "transactions"
years_ahead = 2  # number of periods used for the percentage-change target

# Short key -> column name of the price metric in the derived CSVs.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
)

# Columns identifying an area, and an (area, year) observation.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Metric selected for this run and the name of its percentage-change column.
metric_key = "gmean"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"


In [41]:
# Municipality-level factor tables, all read from the "processed" entry.
processed_factor_paths = factor_data_paths["processed"]
population_path = processed_factor_paths["population"]["municipality"]
migration_path = processed_factor_paths["migration"]["municipality"]
taxable_income_path = processed_factor_paths["taxable_income"]["municipality"]
new_dwellings_path = processed_factor_paths["new_dwellings"]["municipality"]
lfs_revenue_path = processed_factor_paths["lfs_revenue_breakdown"]["municipality"]

# The output location is looked up by a name built from the run configuration.
dataset_name = f"sequence_{dataset_key}_{asset_type}_{metric_key}_{years_ahead}"
model_ready_data_path = model_ready_data_paths[dataset_name]

In [42]:
# Concat all asset types horizontally here.
# When creating the time-series dataset,
# create 3 datasets and concat them vertically.
# To do that, loop 3 times with different metrics.

In [43]:
# Columns that identify one (area, year) observation.
id_columns = ["area_code", "area", "year"]

# Raw factor columns whose "<name>_log_normalized_yearly" variant is used.
factor_log_normalize_columns = [
    "in_migrations",
    "out_migrations",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "total_tax",
    "new_dwellings",
    "existing_dwellings",
]

# Raw factor columns whose "<name>_normalized_yearly" variant is used.
factor_normalize_columns = [
    "total_tax_growth",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
    "taxpayer_count_growth",
]

# Availability indicator columns, carried through without a suffix.
factor_maintain_columns = [
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
]

# Suffixed names first, then the indicator columns.
factor_log_normalized_names = [
    f"{name}_log_normalized_yearly" for name in factor_log_normalize_columns
]
factor_normalized_names = [
    f"{name}_normalized_yearly" for name in factor_normalize_columns
]
factor_columns = [
    *factor_log_normalized_names,
    *factor_normalized_names,
    *factor_maintain_columns,
]

# Every factor column kept in the output: raw values followed by derived ones.
final_factor_columns = [
    *factor_normalize_columns,
    *factor_log_normalize_columns,
    *factor_columns,
]

In [44]:
# Per-asset columns whose "<name>_log_normalized_yearly" variant is used.
core_log_normalize_columns = ["count", metric]
# Per-asset columns whose "<name>_normalized_yearly" variant is used.
core_normalize_columns = ["count_growth", "yearly_price_growth", metric_pct_chg]
# Indicator column carried through without a suffix.
core_maintain_columns = ["metric_pct_chg_is_available"]

core_log_normalized_names = [
    f"{name}_log_normalized_yearly" for name in core_log_normalize_columns
]
core_normalized_names = [
    f"{name}_normalized_yearly" for name in core_normalize_columns
]
core_columns = [
    *core_log_normalized_names,
    *core_normalized_names,
    *core_maintain_columns,
]

# Raw values first, then their derived counterparts.
final_core_columns = [
    *core_normalize_columns,
    *core_log_normalize_columns,
    *core_columns,
]

# One "<asset type>_<column>" name per (column, asset type) pair;
# the asset types vary fastest.
combined_final_core_columns = [
    f"{prefix}_{column}"
    for column in final_core_columns
    for prefix in asset_types
]

In [45]:
# Full output schema: identifiers, per-asset core columns, then factor columns.
final_columns = [*id_columns, *combined_final_core_columns, *final_factor_columns]

In [46]:
def _zscore(values):
    """Standardize a Series: subtract its mean and divide by its `std()`."""
    return (values - values.mean()) / values.std()


# Per-asset-type frames with engineered columns, keyed by asset type.
derived_dfs = {}

# The loop variable is deliberately NOT named `asset_type`: that name holds the
# run configuration value used to build the output dataset name, and rebinding
# it here would silently change the result of re-running the path cell.
for derived_asset_type in asset_types:
    asset_df = pd.read_csv(get_derived_csv_path(derived_asset_type))
    asset_df = asset_df.sort_values(by=group_by_columns, ascending=True)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the frame read from disk.
    asset_df = asset_df[group_by_columns + [metric, "count"]].copy()

    # prepare main metric
    # NOTE(review): pct_change is positional (it looks back `years_ahead` rows,
    # not calendar years) — this assumes each area has one row per consecutive
    # year; TODO confirm for areas with missing years.
    asset_df[metric_pct_chg] = asset_df.groupby(granularity_columns)[metric].pct_change(
        periods=years_ahead
    )

    # prepare additional factors
    asset_df["count_growth"] = asset_df.groupby(granularity_columns)["count"].pct_change()
    asset_df["yearly_price_growth"] = asset_df.groupby(granularity_columns)[metric].pct_change()
    asset_df["metric_pct_chg_is_available"] = asset_df[metric_pct_chg].notnull().astype(int)

    # log10(1 + x), then standardize within each year.
    for column in core_log_normalize_columns:
        asset_df[f"{column}_log"] = np.log10(1 + asset_df[column])
        asset_df[f"{column}_log_normalized_yearly"] = asset_df.groupby("year")[
            f"{column}_log"
        ].transform(_zscore)

    # Standardize the growth / change columns within each year.
    for column in core_normalize_columns:
        asset_df[f"{column}_normalized_yearly"] = asset_df.groupby("year")[column].transform(
            _zscore
        )

    asset_df = asset_df[group_by_columns + final_core_columns]

    # Prefix every core column with the asset type so the frames can be
    # merged side by side without name collisions.
    derived_dfs[derived_asset_type] = asset_df.rename(
        columns={column: f"{derived_asset_type}_{column}" for column in final_core_columns}
    )

# Outer merges keep every (year, area) observed for at least one asset type.
merge_keys = ["year", "area_code", "area"]
combined_derived_dfs = (
    derived_dfs["building"]
    .merge(derived_dfs["land"], on=merge_keys, how="outer")
    .merge(derived_dfs["condo"], on=merge_keys, how="outer")
)

In [47]:
# Display the merged per-asset frame as a visual sanity check.
combined_derived_dfs

Unnamed: 0,area,area_code,year,building_count_growth,building_yearly_price_growth,building_unit_price_gmean_pct_chg,building_count,building_unit_price_gmean,building_count_log_normalized_yearly,building_unit_price_gmean_log_normalized_yearly,building_count_growth_normalized_yearly,building_yearly_price_growth_normalized_yearly,building_unit_price_gmean_pct_chg_normalized_yearly,building_metric_pct_chg_is_available,land_count_growth,land_yearly_price_growth,land_unit_price_gmean_pct_chg,land_count,land_unit_price_gmean,land_count_log_normalized_yearly,land_unit_price_gmean_log_normalized_yearly,land_count_growth_normalized_yearly,land_yearly_price_growth_normalized_yearly,land_unit_price_gmean_pct_chg_normalized_yearly,land_metric_pct_chg_is_available,condo_count_growth,condo_yearly_price_growth,condo_unit_price_gmean_pct_chg,condo_count,condo_unit_price_gmean,condo_count_log_normalized_yearly,condo_unit_price_gmean_log_normalized_yearly,condo_count_growth_normalized_yearly,condo_yearly_price_growth_normalized_yearly,condo_unit_price_gmean_pct_chg_normalized_yearly,condo_metric_pct_chg_is_available
0,Aichi-ken Agui-cho,23441,2007,,,,6.0,67590.233575,-1.205808,0.066006,,,,0.0,,,,20.0,54723.779572,-0.544187,0.674598,,,,0.0,,,,,,,,,,,
1,Aichi-ken Agui-cho,23441,2008,1.333333,1.105567,,14.0,142315.742351,-0.674282,0.968399,0.497956,3.158972,,0.0,0.500000,-0.250691,,30.0,41004.994789,-0.290165,0.511063,0.118376,-0.581869,,0.0,,,,,,,,,,,
2,Aichi-ken Agui-cho,23441,2009,1.071429,0.024848,1.157887,29.0,145852.056278,-0.183325,1.103548,1.470449,0.151198,3.510887,1.0,0.233333,0.473600,0.104181,37.0,60424.944041,-0.023743,0.964866,0.479061,1.331892,0.359541,1.0,,,,,,,,,,,
3,Aichi-ken Agui-cho,23441,2010,0.413793,-0.019251,0.005119,41.0,143044.196187,0.055760,1.082660,0.690053,-0.133281,0.093175,1.0,0.459459,0.044494,0.539166,54.0,63113.475459,0.231435,1.015879,0.515716,-0.019356,1.443527,1.0,,,,,,,,,,,
4,Aichi-ken Agui-cho,23441,2011,-0.024390,0.010080,-0.009365,40.0,144486.087473,0.100640,1.102463,0.004318,-0.085690,-0.082755,1.0,-0.203704,0.098042,0.146898,43.0,69301.255966,0.057424,1.110362,-0.533401,0.159122,0.204269,1.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21230,Yamanashi-ken Oshino-mura,19424,2019,,,,,,,,,,,,-0.181818,1.101751,1.366668,9.0,23205.212513,-1.285960,0.237574,-0.415074,1.490448,1.809799,1.0,,,,,,,,,,,
21231,Yamanashi-ken Oshino-mura,19424,2020,,,,,,,,,,,,0.333333,-0.214747,0.650406,12.0,18221.953987,-1.091118,0.046115,0.438089,-0.585584,0.756029,1.0,,,,,,,,,,,
21232,Yamanashi-ken Oshino-mura,19424,2021,,,,,,,,,,,,-0.250000,0.612252,0.266025,9.0,29378.375210,-1.340108,0.418723,-0.791044,0.929838,0.299287,1.0,,,,,,,,,,,
21233,Yamanashi-ken Oshino-mura,19424,2022,,,,,,,,,,,,-0.111111,-0.497405,-0.189691,8.0,14765.418036,-1.345537,-0.144001,-0.132958,-1.211575,-0.549031,1.0,,,,,,,,,,,


In [48]:
# Load the municipality-level factor tables.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join each factor table onto the per-asset frame, in this fixed order,
# matching on the (area, area_code, year) key columns.
df = combined_derived_dfs
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [49]:
# Indicator columns: 1 where the source column has a value, 0 where it is null.
availability_sources = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    "dwellings_is_available": "new_dwellings",
}
for flag_column, source_column in availability_sources.items():
    df[flag_column] = df[source_column].notna().astype(int)

In [50]:
# Keep only the output schema, in the order given by `final_columns`.
df = df.loc[:, final_columns]

In [52]:
# Write the assembled frame to the model-ready path, without the row index.
df.to_csv(model_ready_data_path, index=False)

In [53]:
# Summary statistics of the numeric columns, used as a sanity check of the output.
df.describe()

Unnamed: 0,area_code,year,land_count_growth,building_count_growth,condo_count_growth,land_yearly_price_growth,building_yearly_price_growth,condo_yearly_price_growth,land_unit_price_gmean_pct_chg,building_unit_price_gmean_pct_chg,condo_unit_price_gmean_pct_chg,land_count,building_count,condo_count,land_unit_price_gmean,building_unit_price_gmean,condo_unit_price_gmean,land_count_log_normalized_yearly,building_count_log_normalized_yearly,condo_count_log_normalized_yearly,land_unit_price_gmean_log_normalized_yearly,building_unit_price_gmean_log_normalized_yearly,condo_unit_price_gmean_log_normalized_yearly,land_count_growth_normalized_yearly,building_count_growth_normalized_yearly,condo_count_growth_normalized_yearly,land_yearly_price_growth_normalized_yearly,building_yearly_price_growth_normalized_yearly,condo_yearly_price_growth_normalized_yearly,land_unit_price_gmean_pct_chg_normalized_yearly,building_unit_price_gmean_pct_chg_normalized_yearly,condo_unit_price_gmean_pct_chg_normalized_yearly,land_metric_pct_chg_is_available,building_metric_pct_chg_is_available,condo_metric_pct_chg_is_available,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxabl
e_income_is_available,dwellings_is_available,total_tax_is_available
count,21235.0,21235.0,19393.0,17342.0,4911.0,19393.0,17342.0,4911.0,18100.0,16179.0,4576.0,20721.0,18542.0,5262.0,20721.0,18542.0,5262.0,20721.0,18542.0,5262.0,20721.0,18542.0,5262.0,19393.0,17342.0,4911.0,19393.0,17342.0,4911.0,18100.0,16179.0,4576.0,20721.0,18542.0,5262.0,21141.0,19917.0,19917.0,21234.0,12175.0,19917.0,21234.0,21234.0,21234.0,19917.0,19917.0,19917.0,21164.0,12175.0,12175.0,21234.0,21234.0,21234.0,19917.0,19917.0,19917.0,21164.0,12175.0,12175.0,21141.0,19917.0,19917.0,21234.0,12175.0,19917.0,21235.0,21235.0,21235.0,21235.0
mean,21783.687921,2014.446762,0.170641,0.166097,0.106521,0.051633,0.048543,0.02634,0.039459,0.036415,0.040473,79.942908,90.882483,152.278791,46257.97,91275.77,249273.1,-2.853007e-16,-1.870051e-16,-3.024735e-16,-2.7432760000000002e-17,3.801415e-16,-2.59263e-16,-2.931131e-18,-4.916684e-18,-2.8936780000000002e-18,0.0,3.277789e-18,4.340518e-18,3.1405200000000002e-18,8.783519e-19,-7.763796999999999e-19,0.87351,0.87256,0.869631,0.007955,0.003273,0.001302,-0.002976,0.013678,0.001865,3933.123293,3960.423098,97189.9,43634.23,146813100.0,2929.987478,14901160.0,1116.890021,68472.53,0.248806,0.251034,0.291269,0.404418,0.4001,0.180698,0.539913,-0.108651,-0.10227,0.061775,0.034121,-0.012359,0.099158,-0.082956,0.09268,0.999953,0.937933,0.573346,0.996656
std,13528.582464,4.654486,2.46917,3.290677,0.44842,0.505666,0.539019,0.171165,0.519445,0.447017,0.184228,134.961584,205.93478,347.796617,123382.9,179222.3,169939.5,0.9995897,0.9995415,0.998383,0.9995897,0.9995415,0.998383,0.9995874,0.9995386,0.9983693,0.999587,0.9995386,0.9983693,0.9995855,0.9995363,0.9983593,0.332409,0.333475,0.336741,0.05783,0.049345,0.04024,0.006382,0.007129,0.024113,11819.393312,11381.524408,220984.2,100350.2,378701300.0,574.984831,42920780.0,2577.655643,133830.0,0.784131,0.784397,0.760988,0.774262,0.790126,1.008675,0.73122,1.020543,1.04155,1.129133,0.865655,0.872374,0.71007,0.956812,0.811183,0.006862,0.241284,0.494603,0.057728
min,1100.0,2005.0,-0.903226,-0.855556,-0.884615,-0.96422,-0.938191,-0.77153,-0.946739,-0.942314,-0.749036,3.0,3.0,3.0,74.91156,1065.064,39475.62,-2.294218,-2.035521,-2.526924,-4.288099,-3.552028,-3.158544,-2.293802,-2.415549,-2.906729,-2.397059,-2.951646,-4.135254,-2.30614,-2.475647,-4.977319,0.0,0.0,0.0,-0.677251,-0.682739,-0.694693,-0.102674,0.0,-0.590998,10.0,11.0,387.0,161.0,418246.0,1908.27862,52938.0,0.0,2190.0,-2.787129,-2.704741,-2.700291,-2.603703,-2.536632,-2.4012,-2.493658,-4.870548,-3.152641,-15.177904,-18.121708,-23.624752,-7.961075,-2.178107,-14.862366,0.0,0.0,0.0,0.0
25%,11215.0,2010.0,-0.181818,-0.142857,-0.121495,-0.164552,-0.149803,-0.062273,-0.199186,-0.17072,-0.065329,18.0,15.0,19.0,9407.226,23911.17,152299.5,-0.721506,-0.7463328,-0.7437905,-0.6467369,-0.6967543,-0.656684,-0.5279406,-0.5207219,-0.5269214,-0.449793,-0.4278431,-0.5194999,-0.4745879,-0.4637616,-0.5434024,1.0,1.0,1.0,-0.014169,-0.010883,-0.009179,-0.006586,0.008991,-0.007437,509.0,619.0,17586.0,7465.0,20154490.0,2596.501415,2218915.0,178.0,18040.0,-0.333947,-0.315585,-0.279643,-0.176746,-0.193856,-0.500812,0.00183,-0.826853,-0.858108,-0.302737,-0.255501,-0.25368,-0.306262,-0.724417,-0.277367,1.0,1.0,0.0,1.0
50%,21205.0,2014.0,0.0,0.022727,0.028037,-0.013472,-0.009689,0.020164,-0.031672,-0.01903,0.032458,39.0,33.0,46.0,17247.27,45359.61,197746.1,-0.04838284,-0.1274111,-0.1154106,-0.1280589,-0.07905513,-0.1446921,-0.1709586,-0.1527531,-0.1434578,-0.131575,-0.1192099,-0.05144588,-0.1360691,-0.1226657,-0.04211167,1.0,1.0,1.0,0.003698,0.005537,0.001722,-0.003098,0.012692,0.00261,1163.0,1298.0,38092.0,16506.0,46323230.0,2819.637802,4867354.0,407.0,29789.0,0.163076,0.161054,0.219164,0.331129,0.314574,0.021217,0.667247,-0.169945,-0.317344,-0.00803,0.014239,-0.031954,0.076093,-0.171896,0.08452,1.0,1.0,1.0,1.0
75%,32528.0,2018.0,0.234568,0.244898,0.222222,0.158108,0.136453,0.098459,0.155405,0.139073,0.130662,86.0,86.0,122.0,37424.97,96713.04,272306.7,0.6565215,0.6647817,0.5990424,0.5539358,0.6600848,0.4713997,0.2779252,0.2951703,0.3073336,0.229421,0.1950658,0.4093882,0.2320975,0.2238496,0.4607151,1.0,1.0,1.0,0.022335,0.020497,0.0117,0.000355,0.016838,0.011645,2935.0,3011.0,85745.75,38558.0,118763400.0,3128.844573,12632300.0,960.5,61801.0,0.725726,0.701415,0.745099,0.878663,0.88636,0.69097,1.001613,0.513909,0.466554,0.32839,0.283884,0.187873,0.464543,0.391977,0.44954,1.0,1.0,1.0,1.0
max,47382.0,2022.0,206.0,324.333333,8.0,23.478425,42.673283,1.889849,13.437224,12.175872,1.325524,2419.0,4258.0,4215.0,2926971.0,4076227.0,1457856.0,3.720151,3.975306,3.259264,4.372765,4.153827,3.300096,16.44717,27.18723,10.01943,27.694683,32.05835,9.886444,19.60862,23.49057,5.980418,1.0,1.0,1.0,3.053763,3.608838,3.549172,0.057609,0.12027,0.59417,214930.0,236014.0,3832957.0,1906224.0,7965148000.0,12667.02,867276500.0,42858.0,1916062.0,3.327361,3.462084,3.258549,3.380084,3.424611,9.642084,3.505448,3.413502,4.107772,53.722237,38.584813,40.093104,6.442511,12.763745,16.181699,1.0,1.0,1.0,1.0


In [54]:
# Spot-check extreme outliers: rows whose year-over-year building price growth
# exceeds 20 (i.e. more than +2000%).
df[df["building_yearly_price_growth"] > 20]

Unnamed: 0,area_code,area,year,land_count_growth,building_count_growth,condo_count_growth,land_yearly_price_growth,building_yearly_price_growth,condo_yearly_price_growth,land_unit_price_gmean_pct_chg,building_unit_price_gmean_pct_chg,condo_unit_price_gmean_pct_chg,land_count,building_count,condo_count,land_unit_price_gmean,building_unit_price_gmean,condo_unit_price_gmean,land_count_log_normalized_yearly,building_count_log_normalized_yearly,condo_count_log_normalized_yearly,land_unit_price_gmean_log_normalized_yearly,building_unit_price_gmean_log_normalized_yearly,condo_unit_price_gmean_log_normalized_yearly,land_count_growth_normalized_yearly,building_count_growth_normalized_yearly,condo_count_growth_normalized_yearly,land_yearly_price_growth_normalized_yearly,building_yearly_price_growth_normalized_yearly,condo_yearly_price_growth_normalized_yearly,land_unit_price_gmean_pct_chg_normalized_yearly,building_unit_price_gmean_pct_chg_normalized_yearly,condo_unit_price_gmean_pct_chg_normalized_yearly,land_metric_pct_chg_is_available,building_metric_pct_chg_is_available,condo_metric_pct_chg_is_available,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,t
axable_income_is_available,dwellings_is_available,total_tax_is_available
17884,6428,Yamagata-ken Shonai-machi,2018,0.166667,0.333333,,-0.096588,42.673283,,-0.164031,4.068458,,14.0,4.0,,8245.863439,46514.838675,,-0.95639,-1.771965,,-0.619055,0.004066,,0.228354,0.785975,,-0.181303,32.058345,,-0.420088,8.021277,,1.0,1.0,,-8.7e-05,0.012188,0.00427,-0.007129,,0.007885,407.0,553.0,20480.0,9587.0,22590577.0,2356.37603,1987466.0,,,-0.437891,-0.357076,-0.172928,-0.0237,-0.136917,-1.197839,-0.258908,,,-0.110431,-0.109632,-0.176539,-0.422169,,0.116403,1,1,0,1


In [55]:
# Yearly price growth and transaction counts per asset type for one area code.
area_code = 6428
inspection_columns = (
    ["year"]
    + [f"{asset_type}_yearly_price_growth" for asset_type in asset_types]
    + [f"{asset_type}_count" for asset_type in asset_types]
)
df.loc[df["area_code"] == area_code, inspection_columns]

Unnamed: 0,year,land_yearly_price_growth,building_yearly_price_growth,condo_yearly_price_growth,land_count,building_count,condo_count
17874,2008,-0.148374,,,13.0,12.0,
17875,2009,-0.263423,0.021085,,12.0,14.0,
17876,2010,0.321346,1.464304,,9.0,5.0,
17877,2011,0.11398,-0.349555,,12.0,10.0,
17878,2012,0.062973,1.106908,,17.0,5.0,
17879,2013,-0.132984,-0.630526,,17.0,13.0,
17880,2014,-0.072015,-0.760203,,13.0,4.0,
17881,2015,0.145216,3.453824,,13.0,10.0,
17882,2016,-0.124589,-0.502936,,14.0,4.0,
17883,2017,-0.074654,-0.883946,,12.0,3.0,
