In [1]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_built_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

# NOTE(review): with no category or message filter this silences every
# warning for the session, including pandas deprecation warnings.
warnings.filterwarnings(action="ignore")
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [14]:
# Run configuration: everything a reader might tune lives in this cell.
asset_type = "building"
years_ahead = 2  # look-back used when computing the percentage-change column

# Short metric keys mapped to the price column each one selects.
metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
    "weighted_mean_smoothed": "unit_price_wmean_smoothed",
    "weighted_median_smoothed": "unit_price_wmedian_smoothed",
    "mean_smoothed": "unit_price_mean_smoothed",
    "median_smoothed": "unit_price_median_smoothed",
}

# Candidate core datasets, keyed by a short name.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

# Key columns: one area, and one area-year observation.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric selected for this run.
metric_key = "median"
metric = metrics[metric_key]

# Column names derived from the selected metric.
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"


In [15]:
# Core dataset used for this run.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Municipality-level processed factor tables, resolved in one pass.
(
    population_path,
    migration_path,
    taxable_income_path,
    new_dwellings_path,
    lfs_revenue_path,
) = (
    factor_data_paths["processed"][factor_name]["municipality"]
    for factor_name in (
        "population",
        "migration",
        "taxable_income",
        "new_dwellings",
        "lfs_revenue_breakdown",
    )
)

# The output location is keyed by dataset, asset type, metric key and look-back.
dataset_name = f"sequence_{dataset_key}_{asset_type}_{metric_key}_{years_ahead}"
model_built_data_path = model_built_data_paths[dataset_name]

In [16]:
# Core table, tagged with the asset type it describes.
df = pd.read_csv(core_path).assign(asset_type=asset_type)

# Factor tables, one per source file.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join every factor table on the area/year keys so each core row is kept.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [17]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)

# Percentage change of `metric` against the value exactly `years_ahead` calendar
# years earlier in the same area. A positional `pct_change(periods=years_ahead)`
# compares against whichever row sits `years_ahead` rows back, which spans more
# than `years_ahead` years whenever an area has no row for some year.
base_metric = pd.Series(np.nan, index=df.index)
for lag in range(1, years_ahead + 1):
    years_back = df.groupby(granularity_columns)["year"].diff(periods=lag)
    lagged_metric = df.groupby(granularity_columns)[metric].shift(lag)
    # Assuming one row per area and year, at most one lag lands exactly
    # `years_ahead` years back, so earlier matches are never overwritten.
    base_metric = base_metric.mask(years_back == years_ahead, lagged_metric)
df[metric_pct_chg] = df[metric] / base_metric - 1

# Rows with no base year available are kept on purpose (their change is NaN),
# so that e.g. 2007 still has 2005 and 2006 data in its window.
# df = df[~df[metric_pct_chg].isna()]

In [18]:
# prepare additional factors
# Assumes df is still sorted by area and year from the metric preparation step.
# Year-over-year growth is only defined between consecutive calendar years:
# where an area has no row for the previous year, a positional `pct_change()`
# would silently span the gap, so those rows are set to NaN instead.
previous_row_is_previous_year = df.groupby(granularity_columns)["year"].diff() == 1
df["count_growth"] = (
    df.groupby(granularity_columns)["count"]
    .pct_change()
    .where(previous_row_is_previous_year)
)
df["yearly_price_growth"] = (
    df.groupby(granularity_columns)[metric]
    .pct_change()
    .where(previous_row_is_previous_year)
)

for column in ["count", "total_traded_area", metric]:
    # log10(1 + x) on the whole column at once rather than element by element.
    df[f"{column}_log"] = np.log10(1 + df[column])
    # Z-score within each year so values are comparable across years.
    df[f"{column}_log_normalized_yearly"] = df.groupby("year")[f"{column}_log"].transform(
        lambda x: (x - x.mean()) / x.std()
    )

for column in ["count_growth", "yearly_price_growth", metric_pct_chg]: # metric_pct_chg_normalized_yearly will be the key metric
    # Growth rates are z-scored per year directly, without the log transform.
    df[f"{column}_normalized_yearly"] = df.groupby("year")[column].transform(
        lambda x: (x - x.mean()) / x.std()
    )

In [19]:
# 1/0 indicator columns recording whether each source value is present.
for flag_column, source_column in (
    ("migrations_is_available", "net_migration_ratio"),
    ("taxable_income_is_available", "taxable_income"),
    ("total_tax_is_available", "total_tax"),
    ("dwellings_is_available", "new_dwellings"),
    ("metric_pct_chg_is_available", metric_pct_chg),
):
    df[flag_column] = df[source_column].notna().astype(int)

In [20]:
# Raw columns whose `<name>_log_normalized_yearly` variant is used as a feature.
log_normalize_columns = [
    metric,
    "count",
    "total_traded_area",
    "in_migrations",
    "out_migrations",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "total_tax",
    "new_dwellings",
    "existing_dwellings",
]

# Raw columns whose `<name>_normalized_yearly` variant is used as a feature.
normalize_columns = [
    metric_pct_chg,
    "count_growth",
    "yearly_price_growth",
    "total_tax_growth",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
    "taxpayer_count_growth",
]

# Indicator columns passed through to the features unchanged.
maintain_columns = [
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
    "metric_pct_chg_is_available",
]

# Columns identifying one observation.
id_columns = ["area_code", "area", "year", "asset_type"]

# Model features: log-normalized, then normalized, then the indicators.
feature_columns = [
    *(f"{column}_log_normalized_yearly" for column in log_normalize_columns),
    *(f"{column}_normalized_yearly" for column in normalize_columns),
    *maintain_columns,
]

# Output layout: ids, raw values, then the derived features.
final_columns = [
    *id_columns,
    *normalize_columns,
    *log_normalize_columns,
    *feature_columns,
]

# add ratios and growths if necessary

# Keep only the selected columns, in the order above.
df = df[final_columns]

In [21]:
# Persist the model-ready table; the row index is not written to the file.
df.to_csv(path_or_buf=model_built_data_path, index=False)

In [22]:
# Sanity check: rows whose change exceeds 2, i.e. the metric more than tripled.
df.loc[df[metric_pct_chg].gt(2)]

Unnamed: 0,area_code,area,year,asset_type,unit_price_median_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_median,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_median_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_median_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available
14196,5213,Akita-ken Kitaakita-shi,2017,building,2.170899,-0.133333,0.064516,0.008246,0.010451,0.005560,-0.003979,0.007335,0.004864,22580.645161,13.0,3975.0,564.0,686.0,30662.0,12188.0,28675802.0,2352.789793,3033280.0,101.0,13769.0,-0.798307,-0.931783,-1.048143,-0.250368,-0.222997,0.083924,0.133196,0.017366,-1.146395,0.007316,-1.262146,-1.192543,3.348990,-0.501146,-0.025278,-0.002860,-0.152912,-0.080740,-0.056440,-0.985714,-0.162979,1,1,1,1,1
14240,5348,Akita-ken Mitane-cho,2013,building,2.020581,0.428571,-0.445209,0.014836,0.012179,0.007761,-0.008292,,0.004385,7628.378378,10.0,7980.0,295.0,429.0,16160.0,6414.0,14366071.0,2239.799033,1421726.0,,,-1.909683,-1.161654,-0.417941,-0.685505,-0.565427,-0.339633,-0.267853,-0.370607,-1.285163,-0.440985,,,4.194427,0.665893,-1.090658,0.171774,0.144822,0.382134,-0.590778,,-0.094910,1,1,0,1,1
14244,5348,Akita-ken Mitane-cho,2017,building,2.500000,-0.166667,0.008333,0.013645,0.024080,0.006214,-0.004929,,0.017756,11000.000000,5.0,2840.0,256.0,333.0,15622.0,6305.0,14607327.0,2316.784615,1420961.0,,,-1.462121,-1.650010,-1.363378,-0.726268,-0.686725,-0.349630,-0.288386,-0.391640,-1.244704,-0.466044,,,3.883714,-0.587525,-0.133016,0.175217,0.153589,-0.063085,-0.166674,,0.396556,1,1,0,1,1
14247,5348,Akita-ken Mitane-cho,2020,building,2.504248,1.000000,0.353846,0.019286,0.022819,0.020381,-0.009506,,0.002390,8461.538462,6.0,4900.0,203.0,348.0,15254.0,6292.0,15076577.0,2396.150191,1487683.0,,,-1.651641,-1.561653,-0.928934,-0.823223,-0.622229,-0.355200,-0.291762,-0.384812,-1.160684,0.780308,,,3.827987,1.866983,0.332083,0.598651,0.440240,0.452039,-0.673120,,0.083555,1,1,0,1,1
14209,5214,Akita-ken Nikaho-shi,2014,building,2.822222,-0.476190,0.107692,-0.069959,-0.037986,0.002243,-0.009690,0.006084,-0.040139,53333.333333,11.0,2250.0,402.0,640.0,24562.0,10498.0,27126886.0,2584.005144,2722969.0,59.0,9697.0,0.014544,-1.014909,-1.542576,-0.476053,-0.283683,-0.065805,0.050086,0.012038,-0.366725,-0.044556,-1.693934,-1.534663,5.977968,-1.293966,0.112112,-2.200680,-0.908857,-0.182576,-0.761150,-1.003640,-1.783833,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16715,35344,Yamaguchi-ken Hirao-cho,2016,building,3.724571,-0.111111,1.147532,0.027842,0.005355,0.012612,-0.003527,,-0.007166,45428.571429,8.0,1810.0,402.0,445.0,12191.0,5126.0,13389313.0,2612.039212,1291046.0,,,-0.165429,-1.291380,-1.753654,-0.465720,-0.505544,-0.512126,-0.416436,-0.435444,-0.431441,-0.514417,,,4.939348,-0.419747,2.089844,0.707632,-0.555945,0.040003,-0.029067,,-1.128923,1,1,0,1,1
16680,35305,Yamaguchi-ken Suooshima-cho,2011,building,5.327160,0.230769,0.637153,-0.007041,-0.020291,-0.009063,-0.000526,,-0.011331,28472.222222,16.0,5525.0,568.0,576.0,15210.0,6108.0,15121424.0,2475.675180,1388885.0,,,-0.652745,-0.657964,-0.599596,-0.301978,-0.379348,-0.382494,-0.296671,-0.340576,-0.653452,-0.454415,,,11.941942,0.739070,1.381713,-0.147184,-0.508282,-0.355077,0.233321,,-0.428517,1,1,0,1,1
10948,19430,Yamanashi-ken Fujikawaguchiko-machi,2018,building,2.221477,0.235294,0.784615,0.016303,0.032696,0.014187,-0.003867,,0.018250,80000.000000,21.0,8740.0,1014.0,1115.0,26118.0,12163.0,40365841.0,3318.740525,4501474.0,,,0.377283,-0.541199,-0.292757,0.109990,0.092005,-0.016895,0.128128,0.214256,0.968195,0.249018,,,3.944391,0.521686,0.321846,0.452285,0.302427,0.050923,-0.022316,,0.633754,1,1,0,1,1
10862,19365,Yamanashi-ken Minobu-cho,2010,building,2.329198,0.000000,2.080750,-0.058103,-0.055232,-0.026459,-0.011270,,-0.029555,51602.564103,4.0,1255.0,285.0,425.0,12422.0,5976.0,15362432.0,2570.688086,1483002.0,,,-0.076921,-1.775275,-2.113083,-0.745416,-0.584604,-0.515255,-0.312973,-0.334063,-0.408910,-0.417119,,,4.720294,-0.324285,3.987921,-1.380701,0.253979,0.312021,-1.122612,,0.072997,1,1,0,1,1


In [23]:
# Summary statistics of the numeric columns in the final table.
df.describe()

Unnamed: 0,area_code,year,unit_price_median_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_median,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_median_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_median_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available
count,18075.0,18075.0,15783.0,16929.0,16929.0,17991.0,16945.0,16945.0,18075.0,11808.0,16945.0,18075.0,18075.0,18075.0,18075.0,18075.0,18075.0,16945.0,16945.0,16945.0,18011.0,11808.0,11808.0,18075.0,18075.0,18075.0,18075.0,18075.0,18075.0,16945.0,16945.0,16945.0,18011.0,11808.0,11808.0,15783.0,16929.0,16929.0,17991.0,16945.0,16945.0,18075.0,11808.0,16945.0,18075.0,18075.0,18075.0,18075.0,18075.0
mean,21787.703513,2014.585007,0.070466,0.086552,0.076275,0.006348,0.003242,0.001076,-0.00251,0.0136,0.002059,101194.2,92.564869,21779.414385,4438.10036,4457.430396,109698.4,49326.68,166330200.0,2971.290207,16884340.0,1109.924543,68974.41,-9.434592e-18,-2.232854e-16,2.358648e-17,0.390795,0.39278,0.432944,0.549325,0.546064,0.273416,0.656559,-0.090774,-0.086763,9.0039e-19,5.45635e-18,1.1752140000000001e-17,0.043077,0.036446,-0.024807,0.148397,-0.077213,0.114413,1.0,0.937483,0.653278,0.996459,0.873195
std,13211.069222,4.571298,0.563051,0.406933,0.72959,0.049969,0.037483,0.028099,0.005959,0.006844,0.020225,180358.1,207.679532,38182.163064,12399.711352,11919.753176,231991.2,105688.5,399153300.0,590.292308,45348690.0,2503.584914,133601.7,0.9995296,0.9995296,0.9995296,0.731524,0.731758,0.701663,0.711773,0.731281,1.017337,0.668104,1.007393,1.032552,0.9995247,0.9995273,0.9995273,0.976708,0.691877,0.706902,0.668856,0.9354,0.679608,0.0,0.242099,0.475939,0.059401,0.332764
min,1100.0,2005.0,-0.949597,-0.855556,-0.933269,-0.677251,-0.400215,-0.395675,-0.099818,0.000631,-0.403889,1010.101,3.0,320.0,98.0,90.0,2729.0,1292.0,3578817.0,1989.133483,518726.0,5.0,5814.0,-3.607744,-2.084999,-3.459538,-1.303517,-1.531965,-1.515944,-1.319372,-1.247956,-2.098896,-1.076072,-3.553202,-2.072076,-2.153636,-2.420212,-2.299884,-15.073015,-9.088769,-10.693872,-7.356845,-2.167483,-14.401495,1.0,0.0,0.0,0.0,0.0
25%,11232.0,2011.0,-0.180953,-0.145833,-0.159016,-0.013625,-0.009775,-0.008754,-0.00602,0.009042,-0.006339,25925.93,16.0,5640.0,709.5,840.0,24615.5,10536.0,28433680.0,2630.334009,2987489.0,183.0,18354.5,-0.7236617,-0.7567006,-0.6735224,-0.13526,-0.116337,-0.061194,0.044048,0.017898,-0.412809,0.162842,-0.802672,-0.8438,-0.4533846,-0.5592166,-0.4162023,-0.290109,-0.236532,-0.252952,-0.250117,-0.711322,-0.238361,1.0,1.0,0.0,1.0,1.0
50%,21213.0,2015.0,-0.009031,0.014599,-0.002491,0.003406,0.005998,0.001836,-0.002667,0.012692,0.003044,52173.91,34.0,10860.0,1425.0,1576.0,47054.0,20501.0,57929440.0,2861.296683,6111493.0,414.0,30282.5,-0.05992616,-0.1270786,-0.05807665,0.287652,0.288224,0.358155,0.468081,0.453766,0.117943,0.765561,-0.156762,-0.305024,-0.1411672,-0.1499312,-0.1313127,-0.010292,0.021498,-0.038019,0.118761,-0.16401,0.102843,1.0,1.0,1.0,1.0,1.0
75%,31390.0,2019.0,0.171553,0.226667,0.16117,0.020636,0.02019,0.011417,0.000604,0.016761,0.01147,114285.7,88.0,22205.0,3421.5,3525.0,100618.5,44762.0,135391000.0,3173.355683,14310970.0,967.25,62709.5,0.6956158,0.6619446,0.6177063,0.819501,0.806178,0.848724,0.969341,0.962132,0.79312,1.049899,0.524982,0.474101,0.1859774,0.3421786,0.1578113,0.301357,0.279377,0.17574,0.487821,0.392031,0.452754,1.0,1.0,1.0,1.0,1.0
max,47362.0,2022.0,11.688172,5.0,65.038314,2.199177,0.808115,0.764597,0.044724,0.08562,0.24245,4420875.0,4258.0,625095.0,211790.0,229585.0,3811873.0,1906224.0,7965148000.0,12667.02,867276500.0,41746.0,1916062.0,3.965719,3.931539,3.798741,3.319728,3.455531,3.247467,3.372613,3.411731,9.642084,3.485539,3.413502,4.075161,20.99515,11.65666,32.39772,31.179627,17.612125,19.00289,5.782456,9.451687,9.828815,1.0,1.0,1.0,1.0,1.0
