In [28]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_built_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

# Suppress every warning category for the remainder of the session.
warnings.simplefilter("ignore")
# Do not truncate wide frames when they are displayed.
pd.options.display.max_columns = None

In [29]:
# --- Run configuration ---
asset_type = "land"
years_ahead = 2

# Metric key -> price column name. Each base metric also has a "_smoothed"
# variant whose key and column name simply carry that suffix.
_base_metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}
metrics = dict(_base_metrics)
metrics.update(
    {f"{key}_smoothed": f"{column}_smoothed" for key, column in _base_metrics.items()}
)

# Dataset key -> path of the derived source file.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

# Columns identifying an area, and an area within a given year.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

metric_key_unsmoothed = "median"
metric_unsmoothed = metrics[metric_key_unsmoothed]

# To work with the smoothed series instead, use:
# metric_key = f"{metric_key_unsmoothed}_smoothed"
metric_key = "median"
metric = metrics[metric_key]

# Derived column names based on the selected metric.
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"


In [30]:
# Core dataset that the model-ready table is built from.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Processed, municipality-level factor files.
processed_factor_paths = factor_data_paths["processed"]
population_path = processed_factor_paths["population"]["municipality"]
migration_path = processed_factor_paths["migration"]["municipality"]
taxable_income_path = processed_factor_paths["taxable_income"]["municipality"]
new_dwellings_path = processed_factor_paths["new_dwellings"]["municipality"]
lfs_revenue_path = processed_factor_paths["lfs_revenue_breakdown"]["municipality"]

# The output path is looked up by a name that encodes the run configuration.
dataset_name = "_".join(
    ["sequence", dataset_key, asset_type, metric_key, str(years_ahead)]
)
model_built_data_path = model_built_data_paths[dataset_name]

In [31]:
# Load the core dataset and tag every row with the asset type.
df = pd.read_csv(core_path).assign(asset_type=asset_type)

# Load each factor table.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join the factor tables one after another on group_by_columns, so rows of
# the core dataset are kept even when a factor has no matching entry.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [32]:
# prepare metrics
# Order rows chronologically within each area so that row offsets follow time.
df = df.sort_values(by=group_by_columns, ascending=True)

rows_by_area = df.groupby(granularity_columns)
raw_pct_chg = rows_by_area[metric].pct_change(periods=years_ahead)
# pct_change offsets by rows, not by calendar years. If an area has no row for
# some year, the row `years_ahead` positions back is older than intended and the
# result would be a change over a longer span. Keep the value only where that
# earlier row is exactly `years_ahead` years before; otherwise leave it missing.
year_span = rows_by_area["year"].diff(periods=years_ahead)
df[metric_pct_chg] = raw_pct_chg.where(year_span == years_ahead)
# Rows with a missing percentage change are deliberately NOT dropped here
# (i.e. no `df = df[~df[metric_pct_chg].isna()]`): the data from the first two
# years must stay, e.g. 2007 should have 2005 and 2006 data in its window.

In [33]:
# prepare additional factors
def _zscore(values):
    """Standardise a Series: subtract its mean and divide by its standard deviation."""
    return (values - values.mean()) / values.std()


# NOTE: the growth columns rely on df already being sorted by area and year.
area_groups = df.groupby(granularity_columns)
# pct_change compares each row with the previous ROW of the same area. Where the
# previous row is not the previous calendar year (a gap in the area's series),
# the result is not a yearly growth, so it is left missing instead.
is_consecutive_year = area_groups["year"].diff() == 1
count_growth = area_groups["count"].pct_change()
price_growth = area_groups[metric].pct_change()
df["count_growth"] = count_growth.where(is_consecutive_year)
df["yearly_price_growth"] = price_growth.where(is_consecutive_year)

# Level columns: log10(1 + x), then z-score within each year.
for column in ["count", "total_traded_area", metric]:
    log_column = f"{column}_log"
    # Vectorised over the whole column rather than a per-element apply.
    df[log_column] = np.log10(1 + df[column])
    df[f"{log_column}_normalized_yearly"] = df.groupby("year")[log_column].transform(_zscore)

# Growth columns: z-score within each year.
# metric_pct_chg_normalized_yearly will be the key metric.
for column in ["count_growth", "yearly_price_growth", metric_pct_chg]:
    df[f"{column}_normalized_yearly"] = df.groupby("year")[column].transform(_zscore)

In [34]:
# Indicator columns: 1 where the source column has a value, 0 where it is missing.
availability_sources = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    "dwellings_is_available": "new_dwellings",
    "metric_pct_chg_is_available": metric_pct_chg,
}
for flag_column, source_column in availability_sources.items():
    df[flag_column] = df[source_column].notna().astype(int)

In [35]:
# Identifier columns carried through unchanged.
id_columns = ["area_code", "area", "year", "asset_type"]

# Columns whose "<name>_log_normalized_yearly" variant is used as a feature.
log_normalize_columns = [
    metric,
    "count",
    "total_traded_area",
    "in_migrations",
    "out_migrations",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "total_tax",
    "new_dwellings",
    "existing_dwellings",
]

# Columns whose "<name>_normalized_yearly" variant is used as a feature.
normalize_columns = [
    metric_pct_chg,
    "count_growth",
    "yearly_price_growth",
    "total_tax_growth",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
    "taxpayer_count_growth",
]

# Columns used as features as they are (availability indicators).
maintain_columns = [
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
    "metric_pct_chg_is_available",
]

log_feature_columns = [name + "_log_normalized_yearly" for name in log_normalize_columns]
normalized_feature_columns = [name + "_normalized_yearly" for name in normalize_columns]
feature_columns = [*log_feature_columns, *normalized_feature_columns, *maintain_columns]

# Output layout: identifiers, raw values, then the model features.
final_columns = [*id_columns, *normalize_columns, *log_normalize_columns, *feature_columns]

# add ratios and growths if necessary

# Keep only the selected columns, in that order.
df = df.loc[:, final_columns]

In [36]:
# Write the frame to the configured output path, without the index column.
df.to_csv(path_or_buf=model_built_data_path, index=False)

In [37]:
# Sanity check: a change below -1 (-100%) would require a negative price,
# so any rows shown here indicate bad data.
df.loc[df[metric_pct_chg].lt(-1)]

Unnamed: 0,area_code,area,year,asset_type,unit_price_median_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_median,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_median_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_median_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available


In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,area_code,year,unit_price_median_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_median,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_median_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_median_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available
count,20295.0,20295.0,17737.0,19016.0,19016.0,20205.0,19105.0,19105.0,20295.0,12029.0,19105.0,20295.0,20295.0,20295.0,20295.0,20295.0,20295.0,19105.0,19105.0,19105.0,20226.0,12029.0,12029.0,20295.0,20295.0,20295.0,20295.0,20295.0,20295.0,19105.0,19105.0,19105.0,20226.0,12029.0,12029.0,17737.0,19016.0,19016.0,20205.0,19105.0,19105.0,20295.0,12029.0,19105.0,20295.0,20295.0,20295.0,20295.0,20295.0
mean,21810.306036,2014.483075,0.072245,0.080341,0.079387,0.007299,0.003431,0.001448,-0.002821,0.013669,0.001872,53109.91,81.317664,31430.114807,4052.557477,4075.916591,100172.1,44871.78,151091200.0,2938.44742,15386830.0,1113.780281,68642.67,-2.240687e-16,-3.220987e-16,1.015311e-16,0.285565,0.287425,0.328492,0.439969,0.435636,0.200435,0.567898,-0.102033,-0.098002,-3.204793e-18,-7.473104e-18,7.473104e-18,0.058629,0.038248,-0.012715,0.115247,-0.077256,0.100833,1.0,0.941365,0.592708,0.9966,0.873959
std,13475.587026,4.601001,0.734218,0.480476,0.89803,0.053318,0.049414,0.040574,0.006282,0.007082,0.023563,124599.8,135.938139,45165.256404,11972.539899,11521.625932,224097.9,101724.4,384272400.0,580.254029,43662600.0,2558.929759,133986.2,0.9995811,0.9995811,0.9995811,0.76855,0.768753,0.741401,0.754313,0.771642,1.011083,0.713669,1.016128,1.039491,0.999577,0.9995792,0.9995792,1.063753,0.859694,0.872235,0.699921,0.955867,0.791034,0.0,0.234946,0.491342,0.05821,0.331904
min,1100.0,2005.0,-0.978529,-0.903226,-0.994707,-0.677251,-0.682739,-0.694693,-0.102674,0.000557,-0.590998,108.5,3.0,365.0,59.0,83.0,2729.0,1164.0,2693730.0,1989.133483,391955.0,5.0,4966.0,-4.574096,-2.327901,-3.37662,-1.690953,-1.581958,-1.515944,-1.386153,-1.392187,-2.098896,-1.251862,-3.553202,-2.205262,-2.018725,-2.305423,-2.157031,-15.073015,-18.121708,-23.624752,-7.575169,-2.178107,-14.862366,1.0,0.0,0.0,0.0,0.0
25%,11219.0,2011.0,-0.184615,-0.181818,-0.153846,-0.01409,-0.010434,-0.008953,-0.006389,0.009025,-0.007033,12000.0,19.0,8295.0,553.0,664.0,18824.5,8032.0,21932500.0,2603.444028,2400727.0,179.0,18135.0,-0.6486109,-0.7246487,-0.6664908,-0.289956,-0.268532,-0.230886,-0.130829,-0.143638,-0.48295,0.030483,-0.815951,-0.852958,-0.3766655,-0.5395105,-0.3539963,-0.298758,-0.249755,-0.251508,-0.288573,-0.717679,-0.262668,1.0,1.0,0.0,1.0,1.0
50%,21206.0,2014.0,-0.014706,0.0,0.0,0.003579,0.005775,0.00186,-0.00294,0.012717,0.002755,22000.0,40.0,16750.0,1229.0,1365.0,40678.0,17466.0,48943170.0,2823.711171,5138897.0,411.0,29958.0,-0.1053462,-0.05313857,-0.02363019,0.195888,0.193019,0.263362,0.36849,0.34674,0.034002,0.686825,-0.164635,-0.31192,-0.1234866,-0.1617178,-0.1178903,-0.006773,0.017914,-0.031802,0.091066,-0.165355,0.09006,1.0,1.0,1.0,1.0,1.0
75%,32528.0,2018.0,0.153846,0.224304,0.153846,0.021758,0.020513,0.011787,0.000454,0.016825,0.011653,48000.0,87.0,35665.0,3053.0,3146.0,88834.5,39879.0,122377700.0,3136.648118,13165990.0,963.0,62161.0,0.5988007,0.6526626,0.6676802,0.752025,0.730728,0.767796,0.899647,0.905388,0.710867,1.012551,0.516828,0.469637,0.1246749,0.2947983,0.1201075,0.324056,0.285159,0.186711,0.476281,0.392678,0.452938,1.0,1.0,1.0,1.0,1.0
max,47381.0,2022.0,29.136986,11.565217,84.714286,2.199177,3.608838,3.549172,0.057609,0.12027,0.59417,2900000.0,2419.0,692620.0,214930.0,236014.0,3832957.0,1906224.0,7965148000.0,12667.02,867276500.0,42858.0,1916062.0,4.09298,3.772277,3.363578,3.327361,3.462084,3.258549,3.380084,3.424611,9.642084,3.505448,3.413502,4.107772,26.13016,16.41546,33.64502,31.179627,38.584813,40.093104,6.442511,12.763745,16.181699,1.0,1.0,1.0,1.0,1.0
