In [46]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_built_data_paths,
    get_derived_csv_path,
)

# Notebook-wide display settings.
# The bare "ignore" filter silences every warning category for the rest of the
# kernel session, including deprecation notices from pandas/numpy.
warnings.simplefilter("ignore")
# None removes the column cap so wide frames are rendered in full.
pd.options.display.max_columns = None

In [47]:
# --- Run configuration -------------------------------------------------------
asset_type = "building"
# Number of periods used for the target percent-change (see the metric cell).
years_ahead = 2

# Short metric key -> name of the price column it refers to.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
)

# Dataset key -> CSV path of the core data for the chosen asset type.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
)

# An area is identified by name + code; adding "year" gives the row-level key.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Metric selected for this run, plus the name of its percent-change column.
metric_key = "gmean"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"


In [35]:
def _municipality_factor_path(factor_name):
    """Return the path registered for `factor_name` under processed -> municipality."""
    return factor_data_paths["processed"][factor_name]["municipality"]


# Core dataset to build the model data from.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Factor tables that get joined onto the core dataset.
population_path = _municipality_factor_path("population")
migration_path = _municipality_factor_path("migration")
taxable_income_path = _municipality_factor_path("taxable_income")
new_dwellings_path = _municipality_factor_path("new_dwellings")
lfs_revenue_path = _municipality_factor_path("lfs_revenue_breakdown")

# Output location is looked up by a name assembled from the run configuration.
dataset_name = "_".join(
    map(str, ("sequence", dataset_key, asset_type, metric_key, years_ahead))
)
model_built_data_path = model_built_data_paths[dataset_name]

In [36]:
# Core rows, tagged with the asset type they describe.
df = pd.read_csv(core_path).assign(asset_type=asset_type)

# Factor tables; each is joined on (area, area_code, year).
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join every factor table so that all core rows are kept even when a factor
# is missing for that area/year. validate="many_to_one" makes pandas raise a
# MergeError if a factor table repeats a join key; without it such a table would
# silently duplicate core rows and skew every statistic computed afterwards.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(
        factor_df,
        on=group_by_columns,
        how="left",
        validate="many_to_one",
    )

In [37]:
# Spot-check the merged frame for a single area code (13101).
is_spot_check_area = df["area_code"] == 13101
df.loc[is_spot_check_area, ["year", "area_code", "area", "count"]]

Unnamed: 0,year,area_code,area,count
183,2005,13101,Tokyo-to Chiyoda-ku,111.0
184,2006,13101,Tokyo-to Chiyoda-ku,165.0
185,2007,13101,Tokyo-to Chiyoda-ku,133.0
186,2008,13101,Tokyo-to Chiyoda-ku,122.0
187,2009,13101,Tokyo-to Chiyoda-ku,129.0
188,2010,13101,Tokyo-to Chiyoda-ku,157.0
189,2011,13101,Tokyo-to Chiyoda-ku,164.0
190,2012,13101,Tokyo-to Chiyoda-ku,196.0
191,2013,13101,Tokyo-to Chiyoda-ku,322.0
192,2014,13101,Tokyo-to Chiyoda-ku,236.0


In [38]:
# Dump the spot-check rows for area code 13101 to a CSV in the working directory.
# NOTE(review): the file is called "migration.csv" but the exported columns are
# year / area_code / area / count -- confirm the name is intended.
spot_check_columns = ["year", "area_code", "area", "count"]
spot_check_rows = df.loc[df["area_code"] == 13101, spot_check_columns]
spot_check_rows.round(3).to_csv("migration.csv", index=False)

In [39]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change(periods=years_ahead)
# df = df[~df[metric_pct_chg].isna()] # I don't want to drop the data from the first two years. I.e. 2007 should have 2005 and 2006 data in it's window

In [40]:
# prepare additional factors
df["count_growth"] = df.groupby(granularity_columns)["count"].pct_change()
df["yearly_price_growth"] = df.groupby(granularity_columns)[metric].pct_change()

for column in ["count", "total_traded_area", metric]:
    df[f"{column}_log"] = df[column].apply(lambda x: np.log10(1 + x))
    df[f"{column}_log_normalized_yearly"] = df.groupby("year")[f"{column}_log"].transform(
        lambda x: (x - x.mean()) / x.std()
    )

for column in ["count_growth", "yearly_price_growth", metric_pct_chg]: # metric_pct_chg_normalized_yearly will be the key metric
    df[f"{column}_normalized_yearly"] = df.groupby("year")[column].transform(
        lambda x: (x - x.mean()) / x.std()
    )

In [41]:
# 0/1 indicator columns: 1 where the source column has a value, 0 where it is
# missing. Maps each flag column to the column whose presence it reports.
availability_sources = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    "dwellings_is_available": "new_dwellings",
    "metric_pct_chg_is_available": metric_pct_chg,
}

for flag_column, source_column in availability_sources.items():
    df[flag_column] = df[source_column].notna().astype(int)

In [42]:
# Raw columns whose feature is the "<name>_log_normalized_yearly" column.
log_normalize_columns = [
    metric,
    "count",
    "total_traded_area",
    "in_migrations",
    "out_migrations",
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "total_tax",
    "new_dwellings",
    "existing_dwellings",
]

# Raw columns whose feature is the "<name>_normalized_yearly" column.
normalize_columns = [
    metric_pct_chg,
    "count_growth",
    "yearly_price_growth",
    "total_tax_growth",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
    "taxpayer_count_growth",
]

# Indicator columns passed through unchanged.
maintain_columns = [
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
    "metric_pct_chg_is_available",
]

# Columns that identify a row rather than describe it.
id_columns = ["area_code", "area", "year", "asset_type"]

log_normalized_features = [
    f"{name}_log_normalized_yearly" for name in log_normalize_columns
]
normalized_features = [f"{name}_normalized_yearly" for name in normalize_columns]
feature_columns = [*log_normalized_features, *normalized_features, *maintain_columns]

# Output layout: identifiers, raw values, then the model features.
final_columns = [
    *id_columns,
    *normalize_columns,
    *log_normalize_columns,
    *feature_columns,
]

# NOTE: add ratios and growths here if necessary

# Selecting by label list raises KeyError if any expected column is missing.
df = df.loc[:, final_columns]

In [43]:
# Persist the model-ready frame; the positional index is not written.
df.to_csv(path_or_buf=model_built_data_path, index=False)

In [44]:
# Outlier check: rows whose target change exceeds 2 (pct_change is fractional,
# so this is more than +200%).
exceeds_threshold = df[metric_pct_chg].gt(2)
df.loc[exceeds_threshold]

Unnamed: 0,area_code,area,year,asset_type,unit_price_gmean_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_gmean,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_gmean_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_gmean_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available


In [45]:
# Summary statistics for the numeric columns of the final frame, as a last
# sanity check on counts (missing values) and value ranges.
df.describe()

Unnamed: 0,area_code,year,unit_price_gmean_pct_chg,count_growth,yearly_price_growth,total_tax_growth,taxable_income_growth,taxable_income_per_taxpayer_growth,net_migration_ratio,new_dwellings_ratio,taxpayer_count_growth,unit_price_gmean,count,total_traded_area,in_migrations,out_migrations,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,total_tax,new_dwellings,existing_dwellings,unit_price_gmean_log_normalized_yearly,count_log_normalized_yearly,total_traded_area_log_normalized_yearly,in_migrations_log_normalized_yearly,out_migrations_log_normalized_yearly,population_log_normalized_yearly,taxpayer_count_log_normalized_yearly,taxable_income_log_normalized_yearly,taxable_income_per_taxpayer_log_normalized_yearly,total_tax_log_normalized_yearly,new_dwellings_log_normalized_yearly,existing_dwellings_log_normalized_yearly,unit_price_gmean_pct_chg_normalized_yearly,count_growth_normalized_yearly,yearly_price_growth_normalized_yearly,total_tax_growth_normalized_yearly,taxable_income_growth_normalized_yearly,taxable_income_per_taxpayer_growth_normalized_yearly,net_migration_ratio_normalized_yearly,new_dwellings_ratio_normalized_yearly,taxpayer_count_growth_normalized_yearly,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,metric_pct_chg_is_available
count,5109.0,5109.0,4437.0,4768.0,4768.0,5076.0,4796.0,4796.0,5109.0,4541.0,4796.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,4796.0,4796.0,4796.0,5086.0,4541.0,4541.0,5109.0,5109.0,5109.0,5109.0,5109.0,5109.0,4796.0,4796.0,4796.0,5086.0,4541.0,4541.0,4437.0,4768.0,4768.0,5076.0,4796.0,4796.0,5109.0,4541.0,4796.0,5109.0,5109.0,5109.0,5109.0,5109.0
mean,19951.210804,2014.494422,0.039301,0.082342,0.024981,0.008147,0.007734,0.000188,0.000389,0.016365,0.007442,249054.9,154.863966,9014.518497,12350.539832,12091.629658,280631.5,127451.9,454158500.0,3476.237026,45399330.0,2360.549659,139342.4,-2.225227e-16,-2.2252270000000002e-17,1.223875e-16,1.241625,1.237928,1.224236,1.355974,1.403237,1.264167,1.324476,0.803948,0.852997,-1.441263e-17,-5.588370999999999e-19,-3.7255809999999997e-19,0.079489,0.130701,-0.048623,0.483039,0.32857,0.344609,1.0,0.938736,0.888824,0.995498,0.868467
std,10154.182987,4.616607,0.182055,0.394982,0.170813,0.039121,0.031494,0.022681,0.005111,0.006224,0.016485,170266.6,351.699096,19580.371772,21150.070179,20283.60613,381227.8,173621.3,662043200.0,797.378101,77758660.0,3634.089902,194170.0,0.9983346,0.9983346,0.9983346,0.591985,0.611627,0.570587,0.572401,0.562659,1.096887,0.509518,0.769987,0.920275,0.9983079,0.9983204,0.9983204,0.770982,0.511721,0.537243,0.577482,0.822158,0.506083,0.0,0.239838,0.314381,0.066951,0.338015
min,1100.0,2005.0,-0.749036,-0.8125,-0.77153,-0.466623,-0.313802,-0.192218,-0.036352,0.001265,-0.337342,39475.62,3.0,100.0,292.0,443.0,16288.0,7118.0,21255980.0,2441.662747,2089891.0,44.0,15789.0,-3.15535,-2.07843,-2.820555,-0.669647,-0.485729,-0.308341,-0.21153,-0.16992,-0.755111,-0.179642,-1.878069,-0.860861,-4.982102,-2.903888,-4.150723,-10.382247,-6.466902,-6.912359,-2.505351,-2.07415,-7.589756,1.0,0.0,0.0,0.0,0.0
25%,12222.0,2011.0,-0.06453,-0.125,-0.063127,-0.008426,-0.002646,-0.007888,-0.002444,0.012358,0.000488,152486.5,20.0,1335.0,3557.0,3653.0,101411.0,44647.25,144287000.0,3092.44605,14094530.0,691.0,48069.0,-0.6547715,-0.7445491,-0.740954,0.848132,0.834077,0.852999,0.972518,1.00314,0.630617,1.038575,0.255703,0.197203,-0.5379974,-0.5289933,-0.5186362,-0.184934,-0.097563,-0.254136,0.14533,-0.187217,0.038677,1.0,1.0,1.0,1.0,1.0
50%,16201.0,2015.0,0.032243,0.021739,0.019481,0.005847,0.010624,0.002027,-0.000168,0.015407,0.007037,198016.9,47.0,3115.0,6046.0,6136.0,166017.0,74390.0,250144900.0,3312.830728,24576000.0,1201.0,77305.0,-0.1432203,-0.116251,-0.08022221,1.163915,1.165349,1.170074,1.300487,1.338956,1.059603,1.236153,0.707814,0.701793,-0.03639362,-0.1430812,-0.04946484,0.03117,0.111695,-0.046308,0.396507,0.232626,0.282068,1.0,1.0,1.0,1.0,1.0
75%,27214.0,2018.0,0.12854,0.2,0.09729,0.019889,0.022611,0.010781,0.002667,0.018901,0.014235,271920.4,123.0,7945.0,11483.0,11343.0,324675.0,144656.0,501938700.0,3625.255824,45887830.0,2569.0,154849.0,0.4650099,0.6056177,0.6360937,1.557649,1.55927,1.603481,1.728595,1.754031,1.642485,1.588653,1.314607,1.419614,0.4600104,0.3182384,0.4119759,0.285299,0.327456,0.143806,0.730117,0.682685,0.609891,1.0,1.0,1.0,1.0,1.0
max,47211.0,2022.0,1.325524,4.333333,1.889849,0.832972,0.440856,0.40472,0.043446,0.084664,0.188827,1457856.0,4215.0,235145.0,211790.0,229585.0,3811873.0,1906224.0,7965148000.0,12667.02,867276500.0,41746.0,1916062.0,3.288027,3.266858,3.235278,3.319728,3.455531,3.247467,3.372613,3.411731,9.642084,3.485539,3.413502,4.075161,6.048566,10.2634,9.856246,17.432008,8.19895,9.010599,5.644308,7.774113,4.995252,1.0,1.0,1.0,1.0,1.0
