In [108]:
import warnings

import numpy as np
import pandas as pd

from jre_utils.datapath import factor_data_paths

# Silence every Python warning for the rest of the session (this includes
# warnings raised by pandas itself, so data-handling warnings will not show).
warnings.filterwarnings("ignore")
# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option("display.max_columns", None)

In [109]:
# Processed sub-municipality taxpayer file.
all_area_taxpayers_processed_path = factor_data_paths["processed"]["taxpayer"]["submunicipality"]

# Municipality-level dwelling files, looked up as (processed, unprocessed) pairs.
new_dwellings_processed_path, new_dwellings_unprocessed_path = (
    factor_data_paths[stage]["new_dwellings"]["municipality"]
    for stage in ("processed", "unprocessed")
)

existing_dwellings_processed_path, existing_dwellings_unprocessed_path = (
    factor_data_paths[stage]["existing_dwellings"]["municipality"]
    for stage in ("processed", "unprocessed")
)

In [110]:
# Rebuild a yearly dwelling-stock figure per area from the base-year stock and
# the yearly counts of new dwellings, derive
# new_dwellings_ratio = new_dwellings / existing_dwellings, and write the
# result to new_dwellings_processed_path.
BASE_YEAR = 2018  # year whose existing_dwellings figure anchors the series


def _rows_with_digit_counts(frame, column):
    """Return a copy of the rows of `frame` whose `column` holds a usable count, cast to float.

    For text columns a value is kept when its string form consists only of
    digits, which is the rule the original `str.isdigit` filter applied.
    Missing values are dropped instead of raising AttributeError. If pandas
    already parsed the column as numeric, every non-null row is kept.
    """
    raw = frame[column]
    if pd.api.types.is_numeric_dtype(raw):
        keep = raw.notna()
    else:
        # str(value) lets NaN / non-string cells be rejected (or accepted, for
        # plain ints in a mixed-type column) instead of crashing on .isdigit().
        keep = raw.map(lambda value: str(value).isdigit())
    # .copy() so the assignment below never writes into a view of `frame`.
    kept = frame[keep].copy()
    kept[column] = kept[column].astype(float)
    return kept


new_dwellings_df = pd.read_csv(new_dwellings_unprocessed_path)
existing_dwellings_df = pd.read_csv(existing_dwellings_unprocessed_path)

# Base-year stock only: one row per (year, area_code), unusable values removed.
existing_dwellings_df = existing_dwellings_df[existing_dwellings_df["year"] == BASE_YEAR]
existing_dwellings_df = existing_dwellings_df.drop_duplicates(subset=["year", "area_code"])
existing_dwellings_df = _rows_with_digit_counts(existing_dwellings_df, "existing_dwellings")

new_dwellings_df = new_dwellings_df.drop_duplicates(subset=["year", "area_code"])
new_dwellings_df = _rows_with_digit_counts(new_dwellings_df, "new_dwellings")

# Left merge: only base-year rows can match, so existing_dwellings is NaN
# elsewhere; the right-hand "area" column arrives as "area_y" and is dropped below.
new_dwellings_df = new_dwellings_df.merge(existing_dwellings_df, on=["year", "area_code"], suffixes=["", "_y"], how="left")

# Amount to subtract when walking backwards from the base year; the base year
# itself contributes nothing.
new_dwellings_df["del_dwellings_count"] = new_dwellings_df["new_dwellings"]
new_dwellings_df.loc[new_dwellings_df["year"] == BASE_YEAR, "del_dwellings_count"] = 0

# Spread each area's single base-year stock figure across all of its rows.
new_dwellings_df["existing_dwellings"] = new_dwellings_df.groupby("area_code")["existing_dwellings"].ffill()
new_dwellings_df["existing_dwellings"] = new_dwellings_df.groupby("area_code")["existing_dwellings"].bfill()

# NOTE(review): going backwards, a year's stock has that same year's new
# dwellings subtracted; going forwards, a year's stock has that same year's new
# dwellings added. So the step (BASE_YEAR - 1) -> BASE_YEAR uses
# new_dwellings of BASE_YEAR - 1, the step BASE_YEAR -> (BASE_YEAR + 1) uses
# new_dwellings of BASE_YEAR + 1, and new_dwellings of BASE_YEAR is never used.
# Confirm this convention is intended before relying on the stock levels.

# Base year and earlier: newest first, so the running sum accumulates backwards in time.
new_dwellings_df_pre = new_dwellings_df[new_dwellings_df["year"] <= BASE_YEAR]
new_dwellings_df_pre = new_dwellings_df_pre.sort_values(by=["year", "area_code"], ascending=[False, True])
new_dwellings_df_pre["cumulative_del_dwellings_count"] = new_dwellings_df_pre.groupby("area_code")["del_dwellings_count"].cumsum()
new_dwellings_df_pre["existing_dwellings"] = new_dwellings_df_pre["existing_dwellings"] - new_dwellings_df_pre["cumulative_del_dwellings_count"]

# After the base year: oldest first, so the running sum accumulates forwards in time.
new_dwellings_df_post = new_dwellings_df[new_dwellings_df["year"] > BASE_YEAR]
new_dwellings_df_post = new_dwellings_df_post.sort_values(by=["year", "area_code"], ascending=[True, True])
new_dwellings_df_post["cumulative_del_dwellings_count"] = new_dwellings_df_post.groupby("area_code")["new_dwellings"].cumsum()
new_dwellings_df_post["existing_dwellings"] = new_dwellings_df_post["existing_dwellings"] + new_dwellings_df_post["cumulative_del_dwellings_count"]

new_dwellings_df = pd.concat([new_dwellings_df_pre, new_dwellings_df_post], ignore_index=True).sort_values(by=["year", "area_code"], ascending=[False, True])

new_dwellings_df["new_dwellings_ratio"] = new_dwellings_df["new_dwellings"] / new_dwellings_df["existing_dwellings"]

# Remove the helper columns before exporting.
new_dwellings_df = new_dwellings_df.drop(columns=["del_dwellings_count", "cumulative_del_dwellings_count", "area_y"])
new_dwellings_df.to_csv(new_dwellings_processed_path, index=False)


In [122]:
# Spot-check a single area (area_code 1202) in the reconstructed dwelling table.
is_area_1202 = new_dwellings_df["area_code"].eq(1202)
new_dwellings_df.loc[is_area_1202]

Unnamed: 0,year,area_code,area,new_dwellings,existing_dwellings,new_dwellings_ratio
18246,2021,1202,Hokkaido Hakodate-shi,1122.0,151567.0,0.007403
17256,2020,1202,Hokkaido Hakodate-shi,1265.0,150445.0,0.008408
16266,2019,1202,Hokkaido Hakodate-shi,1570.0,149180.0,0.010524
11,2018,1202,Hokkaido Hakodate-shi,1475.0,147610.0,0.009993
1001,2017,1202,Hokkaido Hakodate-shi,1676.0,145934.0,0.011485
1990,2016,1202,Hokkaido Hakodate-shi,1674.0,144260.0,0.011604
2979,2015,1202,Hokkaido Hakodate-shi,1450.0,142810.0,0.010153
3967,2014,1202,Hokkaido Hakodate-shi,1501.0,141309.0,0.010622
4955,2013,1202,Hokkaido Hakodate-shi,1706.0,139603.0,0.01222
5942,2012,1202,Hokkaido Hakodate-shi,1609.0,137994.0,0.01166


In [136]:
# Spot-check area_code 1202 in the merged table, dwelling columns only.
# NOTE: in this listing existing_plps_df is first assigned in a later cell
# (the merge/export cell), so that cell has to be run before this one.
is_area_1202 = existing_plps_df["area_code"].eq(1202)
existing_plps_df.loc[
    is_area_1202,
    ["year", "area_code", "area", "new_dwellings", "existing_dwellings", "new_dwellings_ratio"],
]

Unnamed: 0,year,area_code,area,new_dwellings,existing_dwellings,new_dwellings_ratio
3441,2021,1202,Hokkaido Hakodate-shi,1122.0,151567.0,0.007403
5161,2020,1202,Hokkaido Hakodate-shi,1265.0,150445.0,0.008408
6881,2019,1202,Hokkaido Hakodate-shi,1570.0,149180.0,0.010524
8599,2018,1202,Hokkaido Hakodate-shi,1475.0,147610.0,0.009993
10315,2017,1202,Hokkaido Hakodate-shi,1676.0,145934.0,0.011485
12028,2016,1202,Hokkaido Hakodate-shi,1674.0,144260.0,0.011604
13739,2015,1202,Hokkaido Hakodate-shi,1450.0,142810.0,0.010153
15449,2014,1202,Hokkaido Hakodate-shi,1501.0,141309.0,0.010622
17158,2013,1202,Hokkaido Hakodate-shi,1706.0,139603.0,0.01222
18866,2012,1202,Hokkaido Hakodate-shi,1609.0,137994.0,0.01166


In [112]:
from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)


In [113]:
# This dataset skips too many areas; the missing values are set to 0.

asset_type = "building"

# Aggregation name -> name of the corresponding unit-price column.
metrics = dict(
    weighted_mean="unit_price_wmean",
    weighted_median="unit_price_wmedian",
    mean="unit_price_mean",
    median="unit_price_median",
)

# Locations of the derived datasets, keyed by dataset name.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Selected metric plus the derived column names built from it.
metric = metrics["median"]
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"
upcoming_metric_pct_chg = f"upcoming_{metric_pct_chg}"

In [130]:
# Load the price table and the raw existing-dwellings table (all years).
plps_path = dataset_paths["plps"]
plps_df = pd.read_csv(plps_path)

existing_dwellings_df = pd.read_csv(existing_dwellings_unprocessed_path)

# Keep only rows with a usable dwelling count. For text columns a value is kept
# when its string form is all digits (the original str.isdigit rule); going
# through str(value) means missing or non-string cells no longer raise
# AttributeError. If pandas already parsed the column as numeric, every
# non-null row is kept.
raw_existing_counts = existing_dwellings_df["existing_dwellings"]
if pd.api.types.is_numeric_dtype(raw_existing_counts):
    has_existing_count = raw_existing_counts.notna()
else:
    has_existing_count = raw_existing_counts.map(lambda value: str(value).isdigit())

# .copy() so the cast below never writes into a view of the unfiltered frame.
existing_dwellings_df = existing_dwellings_df[has_existing_count].copy()
existing_dwellings_df["existing_dwellings"] = existing_dwellings_df["existing_dwellings"].astype(float)

In [131]:
# Spot-check area_code 1202 in the price table.
is_area_1202 = plps_df["area_code"].eq(1202)
plps_df.loc[is_area_1202]

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count
1,1997,1202,Hokkaido Hakodate-shi,99735.121195,56463.048499,118866.666667,79000.0,33541.0,48.0
1400,1998,1202,Hokkaido Hakodate-shi,89807.769595,56853.34873,108654.166667,78250.0,33541.0,48.0
2802,1999,1202,Hokkaido Hakodate-shi,82595.293694,53500.0,101197.826087,79000.0,33317.0,46.0
4206,2000,1202,Hokkaido Hakodate-shi,76001.382797,53500.0,93713.636364,77500.0,32832.0,44.0
5610,2001,1202,Hokkaido Hakodate-shi,60550.427403,52634.52381,76863.636364,74900.0,32756.0,44.0
7018,2002,1202,Hokkaido Hakodate-shi,57585.972864,51545.238095,74286.363636,71250.0,32872.0,44.0
8432,2003,1202,Hokkaido Hakodate-shi,53632.958141,49954.285714,69568.181818,68000.0,32872.0,44.0
9855,2004,1202,Hokkaido Hakodate-shi,50354.981007,46687.022901,66045.454545,65250.0,32644.0,44.0
11301,2005,1202,Hokkaido Hakodate-shi,48616.739856,48916.877256,56112.5,62000.0,28465.0,48.0
12871,2006,1202,Hokkaido Hakodate-shi,47553.138943,46500.0,54664.583333,59750.0,28465.0,48.0


In [132]:
# Spot-check which years carry an existing-dwellings figure for area_code 1202.
is_area_1202 = existing_dwellings_df["area_code"].eq(1202)
existing_dwellings_df.loc[is_area_1202]

Unnamed: 0,year,area_code,area,existing_dwellings
11,2018,1202,Hokkaido Hakodate-shi,147610.0
1927,2013,1202,Hokkaido Hakodate-shi,144660.0
3843,2008,1202,Hokkaido Hakodate-shi,147990.0


In [135]:
# Attach dwelling figures to the price table and export the dwelling columns.
# The export goes to new_dwellings_processed_path, the same path the earlier
# dwelling-stock cell writes to, so this file replaces that one.
existing_plps_df = (
    plps_df
    .merge(existing_dwellings_df, on=["year", "area_code", "area"], suffixes=["", "_y"], how="left")
    # Newest year first: within an area, ffill copies a figure down to earlier
    # years and bfill then covers the years newer than the latest figure.
    .sort_values(by=["year", "area_code"], ascending=[False, True])
    .assign(existing_dwellings=lambda frame: frame.groupby("area_code")["existing_dwellings"].ffill())
    .assign(existing_dwellings=lambda frame: frame.groupby("area_code")["existing_dwellings"].bfill())
    # After this merge the filled figure above is "existing_dwellings_y" and the
    # figure coming from new_dwellings_df is "existing_dwellings".
    .merge(new_dwellings_df, on=["year", "area_code", "area"], suffixes=["_y", ""], how="left")
    # Prefer the figure from new_dwellings_df; fall back to the filled one.
    .assign(existing_dwellings=lambda frame: frame["existing_dwellings"].fillna(frame["existing_dwellings_y"]))
    # Anything still missing (in any column) becomes 0.
    .fillna(0)
    .loc[lambda frame: frame["year"].between(2000, 2021)]
)

exported_columns = ["year", "area_code", "area", "new_dwellings", "existing_dwellings", "new_dwellings_ratio"]
existing_plps_df[exported_columns].to_csv(new_dwellings_processed_path, index=False)



In [41]:
# I have a field in a pandas dataframe that is df["existing_dwellings"]
# The existing dwellings are given at 5 year intervals for each area
# I will interpolate the values for the years in between
# For example, if area X has 100 dwellings in 2010 and 200 dwellings in 2015, I will interpolate the values for 2011, 2012, 2013, 2014
# The values for 2011, 2012, 2013, 2014 will be 120, 140, 160, 180 respectively
# Give me the code to do the above 

