In [20]:
import warnings

import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

# Session-wide display/diagnostic settings.
# NOTE(review): a blanket "ignore" also hides deprecation and dtype warnings
# raised by the cells below; consider narrowing it to specific categories.
warnings.filterwarnings(action="ignore")
# Show every column when a wide frame is rendered (no truncation).
pd.set_option("display.max_columns", None)

In [30]:
# Asset type passed to the derived-CSV path helper.
asset_type = "building"

# Short aggregate name -> unit-price column that holds it.
metrics = dict(
    weighted_mean="unit_price_wmean",
    weighted_median="unit_price_wmedian",
    mean="unit_price_mean",
    median="unit_price_median",
)

# Locations of the derived datasets, as resolved by the jre_utils path helpers.
dataset_paths = dict(
    main=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

# Keys identifying one area, and one area-year observation respectively.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric used throughout the notebook, plus the names of the columns derived from it.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"
upcoming_metric_pct_chg = f"upcoming_{metric_pct_chg}"


In [31]:
# Core table for the chosen asset type.
core_path = dataset_paths["main"]

# Municipality-level processed file for each factor, in a fixed order.
(
    population_path,
    migration_path,
    taxable_income_path,
    new_dwellings_path,
) = (
    factor_data_paths["processed"][factor]["municipality"]
    for factor in ("population", "migration", "taxable_income", "new_dwellings")
)


In [32]:
# Load the core table and one table per factor.
df = pd.read_csv(core_path)

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)

# Attach each factor table on the area/year keys. Left joins keep every core
# row; rows without a matching factor record get NaN in the factor columns.
for factor_df in (population_df, migration_df, taxable_income_df, new_dwellings_df):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [33]:
# Inclusive year window kept for modelling.
start_year = 2005
end_year = 2023

# Order rows by area and then year so the group-wise operations below walk
# each area forward in time.
df = df.sort_values(group_by_columns)

# Relative change of the metric versus the previous row of the same area.
# NOTE(review): this is row-to-row, not year-to-year; if an area has a missing
# year the change spans the gap. Confirm that is acceptable.
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()

# Target variable: the next row's change within the same area.
df[upcoming_metric_pct_chg] = df.groupby(granularity_columns)[metric_pct_chg].shift(-1)

# The window is applied after the changes are computed, so the first in-window
# year can still reference an earlier row.
# NOTE(review): re-running this cell alone recomputes the changes on the
# already-trimmed frame, which would alter the first in-window values.
df = df[df["year"].between(start_year, end_year)]

In [34]:
# Print column names, non-null counts and dtypes to check merge coverage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27728 entries, 18191 to 15439
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   year                                 27728 non-null  int64  
 1   area_code                            27728 non-null  int64  
 2   area                                 27728 non-null  object 
 3   unit_price_wmean                     27728 non-null  float64
 4   unit_price_wmedian                   27728 non-null  float64
 5   unit_price_mean                      27728 non-null  float64
 6   unit_price_median                    27728 non-null  float64
 7   total_traded_area                    27728 non-null  float64
 8   count                                27728 non-null  float64
 9   population                           25603 non-null  float64
 10  net_migration_ratio                  25603 non-null  float64
 11  taxable_income               

In [35]:
# Columns carried into the model-ready table.
# NOTE(review): the upcoming_* target column built earlier is not in this
# list, so it is dropped here. Confirm that is intended.
columns = [
    # price metric and its change
    metric_pct_chg, metric,
    # time and transaction volume
    "year", "count", "total_traded_area",
    # municipality factors
    "population", "taxpayer_count", "taxable_income",
    "new_dwellings", "existing_dwellings", "net_migration_ratio",
    # id
    "area_code",
]

# add ratios and growths if necessary

# Keep only the selected columns, then discard any row with a missing value.
df = df[columns].dropna()


In [36]:
# Trim the most extreme changes: keep rows whose value lies within the
# [q, 1 - q] quantile range (bounds inclusive) of the filter column.
q = 0.01
filter_col = metric_pct_chg

print("Initial Size: ", df.shape[0])

# Both bounds are taken from the unfiltered frame before any row is removed.
lower_bound = df[filter_col].quantile(q)
upper_bound = df[filter_col].quantile(1 - q)
filtered_df = df[df[filter_col].between(lower_bound, upper_bound)].copy()

print("Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  11365
Filtered Size:  11137


Unnamed: 0,unit_price_wmedian_pct_chg,unit_price_wmedian,year,count,total_traded_area,population,taxpayer_count,taxable_income,new_dwellings,existing_dwellings,net_migration_ratio,area_code
count,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0,11137.0
mean,0.051114,103007.5,2014.494747,134.835683,33987.655114,148151.8,66757.09,227076100.0,1105.607255,69758.67,-0.002375,21987.623058
std,0.422676,235996.0,4.079049,259.943738,50153.308615,269648.2,122980.8,467741500.0,2477.181054,134716.6,0.005458,12947.018475
min,-0.730657,510.355,2006.0,1.0,250.0,2961.0,1124.0,2670080.0,0.0,2177.0,-0.058973,1100.0
25%,-0.19449,20195.5,2011.0,29.0,11440.0,42226.0,18188.0,48646040.0,182.0,18500.0,-0.005622,11245.0
50%,-0.011226,40650.87,2015.0,63.0,19470.0,70332.0,30802.0,93279510.0,411.0,30518.0,-0.002518,21209.0
75%,0.190476,104489.6,2018.0,134.0,35470.0,140843.0,64054.0,205880700.0,964.0,63599.0,0.000453,32204.0
max,2.389143,4417898.0,2021.0,4358.0,666150.0,3811873.0,1906224.0,7965148000.0,41746.0,1916062.0,0.043446,47215.0


In [37]:
# Write the filtered table to the "sequence" model-ready location, without the index.
sequence_path = model_ready_data_paths["sequence"]
filtered_df.to_csv(sequence_path, index=False)