In [1]:
import warnings
import math

import pandas as pd

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

from jre_utils.process import get_most_active_municipalities

# Session-wide display/logging options.
# NOTE(review): the blanket "ignore" filter silences every warning category
# for all later cells, including pandas deprecation and dtype warnings —
# consider narrowing it to the specific category that was noisy.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)

In [3]:
# --- Run configuration -------------------------------------------------------
# Asset type handed to get_derived_csv_path() to pick the transactions CSV.
asset_type = "building"
# Number of periods passed to pct_change() when building the percentage-change
# column; it is also embedded in the key of the exported dataset.
years_ahead = 2

# Short metric name -> price column name in the core dataset.
metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
    "weighted_mean_smoothed": "unit_price_wmean_smoothed",
    "weighted_median_smoothed": "unit_price_wmedian_smoothed",
    "mean_smoothed": "unit_price_mean_smoothed",
    "median_smoothed": "unit_price_median_smoothed",
}

# Candidate core datasets; `dataset_key` (set in a later cell) selects one.
dataset_paths = {
    "transactions": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

# Key columns: the per-area identifiers, and the per-area-per-year key that is
# used for sorting and for joining the factor tables onto the core dataset.
granularity_columns = ["area", "area_code"]
group_by_columns = granularity_columns + ["year"]
# NOTE(review): `display_columns` is not referenced by any cell visible here.
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric used throughout the notebook, plus the column names derived from it.
metric_key = "weighted_median_smoothed"
metric = metrics[metric_key]
metric_pct_chg = metric + "_pct_chg"
# NOTE(review): `upcoming_metric` is not referenced by any cell visible here.
upcoming_metric = "upcoming_" + metric

In [4]:
# Core dataset to load; `dataset_key` is also embedded in the key of the
# exported dataset in the final export cell.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]
# Processed, municipality-level factor tables that the loading cell left-joins
# onto the core dataset.
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]
lfs_revenue_path = factor_data_paths["processed"]["lfs_revenue_breakdown"]["municipality"]


In [5]:
def years_since_crisis(year):
    """Return how far `year` lies past the most recent reference year.

    The hard-coded reference ("crisis") years are 1960, 1973, 1990, 1997,
    2008 and 2019. Each one opens a half-open interval that runs up to (but
    excludes) the next reference year; the last interval has no finite end.

    Args:
        year: Numeric year.

    Returns:
        `year` minus the reference year whose interval contains it, or
        ``None`` when `year` falls in no interval (e.g. anything before 1960).
    """
    reference_years = (1960, 1973, 1990, 1997, 2008, 2019)
    # Pair every reference year with the start of the following interval.
    interval_ends = reference_years[1:] + (math.inf,)

    for start, end in zip(reference_years, interval_ends):
        if start <= year < end:
            return year - start

    return None
        

In [6]:
# Core dataset, narrowed by get_most_active_municipalities with 1500 as its
# second argument (see jre_utils.process for what that argument controls).
df = get_most_active_municipalities(pd.read_csv(core_path), 1500)

# One frame per factor table.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join every factor table on the area/year key, in this fixed order.
# A left join keeps every core row; rows without a match get NaN in the
# factor columns, which the next cell turns into availability flags.
factor_dfs = (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
)
for factor_df in factor_dfs:
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [7]:
# Availability flag column -> factor column whose presence it records.
# Flags must be computed before the fillna(0) below, while a missing factor
# value is still NaN.
# NOTE(review): because `df` is overwritten with the filled frame, re-running
# this cell on its own would set every flag to 1 — re-run from the load cell.
availability_flags = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    # Might go back and undo the new dwellings filling for unknown municipalities.
    "dwellings_is_available": "new_dwellings",
}
for flag_col, source_col in availability_flags.items():
    df[flag_col] = df[source_col].notna().astype(int)

# Years elapsed since the most recent reference year defined in
# years_since_crisis() (None for years it does not cover, filled with 0 below).
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

# Replace every remaining NaN, in every column, with 0.
df = df.fillna(0)

In [8]:
# Prepare the metric: order rows by area and year, then compute, within each
# area, the relative change of the metric versus the row `years_ahead`
# positions earlier.
# NOTE(review): pct_change() counts rows, not calendar years, and looks
# backward — this assumes one row per consecutive year per area; confirm.
df = df.sort_values(by=group_by_columns)
metric_by_area = df.groupby(granularity_columns)[metric]
df[metric_pct_chg] = metric_by_area.pct_change(periods=years_ahead)

# Optional time box (currently disabled):
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [9]:
# Dtypes and non-null counts of the merged, filled frame (sanity check).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 16845 to 14370
Data columns (total 31 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   year                                 25315 non-null  int64  
 1   area_code                            25315 non-null  int64  
 2   area                                 25315 non-null  object 
 3   unit_price_wmean                     25315 non-null  float64
 4   unit_price_wmedian                   25315 non-null  float64
 5   unit_price_mean                      25315 non-null  float64
 6   unit_price_median                    25315 non-null  float64
 7   total_traded_area                    25315 non-null  float64
 8   count                                25315 non-null  float64
 9   unit_price_wmean_smoothed            25315 non-null  float64
 10  unit_price_wmedian_smoothed          25315 non-null  float64
 11  unit_price_mean_smoothed     

In [10]:
# Column names available to the feature selection in the next cell.
df.columns

Index(['year', 'area_code', 'area', 'unit_price_wmean', 'unit_price_wmedian',
       'unit_price_mean', 'unit_price_median', 'total_traded_area', 'count',
       'unit_price_wmean_smoothed', 'unit_price_wmedian_smoothed',
       'unit_price_mean_smoothed', 'unit_price_median_smoothed', 'population',
       'net_migration_ratio', 'taxable_income', 'taxpayer_count',
       'taxable_income_per_taxpayer', 'taxable_income_growth',
       'taxable_income_per_taxpayer_growth', 'new_dwellings',
       'existing_dwellings', 'new_dwellings_ratio', 'total_tax',
       'total_tax_growth', 'migrations_is_available',
       'taxable_income_is_available', 'total_tax_is_available',
       'dwellings_is_available', 'years_since_crisis',
       'unit_price_wmedian_smoothed_pct_chg'],
      dtype='object')

In [12]:
# Columns kept for the model-ready dataset. The list order is also the column
# order of the exported CSV, so do not reorder casually.
columns = [
    # percentage change of the metric, and the metric itself
    metric_pct_chg,
    metric,
    # time
    "year",
    "years_since_crisis",
    # transaction volume
    "count",
    "total_traded_area",
    # population and income
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    # local tax revenue
    "total_tax",
    "total_tax_growth",
    # dwellings and migration
    "new_dwellings",
    "existing_dwellings",
    "net_migration_ratio",
    "new_dwellings_ratio",
    # availability flags
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
    "area_code",  # id
]

# TODO: add ratios and growths if necessary

# Keep only the selected columns and drop rows that still hold a NaN. Every
# other column was filled with 0 earlier, so the NaNs left are in the
# percentage-change column (e.g. the first `years_ahead` rows of each area).
df = df[columns].dropna()

In [13]:
# Trim rows whose percentage change lies outside the [q, 1 - q] quantile band
# (both bounds inclusive), reporting the row count before and after.
q = 0.01
filter_col = metric_pct_chg

lower_bound = df[filter_col].quantile(q)
upper_bound = df[filter_col].quantile(1 - q)

print("Initial Size: ", df.shape[0])
within_band = df[filter_col].between(lower_bound, upper_bound)
filtered_df = df[within_band].copy()
print("Filtered Size: ", filtered_df.shape[0])

filtered_df.describe()

Initial Size:  22315
Filtered Size:  21867


Unnamed: 0,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code
count,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0,21867.0
mean,-0.000593,72085.4,2015.890794,4.354415,78.132803,20714.112818,83041.47,35211.77,118132400.0,2522.167423,0.002475,0.002178,12897250.0,0.003247,531.91677,34234.59,-0.003072,0.006383,0.937852,0.873417,0.488636,0.940047,21467.543559
std,0.273908,196255.1,4.321446,3.028851,198.863859,39885.02652,205527.1,92437.31,347128600.0,1095.047512,0.041031,0.031123,40520080.0,0.061631,1794.725477,101638.7,0.006989,0.007971,0.24143,0.332513,0.499882,0.237406,13798.652884
min,-0.621123,1025.209,2007.0,0.0,1.0,65.0,0.0,0.0,0.0,0.0,-0.537047,-0.48076,0.0,-0.677251,0.0,0.0,-0.313065,0.0,0.0,0.0,0.0,0.0,1100.0
25%,-0.167355,13651.16,2012.0,2.0,9.0,3907.5,11126.5,3557.5,9233596.0,2457.30551,-0.008292,-0.006307,1372152.0,-0.014501,0.0,0.0,-0.00657,0.0,1.0,1.0,0.0,1.0,10344.0
50%,-0.027596,26767.41,2016.0,4.0,24.0,9375.0,29927.0,11305.0,30402710.0,2718.857673,0.001458,0.000204,3749085.0,0.0,0.0,0.0,-0.002818,0.0,1.0,1.0,0.0,1.0,20521.0
75%,0.119323,66948.8,2020.0,7.0,70.0,21392.5,73570.5,29910.5,89846810.0,3028.263334,0.018424,0.011466,10382260.0,0.01728,384.0,29705.5,0.0,0.012158,1.0,1.0,1.0,1.0,32449.0
max,1.272794,6241887.0,2023.0,10.0,4358.0,666150.0,3794024.0,1906224.0,7965148000.0,12667.02,1.06604,1.026488,867276500.0,4.759148,39143.0,1916062.0,0.099253,0.12027,1.0,1.0,1.0,1.0,47362.0


In [14]:
# Write the trimmed frame (without the index) to the model-ready path
# registered under this dataset / metric / horizon key.
output_key = f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
filtered_df.to_csv(model_ready_data_paths[output_key], index=False)

In [15]:
# Spot check: metric and its percentage change by year for area_code 13101.
filtered_df.loc[filtered_df["area_code"] == 13101, ["year", metric, metric_pct_chg]]

Unnamed: 0,year,unit_price_wmedian_smoothed,unit_price_wmedian_smoothed_pct_chg
2012,2007,3190579.0,0.121657
2013,2008,3144827.0,0.055103
2014,2009,2910509.0,-0.08778
2015,2010,2697234.0,-0.142327
2016,2011,2354779.0,-0.190939
2017,2012,2224987.0,-0.175086
2018,2013,2791684.0,0.18554
2019,2014,3167834.0,0.423754
2020,2015,3297759.0,0.18128
2021,2016,3560900.0,0.124081
