In [1]:
import warnings
import math

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    factor_data_paths,
    model_built_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)

# Render every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [2]:
# --- Run parameters ---
asset_type = "building"
years_ahead = 2

# Aggregation name -> unit-price column. Each aggregation also has a smoothed
# variant whose key and column name both carry the "_smoothed" suffix.
base_metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}
metrics = {
    **base_metrics,
    **{f"{key}_smoothed": f"{column}_smoothed" for key, column in base_metrics.items()},
}

# Candidate core datasets, keyed by the name used to pick one later on.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

# --- Column groups ---
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# --- Metric selection ---
# Alternative: metric_key_unsmoothed = "weighted_median"
metric_key_unsmoothed = "median"
metric_key = metric_key_unsmoothed + "_smoothed"

metric_unsmoothed = metrics[metric_key_unsmoothed]
metric = metrics[metric_key]

# Names of the columns derived from the selected smoothed metric.
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"

In [3]:
# Core dataset for this run.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Every factor table is taken from the processed, municipality-level entries.
processed_factor_paths = factor_data_paths["processed"]
population_path = processed_factor_paths["population"]["municipality"]
migration_path = processed_factor_paths["migration"]["municipality"]
taxable_income_path = processed_factor_paths["taxable_income"]["municipality"]
new_dwellings_path = processed_factor_paths["new_dwellings"]["municipality"]
lfs_revenue_path = processed_factor_paths["lfs_revenue_breakdown"]["municipality"]


In [4]:
def years_since_crisis(year):
    """Return how many years `year` lies past the start of its period.

    Periods start at 1960, 1973, 1990, 1997, 2008 and 2019. Each one runs up
    to (but not including) the next start; the last one is open-ended.

    Args:
        year: Year to look up; compared numerically against the period bounds.

    Returns:
        `year` minus the start of the period that contains it, or None when
        `year` falls in no period (for example, any year before 1960).
    """
    period_starts = (1960, 1973, 1990, 1997, 2008, 2019)
    period_ends = period_starts[1:] + (math.inf,)

    # Periods are half-open [start, end), so a boundary year belongs to the
    # period it opens and yields 0.
    return next(
        (
            year - start
            for start, end in zip(period_starts, period_ends)
            if start <= year < end
        ),
        None,
    )
        

In [5]:
# Core table for the selected dataset.
df = pd.read_csv(core_path)
# df = get_most_active_municipalities(df, 1500)

# Factor tables, one per configured path.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join each factor table in turn on group_by_columns, so core rows are
# kept even when a factor has no matching record.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

# Years elapsed since the start of the period that `year` falls in
# (period boundaries are defined in years_since_crisis).
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

In [6]:
# Flag column -> source column. Each flag is 1 where the source value is
# non-null and 0 where it is missing.
availability_flags = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    # Might go back and undo the new dwellings filling for unknown municipalities.
    "dwellings_is_available": "new_dwellings",
}

for flag_column, source_column in availability_flags.items():
    df[flag_column] = df[source_column].notna().astype(int)

In [7]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change(periods=years_ahead)
df = df[~df[metric_pct_chg].isna()]

In [8]:
# Columns kept in the exported dataset, grouped by role. The concatenation
# below fixes the final column order.
target_columns = [metric_pct_chg, metric, metric_unsmoothed]
time_columns = ["year", "years_since_crisis"]
transaction_columns = ["count", "total_traded_area"]
factor_columns = [
    "population",
    "taxpayer_count",
    "taxable_income",
    "taxable_income_per_taxpayer",
    "taxable_income_growth",
    "taxable_income_per_taxpayer_growth",
    "total_tax",
    "total_tax_growth",
    "new_dwellings",
    "existing_dwellings",
    "net_migration_ratio",
    "new_dwellings_ratio",
]
availability_columns = [
    "migrations_is_available",
    "taxable_income_is_available",
    "dwellings_is_available",
    "total_tax_is_available",
]
identifier_columns = [
    "area_code",  # id
    "area",  # additional info
]

columns = (
    target_columns
    + time_columns
    + transaction_columns
    + factor_columns
    + availability_columns
    + identifier_columns
)

# add ratios and growths if necessary

df = df[columns]

In [9]:
# (rows, columns) of the frame after column selection.
df.shape

(18826, 25)

In [10]:
# Number of distinct area codes left in the frame (a missing value, if any,
# counts as one distinct entry).
df["area_code"].nunique(dropna=False)

1377

In [11]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,unit_price_median_smoothed_pct_chg,unit_price_median_smoothed,unit_price_median,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code
count,18826.0,18826.0,18826.0,18826.0,18826.0,18826.0,18826.0,18826.0,17410.0,17410.0,17410.0,17410.0,17410.0,18790.0,18778.0,10378.0,10378.0,18826.0,10378.0,18826.0,18826.0,18826.0,18826.0,18826.0
mean,-0.002124,87908.53,88678.27,2015.617125,4.430947,81.128014,19321.521035,93876.93,42455.91,142415300.0,2905.419252,0.003295,0.002599,14445680.0,0.003111,1075.371555,69808.3,-0.002912,0.013005,1.0,0.924785,0.551259,0.998088,21603.518963
std,0.215441,170087.2,170751.0,3.973032,3.095357,197.717847,36648.558412,216842.3,99545.37,374571500.0,572.392933,0.043532,0.033404,42396540.0,0.047345,2404.116441,135690.7,0.006135,0.006488,0.0,0.263745,0.497379,0.043689,13633.134424
min,-0.846925,1380.199,100.0,2007.0,0.0,1.0,65.0,1481.0,530.0,1306821.0,1989.133483,-0.529577,-0.48076,130782.0,-0.677251,2.0,5053.0,-0.099818,0.000298,1.0,0.0,0.0,0.0,1100.0
25%,-0.118428,20274.23,20033.0,2012.0,2.0,10.0,3940.0,15909.25,6750.25,17924930.0,2576.276955,-0.010224,-0.007842,1985892.0,-0.014887,175.0,18362.25,-0.006435,0.008755,1.0,1.0,0.0,1.0,11100.0
50%,-0.01554,41072.04,42424.24,2016.0,4.0,26.0,8865.0,35740.0,15485.0,43516530.0,2798.032427,0.006283,0.003193,4564943.0,0.002282,399.5,30439.0,-0.00301,0.012268,1.0,1.0,1.0,1.0,21202.5
75%,0.087146,96127.18,98759.37,2019.0,7.0,76.0,19775.0,83013.5,37605.75,115099200.0,3099.449753,0.021106,0.013201,12198050.0,0.018414,948.75,63576.5,0.000347,0.016135,1.0,1.0,1.0,1.0,32501.0
max,5.32716,4462737.0,4420875.0,2022.0,10.0,4258.0,625095.0,3794024.0,1906224.0,7965148000.0,12667.02,1.06604,1.026488,867276500.0,2.199177,39143.0,1916062.0,0.043446,0.12027,1.0,1.0,1.0,1.0,47362.0


In [12]:
# Number of missing values per column.
df.isna().sum()

unit_price_median_smoothed_pct_chg       0
unit_price_median_smoothed               0
unit_price_median                        0
year                                     0
years_since_crisis                       0
count                                    0
total_traded_area                        0
population                               0
taxpayer_count                        1416
taxable_income                        1416
taxable_income_per_taxpayer           1416
taxable_income_growth                 1416
taxable_income_per_taxpayer_growth    1416
total_tax                               36
total_tax_growth                        48
new_dwellings                         8448
existing_dwellings                    8448
net_migration_ratio                      0
new_dwellings_ratio                   8448
migrations_is_available                  0
taxable_income_is_available              0
dwellings_is_available                   0
total_tax_is_available                   0
area_code  

In [13]:
# Current export: write the frame to the model-built path for this
# dataset / metric / horizon. Missing values are written as-is; the zero-fill
# below is disabled.
# df = df.fillna(0)
output_key = f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
df.to_csv(model_built_data_paths[output_key], index=False)

In [14]:
# Historical export (disabled): zero-filled the frame and wrote it to model_ready_data_paths.
# df = df.fillna(0)
# df.to_csv(model_ready_data_paths[f"sequence_{dataset_key}_{metric_key}_{years_ahead}"], index=False)


In [15]:
# Spot-check a single area: the smoothed metric and its change, by year.
area_code_to_inspect = 13101
df.loc[df["area_code"] == area_code_to_inspect, ["year", metric, metric_pct_chg]]

Unnamed: 0,year,unit_price_median_smoothed,unit_price_median_smoothed_pct_chg
1733,2007,2204983.0,0.227171
1734,2008,2197454.0,0.139858
1735,2009,2069647.0,-0.061378
1736,2010,1825045.0,-0.169473
1737,2011,1620211.0,-0.217156
1738,2012,1684720.0,-0.076888
1739,2013,1947510.0,0.20201
1740,2014,2130575.0,0.264646
1741,2015,2257352.0,0.159097
1742,2016,2645833.0,0.24184
