In [1]:
import warnings
import math

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import statsmodels.api as sm

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
)
from jre_utils.process import get_most_active_municipalities
from jre_utils.visualize import plot_time_series


# Silence every warning for the rest of the session (applies to all categories).
warnings.filterwarnings("ignore")
# Show all columns when displaying wide frames instead of truncating them.
pd.set_option("display.max_columns", None)

In [2]:
# Asset class whose derived transaction CSV is used as the core dataset.
asset_type = "building"
# Number of periods spanned by the percent-change window computed further down.
years_ahead = 2

# Short metric key -> unit-price column name for the unsmoothed series.
_raw_metric_columns = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

# Full lookup: the raw entries first, followed by their "_smoothed" variants
# (same key and column name, each with a "_smoothed" suffix), in the same order.
metrics = {
    **_raw_metric_columns,
    **{
        f"{key}_smoothed": f"{column}_smoothed"
        for key, column in _raw_metric_columns.items()
    },
}

# Derived-data file locations, keyed by dataset name.
dataset_paths = dict(
    transactions=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

# Columns that identify an area, and an area-year observation (used as join
# and sort keys further down).
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Price metric selected for this run, plus the column names derived from it.
metric_key = "weighted_median_smoothed"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"
upcoming_metric = f"upcoming_{metric}"

In [3]:
# Dataset used as the core table that the factor tables are joined onto.
dataset_key = "transactions"
core_path = dataset_paths[dataset_key]

# Paths of the processed, municipality-level factor tables.
processed_factor_paths = factor_data_paths["processed"]
population_path = processed_factor_paths["population"]["municipality"]
migration_path = processed_factor_paths["migration"]["municipality"]
taxable_income_path = processed_factor_paths["taxable_income"]["municipality"]
new_dwellings_path = processed_factor_paths["new_dwellings"]["municipality"]
lfs_revenue_path = processed_factor_paths["lfs_revenue_breakdown"]["municipality"]


In [4]:
def years_since_crisis(year):
    """Return how many years `year` lies past the start of its period.

    Periods are the half-open intervals ``[start, next_start)`` that begin at
    1960, 1973, 1990, 1997, 2008 and 2019; the last one has no finite upper
    bound. The result is ``year - start`` for the period containing `year`.

    Returns ``None`` when no period contains `year` (for example any year
    before 1960).
    """
    period_starts = (1960, 1973, 1990, 1997, 2008, 2019)
    # Each period ends where the next begins; the final one is open-ended.
    period_ends = period_starts[1:] + (math.inf,)

    for period_start, period_end in zip(period_starts, period_ends):
        if period_start <= year < period_end:
            return year - period_start

    return None
        

In [5]:
# Core table, passed through get_most_active_municipalities with a limit of 1500.
df = get_most_active_municipalities(pd.read_csv(core_path), 1500)

# Factor tables, one per processed source.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join every factor table onto the core table on the area/year keys.
# Core rows without a match keep NaN in that table's columns.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [6]:
# Availability flag column -> source column whose presence it records.
# The flags must be computed before the fillna below erases the missing values.
availability_flags = {
    "migrations_is_available": "net_migration_ratio",
    "taxable_income_is_available": "taxable_income",
    "total_tax_is_available": "total_tax",
    # Might go back and undo the new dwellings filling for unknown municipalities.
    "dwellings_is_available": "new_dwellings",
}
for flag_column, source_column in availability_flags.items():
    # 1 where the source value is present, 0 where it is missing.
    df[flag_column] = df[source_column].notna().astype(int)

# Years elapsed since the start of the period containing each row's year.
# NOTE(review): years_since_crisis starts its latest period at 2019, not 2020 —
# confirm which is intended.
df["years_since_crisis"] = df["year"].apply(years_since_crisis)

# Replace every remaining missing value, in all columns, with 0.
df = df.fillna(0)

In [7]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change(periods=years_ahead)

# time box
# start_year = 2005
# end_year = 2023

# df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [8]:
# Print the dtype and non-null count of every column in the prepared frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25315 entries, 16845 to 14370
Data columns (total 31 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   year                                 25315 non-null  int64  
 1   area_code                            25315 non-null  int64  
 2   area                                 25315 non-null  object 
 3   unit_price_wmean                     25315 non-null  float64
 4   unit_price_wmedian                   25315 non-null  float64
 5   unit_price_mean                      25315 non-null  float64
 6   unit_price_median                    25315 non-null  float64
 7   total_traded_area                    25315 non-null  float64
 8   count                                25315 non-null  float64
 9   unit_price_wmean_smoothed            25315 non-null  float64
 10  unit_price_wmedian_smoothed          25315 non-null  float64
 11  unit_price_mean_smoothed     

In [9]:
# Show the column names currently present in the frame.
df.columns

Index(['year', 'area_code', 'area', 'unit_price_wmean', 'unit_price_wmedian',
       'unit_price_mean', 'unit_price_median', 'total_traded_area', 'count',
       'unit_price_wmean_smoothed', 'unit_price_wmedian_smoothed',
       'unit_price_mean_smoothed', 'unit_price_median_smoothed', 'population',
       'net_migration_ratio', 'taxable_income', 'taxpayer_count',
       'taxable_income_per_taxpayer', 'taxable_income_growth',
       'taxable_income_per_taxpayer_growth', 'new_dwellings',
       'existing_dwellings', 'new_dwellings_ratio', 'total_tax',
       'total_tax_growth', 'migrations_is_available',
       'taxable_income_is_available', 'total_tax_is_available',
       'dwellings_is_available', 'years_since_crisis',
       'unit_price_wmedian_smoothed_pct_chg'],
      dtype='object')

In [10]:
# Columns kept in the exported frame. The order is preserved because it becomes
# the column order of the written CSV.
columns = (
    # percent change and level of the selected metric
    [metric_pct_chg, metric]
    # time
    + ["year", "years_since_crisis"]
    # transaction activity
    + ["count", "total_traded_area"]
    # population and income
    + [
        "population",
        "taxpayer_count",
        "taxable_income",
        "taxable_income_per_taxpayer",
        "taxable_income_growth",
        "taxable_income_per_taxpayer_growth",
    ]
    # local tax
    + ["total_tax", "total_tax_growth"]
    # dwellings and migration
    + [
        "new_dwellings",
        "existing_dwellings",
        "net_migration_ratio",
        "new_dwellings_ratio",
    ]
    # availability flags
    + [
        "migrations_is_available",
        "taxable_income_is_available",
        "dwellings_is_available",
        "total_tax_is_available",
    ]
    # id
    + ["area_code"]
)

# add ratios and growths if necessary

# Keep only the selected columns, then drop rows that still contain a missing
# value in any of them.
df = df[columns].dropna()

In [11]:
print("Initial Size: ", df.shape[0])

# Trim both tails of the percent-change column: keep rows whose value lies
# between the q-th and (1 - q)-th quantiles, bounds included.
q = 0.01
filter_col = metric_pct_chg
lower_bound = df[filter_col].quantile(q)
upper_bound = df[filter_col].quantile(1 - q)
within_bounds = df[filter_col].between(lower_bound, upper_bound)
filtered_df = df[within_bounds].copy()

print("Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  23815
Filtered Size:  23337


Unnamed: 0,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code
count,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0,23337.0
mean,-0.006721,72125.55,2015.39958,4.195312,76.952008,20378.644856,82496.0,35320.44,118613900.0,2543.156033,0.002117,0.001682,12862980.0,0.003644,541.625102,33992.72,-0.003203,0.006551,0.931354,0.87899,0.490166,0.940738,21449.296096
std,0.157404,191798.7,4.62115,3.125035,196.704822,39404.253987,204399.2,92126.47,346153300.0,1081.311494,0.040787,0.030985,40273300.0,0.062035,1845.620085,100765.9,0.007051,0.008211,0.252857,0.326145,0.499914,0.23612,13801.928863
min,-0.440794,1025.209,2006.0,0.0,1.0,65.0,0.0,0.0,0.0,0.0,-0.682739,-0.694693,0.0,-0.677251,0.0,0.0,-0.313065,0.0,0.0,0.0,0.0,0.0,1100.0
25%,-0.098789,13930.34,2011.0,2.0,8.0,3825.0,10952.0,3666.0,9554349.0,2465.198986,-0.009067,-0.006924,1377268.0,-0.01455,0.0,0.0,-0.006728,0.0,1.0,1.0,0.0,1.0,10212.0
50%,-0.013836,27191.43,2015.0,4.0,24.0,9195.0,29690.0,11467.0,31001800.0,2726.746049,0.001134,0.0,3751245.0,0.0,0.0,0.0,-0.002893,0.0,1.0,1.0,0.0,1.0,20521.0
75%,0.070581,67723.6,2019.0,7.0,69.0,21090.0,73150.0,30099.0,90910500.0,3038.636661,0.01797,0.010895,10403140.0,0.017619,390.0,29562.0,0.0,0.012358,1.0,1.0,1.0,1.0,32343.0
max,0.637428,6241887.0,2023.0,10.0,4358.0,666150.0,3811873.0,1906224.0,7965148000.0,12667.02,1.06604,1.026488,867276500.0,4.759148,41746.0,1916062.0,0.043446,0.12027,1.0,1.0,1.0,1.0,47362.0


In [12]:
# Write the filtered frame, without the index, to the model-ready path
# registered for this dataset / metric / horizon combination.
model_ready_key = f"sequence_{dataset_key}_{metric_key}_{years_ahead}"
filtered_df.to_csv(model_ready_data_paths[model_ready_key], index=False)

In [14]:
# Spot-check the exported rows for a single area code.
spot_check_area_code = 13101
filtered_df.loc[filtered_df["area_code"] == spot_check_area_code]

Unnamed: 0,unit_price_wmedian_smoothed_pct_chg,unit_price_wmedian_smoothed,year,years_since_crisis,count,total_traded_area,population,taxpayer_count,taxable_income,taxable_income_per_taxpayer,taxable_income_growth,taxable_income_per_taxpayer_growth,total_tax,total_tax_growth,new_dwellings,existing_dwellings,net_migration_ratio,new_dwellings_ratio,migrations_is_available,taxable_income_is_available,dwellings_is_available,total_tax_is_available,area_code
2011,0.047834,2980588.0,2006,9,99.0,17570.0,49650.0,25376.0,208206784.0,8204.870113,0.09699,0.012044,16032661.0,0.104188,1544.0,26891.0,0.019416,0.057417,1,1,1,1,13101
2012,0.070453,3190579.0,2007,10,76.0,12905.0,50614.0,26658.0,230467736.0,8645.349839,0.106918,0.053685,14785900.0,-0.077764,1010.0,28435.0,0.008594,0.03552,1,1,1,1,13101
2013,-0.01434,3144827.0,2008,0,59.0,9445.0,51049.0,27408.0,246365390.0,8988.81312,0.06898,0.039728,15135827.0,0.023666,455.0,29445.0,0.010735,0.015453,1,1,1,1,13101
2014,-0.074509,2910509.0,2009,1,53.0,8580.0,51597.0,27873.0,224939343.0,8070.151867,-0.086969,-0.102201,14429735.0,-0.04665,595.0,29900.0,0.019982,0.0199,1,1,1,1,13101
2015,-0.073278,2697234.0,2010,2,46.0,10700.0,52628.0,28171.0,217272181.0,7712.618686,-0.034085,-0.044303,13945774.0,-0.033539,796.0,30495.0,0.012579,0.026103,1,1,1,1,13101
2016,-0.126965,2354779.0,2011,3,61.0,13575.0,53290.0,28440.0,233634297.0,8214.989346,0.075307,0.065136,15138072.0,0.085495,1761.0,31291.0,0.013117,0.056278,1,1,1,1,13101
2017,-0.055118,2224987.0,2012,4,57.0,10810.0,53989.0,28761.0,219420257.0,7629.089983,-0.060839,-0.071321,14765097.0,-0.024638,1494.0,33052.0,0.019949,0.045202,1,1,1,1,13101
2018,0.254697,2791684.0,2013,5,63.0,12190.0,55066.0,29730.0,233174299.0,7843.064211,0.062684,0.028047,15419584.0,0.044327,1174.0,34546.0,0.035358,0.033984,1,1,1,1,13101
2019,0.13474,3167834.0,2014,6,68.0,15330.0,57013.0,31214.0,280560533.0,8988.291568,0.203222,0.146018,16662524.0,0.080608,1580.0,35720.0,0.043446,0.044233,1,1,1,1,13101
2020,0.041014,3297759.0,2015,7,70.0,11355.0,59490.0,32993.0,279917309.0,8484.142364,-0.002293,-0.05609,17435488.0,0.046389,1440.0,37300.0,0.023197,0.038606,1,1,1,1,13101
