In [1]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from jre_utils.datapath import factor_data_paths, get_derived_csv_path, model_ready_data_paths
from jre_utils.config import asset_types, statistics, area_levels, period_cols
from jre_utils.process import get_most_active_municipalities

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [2]:
period = "yearly"
area_level = "municipality"
asset_type = "building"
statistic = "median"

granularity_columns = area_levels[area_level]["columns"]
group_by_columns = granularity_columns + [period_cols[period]]
display_columns = list(statistics[statistic].keys())

metric = asset_types[asset_type]["metric"]
metric_pct_chg = asset_types[asset_type]["metric_pct_chg"]
upcoming_metric_pct_chg = "Upcoming" + metric_pct_chg

core_path = get_derived_csv_path(period, area_level, asset_type, statistic)
population_path = factor_data_paths["processed"]["population"]["municipality"]
migration_path = factor_data_paths["processed"]["migration"]["municipality"]
taxable_income_path = factor_data_paths["processed"]["taxable_income"]["municipality"]
new_dwellings_path = factor_data_paths["processed"]["new_dwellings"]["municipality"]

In [3]:
df = pd.read_csv(core_path)

population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)

df = (
    df.merge(population_df, on=group_by_columns, how="left")
    .merge(migration_df, on=group_by_columns, how="left")
    .merge(taxable_income_df, on=group_by_columns, how="left")
    .merge(new_dwellings_df, on=group_by_columns, how="left")
)

In [4]:
# prepare metrics
df = df.sort_values(by=group_by_columns, ascending=True)
df[metric_pct_chg] = df.groupby(granularity_columns)[metric].pct_change()

# set up target variables
df[upcoming_metric_pct_chg] = df.groupby(granularity_columns)[metric_pct_chg].shift(-1)

# time box
start_year = 2005
end_year = 2023

df = df[(df["year"] >= start_year) & (df["year"] <= end_year)]

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30339 entries, 19363 to 17393
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Prefecture                          30339 non-null  object 
 1   Municipality                        30339 non-null  object 
 2   year                                30339 non-null  int64  
 3   TradePricePerArea                   30339 non-null  float64
 4   UnitPrice                           0 non-null      float64
 5   Count                               30339 non-null  int64  
 6   population                          24235 non-null  float64
 7   net_migration_ratio                 24235 non-null  float64
 8   taxable_income                      23202 non-null  float64
 9   taxpayer_count                      23202 non-null  float64
 10  taxable_income_per_taxpayer         23202 non-null  float64
 11  taxable_income_growth               23202 

In [6]:
# most frequent municipalities
# skip this and build custom cost function using count as parameter. The lower the count, the lower the cost.

# df = get_most_active_municipalities(df, n=800)
# df = df[df["Count"] >= 10]

In [7]:
y_columns = [upcoming_metric_pct_chg]

X_categorical_columns = ["Prefecture", "Municipality"]

X_basic_columns = [
    "year",
    "population",
    "Count",
    "taxable_income_per_taxpayer",
    "taxable_income_per_taxpayer_growth",
]

X_engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    metric_pct_chg,
]

initial_columns = (
    X_categorical_columns + X_basic_columns + X_engineering_columns + y_columns
)

df = df[initial_columns]
df[X_categorical_columns] = df[X_categorical_columns].astype("category")
df = df.dropna()

In [8]:
lag = 1

X_engineered_columns = []

for col in X_engineering_columns:
    df["multiplier"] = df[col] + 1
    df[f"{col}_ma3"] = df.groupby(granularity_columns)[col].transform(
        lambda x: x.rolling(3, 1).mean()
    )
    df[f"{col}_cumu3"] = df.groupby(granularity_columns)["multiplier"].transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )

    X_engineered_columns.append(f"{col}_ma3")
    X_engineered_columns.append(f"{col}_cumu3")
    print(f"{col}_ma3")
    print(f"{col}_cumu3")

    for i in range(1, lag + 1):
        df[f"{col}_lag{i}"] = df.groupby(granularity_columns)[col].shift(i)
        X_engineered_columns.append(f"{col}_lag{i}")
        print(f"{col}_lag{i}")

df[X_engineered_columns] = df[X_engineered_columns].fillna(0)

net_migration_ratio_ma3
net_migration_ratio_cumu3
net_migration_ratio_lag1
taxable_income_growth_ma3
taxable_income_growth_cumu3
taxable_income_growth_lag1
new_dwellings_ratio_ma3
new_dwellings_ratio_cumu3
new_dwellings_ratio_lag1
TradePricePctChg_ma3
TradePricePctChg_cumu3
TradePricePctChg_lag1


In [12]:
X_engineered_columns

['net_migration_ratio_ma3',
 'net_migration_ratio_cumu3',
 'net_migration_ratio_lag1',
 'taxable_income_growth_ma3',
 'taxable_income_growth_cumu3',
 'taxable_income_growth_lag1',
 'new_dwellings_ratio_ma3',
 'new_dwellings_ratio_cumu3',
 'new_dwellings_ratio_lag1',
 'TradePricePctChg_ma3',
 'TradePricePctChg_cumu3',
 'TradePricePctChg_lag1']

In [1]:
df

NameError: name 'df' is not defined

In [10]:
print(f"Initial Size: ", df.shape[0])
q = 0.01
filtered_df = df.copy()
filter_col = y_columns[0]
filtered_df = filtered_df[
    (filtered_df[filter_col] >= filtered_df[filter_col].quantile(q))
    & (filtered_df[filter_col] <= filtered_df[filter_col].quantile(1 - q))
]
print(f"Filtered Size: ", filtered_df.shape[0])
filtered_df.describe()

Initial Size:  10701
Filtered Size:  10487


Unnamed: 0,year,population,Count,taxable_income_per_taxpayer,taxable_income_per_taxpayer_growth,net_migration_ratio,taxable_income_growth,new_dwellings_ratio,TradePricePctChg,UpcomingTradePricePctChg,multiplier,net_migration_ratio_ma3,net_migration_ratio_cumu3,net_migration_ratio_lag1,taxable_income_growth_ma3,taxable_income_growth_cumu3,taxable_income_growth_lag1,new_dwellings_ratio_ma3,new_dwellings_ratio_cumu3,new_dwellings_ratio_lag1,TradePricePctChg_ma3,TradePricePctChg_cumu3,TradePricePctChg_lag1
count,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0,10487.0
mean,2014.462001,118085.3,103.806618,3022.698681,0.001225,-0.002433,0.003387,0.013322,0.036578,0.025494,1.036578,-0.002612,0.993211,-0.002299,0.001617,1.006916,0.001669,0.013696,1.038157,0.012513,0.038656,1.025911,0.035848
std,4.075304,137623.2,118.645472,668.935295,0.022551,0.005506,0.033416,0.006797,0.473326,0.287359,0.473326,0.005271,0.014421,0.005398,0.024317,0.065955,0.032262,0.006509,0.019467,0.007483,0.418574,0.665836,0.468031
min,2006.0,2961.0,1.0,2053.240341,-0.294513,-0.058973,-0.398456,0.0,-0.886667,-0.614716,0.113333,-0.027463,0.920712,-0.058973,-0.201696,0.483597,-0.398456,0.000762,1.00098,0.0,-0.886667,0.02933,-0.973874
25%,2011.0,41394.5,29.0,2647.120051,-0.007598,-0.005702,-0.008249,0.00886,-0.138136,-0.134442,0.861864,-0.005823,0.984274,-0.005501,-0.012679,0.968971,-0.008042,0.009287,1.024581,0.00794,-0.049417,0.81727,-0.125091
50%,2015.0,68603.0,62.0,2893.738632,0.002385,-0.002634,0.006771,0.012416,-0.005405,-0.003605,0.994595,-0.00288,0.992599,-0.002177,0.005167,1.013121,0.003411,0.012912,1.036014,0.012021,0.008819,0.973214,0.0
75%,2018.0,133438.5,126.0,3218.537565,0.011454,0.000399,0.019931,0.016489,0.128411,0.128733,1.128411,6.4e-05,1.000151,7.7e-05,0.016362,1.047582,0.017098,0.016897,1.048347,0.016294,0.076782,1.133027,0.111764
max,2021.0,1540632.0,886.0,12667.02,0.409227,0.043446,0.440856,0.12027,31.333333,1.45925,32.333333,0.038018,1.105401,0.043446,0.221486,1.809814,0.440856,0.091613,1.299733,0.095865,31.333333,32.666667,31.333333


In [11]:
filtered_df.to_csv(model_ready_data_paths["xgb"], index=False)

In [39]:
test_prefecture, test_municipality = "Tokyo", "Minato"
test_df = filtered_df[(filtered_df["Prefecture"] == test_prefecture) & (filtered_df["Municipality"] == test_municipality)]
test_df[["year", metric_pct_chg, upcoming_metric_pct_chg]]

Unnamed: 0,year,TradePricePctChg,UpcomingTradePricePctChg
3076,2006,0.467742,0.027027
3077,2007,0.027027,0.010341
3078,2008,0.010341,-0.413008
3079,2009,-0.413008,-0.071642
3080,2010,-0.071642,0.052466
3081,2011,0.052466,-0.041267
3082,2012,-0.041267,0.09541
3083,2013,0.09541,0.053292
3084,2014,0.053292,0.16
3085,2015,0.16,0.115481


In [2]:
test_year = 2021
test_prefecture = "Tokyo"
presentation_df = df[(df["Prefecture"] == test_prefecture) & (df["year"] == test_year)]
presentation_df = presentation_df.sort_index(axis=1)
presentation_df["upcoming_return"] = presentation_df["UpcomingTradePricePctChg"]
presentation_df = presentation_df.sort_values(by=upcoming_metric_pct_chg, ascending=False).reset_index(drop=True)

drop_columns = ["UpcomingTradePricePctChg", "year", "Prefecture"]
presentation_df.drop(columns=drop_columns).style.background_gradient(cmap="cividis")

NameError: name 'df' is not defined