In [5]:
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

from jre_utils.backtest import Portfolio, Timeline
from jre_utils.datapath import model_ready_data_paths, DATA_DIRECTORY_PATH
from jre_utils.process import get_most_active_municipalities


# Silence every warning for the rest of the session. This is a blanket filter, so it
# also hides pandas/numpy diagnostics (e.g. performance or divide-by-zero warnings).
warnings.filterwarnings("ignore")
# Display all columns of a DataFrame instead of truncating wide frames.
pd.set_option("display.max_columns", None)

In [6]:
# `n_neighbours` selects which neighbours file (`<n_neighbours>.json`) is loaded.
n_neighbours = 5
neighbours_path = f"{DATA_DIRECTORY_PATH}/neighbours/{n_neighbours}.json"

# Parse the mapping, then let the file handle close before inspecting the result.
# Each entry maps an area code to a dict keyed by neighbouring area codes.
with open(neighbours_path) as fd:
    neighbours = json.load(fd)

print(f"E.g. Maps 1100 to {list(neighbours['1100'].keys())}")

E.g. Maps 1100 to ['1203', '1234', '1217', '1423', '1231']


In [7]:
def get_weighted_average_neighbour_prices(
    df, area_code, year, price_column, neighbour_map=None
):
    """Distance-weighted average of `price_column` over an area's neighbours in one year.

    Only neighbours that have a non-null value for `year` take part in the
    average. Neighbours without data are skipped instead of being counted as 0,
    which would drag the result toward zero.

    Args:
        df: DataFrame with "year" and "area_code" columns plus `price_column`.
        area_code: Key of the area whose neighbours are averaged.
        year: Value matched against `df["year"]`.
        price_column: Name of the column to average.
        neighbour_map: Optional mapping `area_code -> {neighbour_code: distance}`.
            Defaults to the notebook-level `neighbours` mapping.

    Returns:
        0 when no neighbour has a value for `year`; otherwise the weighted
        average of the available neighbour values.

    Raises:
        KeyError: If `area_code` is not a key of the neighbour mapping.
    """
    if neighbour_map is None:
        neighbour_map = neighbours

    nd = neighbour_map[area_code]
    filtered_df = df[(df["year"] == year) & (df["area_code"].isin(list(nd)))]
    returns_dict = filtered_df.set_index("area_code")[price_column].dropna().to_dict()

    # Keep (value, distance) pairs only for neighbours that actually have data.
    available = [
        (returns_dict[code], distance)
        for code, distance in nd.items()
        if code in returns_dict
    ]
    if not available:
        return 0

    returns = [value for value, _ in available]
    # Closer neighbours get a larger weight; the 0.01 floor keeps the log finite
    # for zero or tiny distances.
    # NOTE(review): the weight is negative for distance > 1, so a mix of distances
    # on both sides of 1 gives mixed-sign weights -- confirm the distance units.
    weights = [np.log(1 / max(0.01, distance)) for _, distance in available]

    # np.average raises ZeroDivisionError when the weights sum to zero (e.g. a
    # single neighbour at distance exactly 1); use the plain mean in that case.
    if np.isclose(np.sum(weights), 0.0):
        return np.mean(returns)

    return np.average(returns, weights=weights)

In [8]:
# Components of the dataset name that is looked up in the next cell.
asset_type = "combined"
dataset_key = "transactions"
years_ahead = 2

# Short metric keys and the unit-price column each one refers to.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
)

# Columns identifying an area, and the same columns extended with the year.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Active metric and the column names derived from it.
metric_key = "gmean"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"

normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"

In [9]:
# Build the lookup key from the configuration values, then resolve the path of the
# model-ready file registered under that key.
dataset_name_parts = ("sequence", dataset_key, asset_type, metric_key, str(years_ahead))
dataset_name = "_".join(dataset_name_parts)
model_ready_data_path = model_ready_data_paths[dataset_name]


In [10]:
n = 500  # passed as `n` to get_most_active_municipalities further down
num_noise = 1000  # how many "noise_<i>" columns are generated further down

investment_asset_type = "building"

# Load the model-ready table. Area codes are cast to strings so they can be matched
# against the string keys of the neighbours mapping.
core_df = pd.read_csv(model_ready_data_path).astype({"area_code": str})

# Number of distinct area codes in the table.
core_df["area_code"].nunique(dropna=False)

1396

In [127]:
# Random "noise_<i>" columns, plotted later against a real factor.
# They are drawn from a seeded generator so a rerun reproduces the same columns, and
# attached with a single concat: inserting 1000 columns one by one fragments the frame.
noise_columns = [f"noise_{i}" for i in range(num_noise)]
noise_rng = np.random.default_rng(0)
noise_df = pd.DataFrame(
    noise_rng.normal(0, 1, size=(core_df.shape[0], num_noise)),
    index=core_df.index,
    columns=noise_columns,
)
# Drop noise columns left by an earlier run so re-executing this cell is idempotent.
core_df = pd.concat(
    [core_df.drop(columns=noise_columns, errors="ignore"), noise_df], axis=1
)

# Ratio of new dwellings to the `<asset>_count` column. A zero count yields +/-inf,
# which a later fillna(0) would not remove, so it is turned into NaN here.
core_df["new_dwellings_over_count"] = (
    core_df["new_dwellings"] / core_df[f"{investment_asset_type}_count"]
).replace([np.inf, -np.inf], np.nan)

# Weighted average of the neighbours' yearly price growth for each (area_code, year) row.
core_df["neighbour_return"] = core_df.apply(
    lambda x: get_weighted_average_neighbour_prices(
        core_df,
        x["area_code"],
        x["year"],
        f"{investment_asset_type}_yearly_price_growth",
    ),
    axis=1,
)


# Investable universe: area codes returned by get_most_active_municipalities when
# called with the population column and n.
# liquid_areas = get_most_active_municipalities(core_df, count_column=f"{investment_asset_type}_count", n=n)["area_code"].unique()
liquid_areas = get_most_active_municipalities(core_df, count_column="population", n=n)[
    "area_code"
].unique()
# liquid_areas = [area_code for area_code in core_df["area_code"].unique() if area_code // 1000 == 13] # Tokyo

In [128]:
# Replace every remaining NaN with 0. This applies to all columns of the frame, not
# only the growth features. Infinite values are not touched by fillna.
core_df = core_df.fillna(0)

In [129]:
# Growth-rate columns that get a 3-row compounded ("cumu3_") counterpart.
engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    "total_tax_growth",
    "new_dwellings_over_count",
    "neighbour_return",
    f"{investment_asset_type}_count_growth",
    f"{investment_asset_type}_yearly_price_growth",
    "land_yearly_price_growth",
]

engineered_columns = []

# The rolling window follows row order, so rows are ordered by year within each area
# before compounding. Results are written back by index label, so core_df keeps its
# own row order (this relies on core_df having a unique index).
chronological_df = core_df.sort_values(granularity_columns + ["year"])
area_keys = [chronological_df[key] for key in granularity_columns]

for col in engineering_columns:
    cumulative_col = f"cumu3_{col}"
    # Growth rate -> growth multiplier, kept as a local Series instead of a scratch
    # "multiplier" column on core_df.
    multiplier = chronological_df[col] + 1
    # Product of the multipliers over the current and up to two preceding rows of
    # the same area (min_periods=1, so the first rows use a shorter window).
    core_df[cumulative_col] = multiplier.groupby(area_keys).transform(
        lambda x: x.rolling(3, 1).apply(np.prod, raw=True)
    )
    engineered_columns.append(cumulative_col)
    print(cumulative_col)

core_df[engineered_columns] = core_df[engineered_columns].fillna(0)

cumu3_net_migration_ratio
cumu3_taxable_income_growth
cumu3_new_dwellings_ratio
cumu3_total_tax_growth
cumu3_new_dwellings_over_count
cumu3_neighbour_return
cumu3_building_count_growth
cumu3_building_yearly_price_growth
cumu3_land_yearly_price_growth


In [130]:
# Spot-check a few engineered columns for the rows with area code "13101".
inspection_columns = group_by_columns + [
    "cumu3_neighbour_return",
    f"cumu3_{investment_asset_type}_yearly_price_growth",
    "noise_0",
]
core_df.loc[core_df["area_code"] == "13101", inspection_columns]

Unnamed: 0,area,area_code,year,cumu3_neighbour_return,cumu3_building_yearly_price_growth,noise_0
16302,Tokyo-to Chiyoda-ku,13101,2005,1.0,1.0,-0.16444
16303,Tokyo-to Chiyoda-ku,13101,2006,1.158689,1.115103,0.121433
16304,Tokyo-to Chiyoda-ku,13101,2007,1.455563,1.316351,-0.210357
16305,Tokyo-to Chiyoda-ku,13101,2008,1.212487,1.303616,1.352125
16306,Tokyo-to Chiyoda-ku,13101,2009,0.775705,0.886208,-1.006381
16307,Tokyo-to Chiyoda-ku,13101,2010,0.591828,0.806747,1.401919
16308,Tokyo-to Chiyoda-ku,13101,2011,0.706929,0.638402,-0.802281
16309,Tokyo-to Chiyoda-ku,13101,2012,0.934056,0.864205,1.536159
16310,Tokyo-to Chiyoda-ku,13101,2013,1.06831,1.166079,-0.219703
16311,Tokyo-to Chiyoda-ku,13101,2014,1.201032,1.38843,1.110155


In [131]:
# combining_factors
yearly_growth = core_df[f"{investment_asset_type}_yearly_price_growth"]
cumulative_growth = core_df[f"cumu3_{investment_asset_type}_yearly_price_growth"]

# Sign-flipped variants: the largest values correspond to the weakest price growth.
core_df["negative_yearly_returns"] = -yearly_growth
core_df["negative_cumulative_returns"] = -cumulative_growth
# NOTE(review): this is inf wherever the cumulative multiplier is exactly 0,
# because log(0 + 1) == 0.
core_df["inverse_cumulative_returns"] = 1 / np.log(cumulative_growth + 1)

# core_df["combined_factor"] = core_df["cumu3_taxable_income_growth"] + core_df["cumu3_net_migration_ratio"]
core_df["combined_factor"] = (
    core_df["cumu3_taxable_income_growth"]
    + core_df["cumu3_net_migration_ratio"]
    - yearly_growth
)

In [132]:
# Backtest window. The evaluation end year is later than the last investment year.
investment_start_year, investment_end_year = 2010, 2020
investment_eval_end_year = 2022
rebalancing_frequency = 2

# Years on which the backtest loop rebalances: every `rebalancing_frequency` years
# from the start year up to and including the end year.
rebalancing_years = range(investment_start_year, investment_end_year + 1, rebalancing_frequency)

asset_count = 20  # number of areas picked from each end of the factor ranking
shorting_enabled = True  # when False, the backtest passes an empty list as the second rebalance argument

In [133]:
# Registry of backtest timelines, keyed by factor name. Re-running this cell resets it.
timelines = dict()

In [134]:
# Starting state handed to every timeline: 1000 in cash, empty assets and liabilities.
cash = 1000
assets = {}
liabilities = {}
initial_portfolio = Portfolio(assets, liabilities, cash)

# Columns used to rank areas in the backtest; each one gets its own timeline.
asset_prefix = f"cumu3_{investment_asset_type}"
factors = [
    "cumu3_taxable_income_growth",
    "cumu3_net_migration_ratio",
    "cumu3_new_dwellings_ratio",
    "cumu3_total_tax_growth",
    "cumu3_new_dwellings_over_count",
    "cumu3_neighbour_return",
    "negative_yearly_returns",
    "negative_cumulative_returns",
    "inverse_cumulative_returns",
    f"{asset_prefix}_count_growth",
    f"{asset_prefix}_yearly_price_growth",
    "cumu3_land_yearly_price_growth",
    "combined_factor",
]

# Names of the random baseline columns (one per generated noise column).
noise_factors = [f"noise_{i}" for i in range(num_noise)]

In [135]:
# Run one backtest per factor. Noise factors are excluded by default, so `timelines`
# holds no "noise_<i>" entries unless the line below is swapped in.
# for factor in factors + noise_factors:
for factor in factors:
    # NOTE(review): every Timeline receives the same `initial_portfolio` object --
    # confirm that Timeline does not mutate it across factors.
    timeline = Timeline(investment_start_year, initial_portfolio)
    timelines[factor] = timeline

    for year in range(investment_start_year, investment_eval_end_year + 1):
        # This year's rows, indexed by area code.
        current_year_df = core_df[core_df["year"] == year].set_index("area_code")

        timeline.remark(
            year, current_year_df, metric=f"{investment_asset_type}_yearly_price_growth"
        )

        # print(f"{year} NAV: {timelines[factor].get_current_portfolio().nav()}")

        if year not in rebalancing_years:
            continue

        # Rank only the liquid areas; take `asset_count` from each end of the ranking.
        factor_df = current_year_df[current_year_df.index.isin(liquid_areas)]
        top_areas = factor_df.nlargest(asset_count, factor).index
        bottom_areas = factor_df.nsmallest(asset_count, factor).index
        short_areas = bottom_areas if shorting_enabled else []
        timeline.rebalance(top_areas, short_areas)

In [1]:
# Compare one factor's cumulative returns with those of the random noise factors.
factor = "cumu3_neighbour_return"
factor_series = pd.Series(timelines[factor].get_cumulative_returns(), name=factor)

# Only noise factors that were actually backtested have a timeline. Indexing
# `timelines` for every noise name raises KeyError when the backtest loop ran on
# the real factors only, so the missing ones are skipped and reported.
noise_columns = [
    f"noise_{i}" for i in range(num_noise) if f"noise_{i}" in timelines
]
noise_series = [
    pd.Series(timelines[name].get_cumulative_returns(), name=name)
    for name in noise_columns
]
if len(noise_columns) < num_noise:
    print(
        f"Plotting {len(noise_columns)} of {num_noise} noise series; "
        "the others have no timeline (run the backtest loop over noise_factors to add them)."
    )

df = pd.concat([factor_series] + noise_series, axis=1)

# The factor column goes last so its trace is drawn on top of the noise traces.
fig = px.line(df, y=noise_columns + [factor], title="Factor vs Noise")

noise_opacity = 0.1  # Semi-transparent
for trace in fig.data:
    if trace.name == factor:
        # Highlight the factor line.
        trace.update(
            mode="lines+markers",
            line=dict(color="red", width=4),
            marker=dict(size=7, color="red"),
        )
    else:
        # Noise lines are black and less prominent.
        trace.update(line=dict(color="black", width=2, dash="dot"), opacity=noise_opacity)

fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    hovermode="closest",
    showlegend=False,
)
fig.show()

NameError: name 'pd' is not defined

In [133]:
# TODO: plot Tokyo returns

In [137]:
# Headline statistics for the timeline of whichever factor `factor` currently names.
selected_timeline = timelines[factor]
print("Annualized Return: ", selected_timeline.calculate_annualized_return())
print("Sharpe Ratio:", selected_timeline.calculate_sharpe_ratio())

Annualized Return:  0.03460124507516582
Sharpe Ratio: 0.7117898666037186


In [None]:
# average population size for 
# signal * count