In [21]:
import warnings
import json

import pandas as pd
import numpy as np

from jre_utils.datapath import model_ready_data_paths, DATA_DIRECTORY_PATH


from jre_utils.process import get_most_active_municipalities
from jre_utils.backtest import Portfolio, Timeline


# Silence every warning category for the rest of the session, and let pandas
# render all columns of wide frames instead of eliding the middle ones.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

In [22]:
n_neighbours = 5
neighbours_path = f"{DATA_DIRECTORY_PATH}/neighbours/{n_neighbours}.json"

# The JSON file is keyed by area code; each value is itself a dict keyed by
# the codes of that area's neighbours.
with open(neighbours_path) as neighbours_file:
    neighbours = json.load(neighbours_file)

print(f"E.g. Maps 1100 to {list(neighbours['1100'].keys())}")

E.g. Maps 1100 to ['1203', '1234', '1217', '1423', '1231']


In [23]:
def get_weighted_average_neighbour_prices(
    df, area_code, year, price_column, neighbours_map=None
):
    """Distance-weighted average of ``price_column`` over an area's neighbours.

    Args:
        df: Frame with ``year`` and ``area_code`` columns plus ``price_column``.
        area_code: Key into the neighbour map; its neighbours are averaged.
        year: Only rows whose ``year`` equals this value are considered.
        price_column: Column holding the value to average.
        neighbours_map: Optional ``{area_code: {neighbour_code: distance}}``
            mapping. Defaults to the notebook-level ``neighbours`` dict.

    Returns:
        The weighted average, or ``0`` when the area is not in the map, has no
        neighbours listed, or none of its neighbours has a non-null value for
        that year.
    """
    if neighbours_map is None:
        neighbours_map = neighbours

    # An area absent from the map used to raise KeyError; treat it the same
    # way as "no neighbour data available".
    nd = neighbours_map.get(area_code)
    if not nd:
        return 0

    in_scope = (df["year"] == year) & df["area_code"].isin(list(nd))
    returns_dict = (
        df.loc[in_scope].set_index("area_code")[price_column].dropna().to_dict()
    )
    if not returns_dict:
        return 0

    # Neighbours without a value for this year still take part, with a value
    # of 0 and their full weight.
    returns = [returns_dict.get(code, 0) for code in nd]

    # log(1 / d), with d floored at 0.01 so the weight stays finite.
    # NOTE(review): this weight is 0 at d == 1 and negative for d > 1 --
    # confirm the distances in the neighbour file are expected to be < 1.
    weights = [np.log(1 / max(0.01, distance)) for distance in nd.values()]

    # np.average raises ZeroDivisionError when the weights sum to zero (e.g.
    # every distance equals 1); fall back to the unweighted mean instead.
    if np.sum(weights) == 0:
        return float(np.mean(returns))

    return np.average(returns, weights=weights)

In [288]:
# Which model-ready dataset to load.
asset_type, dataset_key, years_ahead = "combined", "transactions", 1

# Short metric key -> name of the price column it stands for.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
    robust="robust_price_index",
    ols="ols_price_index",
)

# One row per (area, area_code, year).
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Column names derived from the selected metric.
metric_key = "robust"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"
log_normalized_metric_pct_chg = f"{metric_pct_chg}_log_normalized_yearly"

In [289]:
dataset_name = f"sequence_{dataset_key}_{asset_type}_{metric_key}_{years_ahead}"
model_ready_data_path = model_ready_data_paths[dataset_name]

n = 500  # passed as `n` to get_most_active_municipalities further down
num_noise = 1000  # number of random placebo factor columns
noise_seed = 0  # fixed so the placebo columns are identical on every run

investment_asset_type = "building"
core_df = pd.read_csv(model_ready_data_path)
core_df["area_code"] = core_df["area_code"].astype(str)

# Draw every noise column in a single call and attach them with one concat.
# Inserting 1000 columns one at a time fragments the frame and is much
# slower; seeding the generator makes the noise backtests reproducible.
rng = np.random.default_rng(noise_seed)
noise_df = pd.DataFrame(
    rng.normal(0, 1, size=(len(core_df), num_noise)),
    index=core_df.index,
    columns=[f"noise_{i}" for i in range(num_noise)],
)
core_df = pd.concat([core_df, noise_df], axis=1)

In [290]:
# New dwellings relative to the <asset type>_count column. A zero count makes
# the division return +/-inf, which the notebook's later fillna(0) does not
# touch, so map those to NaN here and let them be zero-filled like any other
# missing value.
core_df["new_dwellings_over_count"] = (
    core_df["new_dwellings"]
    .div(core_df[f"{investment_asset_type}_count"])
    .replace([np.inf, -np.inf], np.nan)
)

In [291]:
def _neighbour_return_for_row(row):
    """Weighted neighbour price growth for one (area_code, year) row."""
    return get_weighted_average_neighbour_prices(
        core_df,
        row["area_code"],
        row["year"],
        f"{investment_asset_type}_yearly_price_growth",
    )


# Row-wise: every row looks up the same-year growth of its neighbours.
core_df["neighbour_return"] = core_df.apply(_neighbour_return_for_row, axis=1)

In [292]:
# Investable universe: the area codes returned by
# get_most_active_municipalities when ranking on the "population" column.
most_populous = get_most_active_municipalities(
    core_df, count_column="population", n=n
)
liquid_areas = most_populous["area_code"].unique()

# Alternative universes, kept for reference:
#
# liquid_areas = get_most_active_municipalities(
#     core_df, count_column=f"{investment_asset_type}_count", n=n
# )["area_code"].unique()
#
# liquid_areas = get_most_active_municipalities(
#     core_df, count_column=f"{investment_asset_type}_weight", n=n
# )["area_code"].unique()
#
# Tokyo only (area_code is a string here, so convert before dividing):
# liquid_areas = [
#     area_code
#     for area_code in core_df["area_code"].unique()
#     if int(area_code) // 1000 == 13
# ]

In [293]:
# Zero-fill every missing value in every column so the rolling products and
# rankings below never see NaN. For the growth/ratio columns this means a
# missing observation is treated as "no change".
core_df = core_df.fillna(0)

In [294]:
# Columns holding per-year rates; each gets a `cumu3_<name>` companion with
# the compounded product of (1 + rate) over up to the last three rows of the
# same area.
engineering_columns = [
    "net_migration_ratio",
    "taxable_income_growth",
    "new_dwellings_ratio",
    "total_tax_growth",
    "new_dwellings_over_count",
    "neighbour_return",
    f"{investment_asset_type}_yearly_price_growth",
    "land_yearly_price_growth",
]

engineered_columns = []

# Rolling windows are positional, so put the rows in chronological order
# explicitly instead of relying on the order of the CSV. Only the needed
# columns are copied; the results are written back by index label, so the row
# order of core_df itself is left unchanged.
needed_columns = list(
    dict.fromkeys([*granularity_columns, "year", *engineering_columns])
)
rows_by_year = core_df[needed_columns].sort_values("year", kind="stable")
grouped_by_area = rows_by_year.groupby(granularity_columns)

for col in engineering_columns:
    cumu_col = f"cumu3_{col}"
    # window=3, min_periods=1: the first rows of an area compound over fewer
    # than three years. Computed on the Series directly so no scratch
    # "multiplier" column is left behind in core_df.
    core_df[cumu_col] = grouped_by_area[col].transform(
        lambda rates: (rates + 1).rolling(3, 1).apply(np.prod, raw=True)
    )
    engineered_columns.append(cumu_col)
    print(cumu_col)

core_df[engineered_columns] = core_df[engineered_columns].fillna(0)

# Mean-reversion signal: the lower the compounded 3-year growth, the higher
# the value.
# NOTE(review): a cumulative value of exactly 0 gives log(1) == 0 and hence
# inf here -- confirm that cannot occur in the data.
core_df["inverse_cumulative_returns"] = 1 / np.log(
    core_df[f"cumu3_{investment_asset_type}_yearly_price_growth"] + 1
)

cumu3_net_migration_ratio
cumu3_taxable_income_growth
cumu3_new_dwellings_ratio
cumu3_total_tax_growth
cumu3_new_dwellings_over_count
cumu3_neighbour_return
cumu3_building_yearly_price_growth
cumu3_land_yearly_price_growth


In [295]:
# Spot-check the engineered columns for a single area (13101).
inspect_columns = group_by_columns + [
    "cumu3_neighbour_return",
    f"cumu3_{investment_asset_type}_yearly_price_growth",
    "noise_0",
]
core_df.loc[core_df["area_code"] == "13101", inspect_columns]

Unnamed: 0,area,area_code,year,cumu3_neighbour_return,cumu3_building_yearly_price_growth,noise_0
14857,Tokyo-to Chiyoda-ku,13101,2006,1.0,1.0,-0.917009
14858,Tokyo-to Chiyoda-ku,13101,2007,1.237683,1.227178,1.831132
14859,Tokyo-to Chiyoda-ku,13101,2008,1.094486,1.171101,-0.920798
14860,Tokyo-to Chiyoda-ku,13101,2009,0.866876,0.839013,-1.352296
14861,Tokyo-to Chiyoda-ku,13101,2010,0.675134,0.748605,1.209079
14862,Tokyo-to Chiyoda-ku,13101,2011,0.785194,0.684736,1.395812
14863,Tokyo-to Chiyoda-ku,13101,2012,0.959919,0.991951,-1.692838
14864,Tokyo-to Chiyoda-ku,13101,2013,1.098189,1.04159,-0.966687
14865,Tokyo-to Chiyoda-ku,13101,2014,1.176847,1.364919,2.775478
14866,Tokyo-to Chiyoda-ku,13101,2015,1.349368,1.339754,0.981737


In [296]:
# Positions are rebalanced from the start year through the end year; the
# backtest loops keep marking the portfolio up to the evaluation end year.
investment_start_year, investment_end_year = 2010, 2020
investment_eval_end_year = 2022

# Rebalance once per prediction horizon.
rebalancing_frequency = years_ahead
rebalancing_years = range(
    investment_start_year,
    investment_end_year + 1,
    rebalancing_frequency,
)

# Number of areas requested on each side of the book.
asset_count = 20
shorting_enabled = True

In [297]:
# Every timeline starts from an all-cash portfolio.
# NOTE(review): this one Portfolio object is handed to every Timeline below --
# confirm Timeline does not mutate it in place.
cash = 1000
assets = {}
liabilities = {}
initial_portfolio = Portfolio(assets, liabilities, cash)

# Single-column factors, backtested one at a time.
factors = [
    "cumu3_taxable_income_growth",
    "cumu3_net_migration_ratio",
    "cumu3_new_dwellings_ratio",
    "cumu3_total_tax_growth",
    "cumu3_new_dwellings_over_count",
    "cumu3_neighbour_return",
    "inverse_cumulative_returns",
]

# Names of the random placebo columns added to core_df.
noise_factors = list(map("noise_{}".format, range(num_noise)))

In [298]:
timelines = {}

growth_metric = f"{investment_asset_type}_yearly_price_growth"
eval_years = range(investment_start_year, investment_eval_end_year + 1)

# The per-year slices do not depend on the factor, so build them once instead
# of re-filtering and re-indexing the very wide frame for every noise factor.
# NOTE(review): this shares one frame per year across all timelines; it
# assumes Timeline.remark does not modify the frame it is given.
frames_by_year = {
    year: core_df[core_df["year"] == year].set_index("area_code")
    for year in eval_years
}
liquid_frames_by_year = {
    year: frame[frame.index.isin(liquid_areas)]
    for year, frame in frames_by_year.items()
    if year in rebalancing_years
}

for factor in noise_factors:
    timeline = Timeline(investment_start_year, initial_portfolio)
    timelines[factor] = timeline
    for year in eval_years:
        # Mark the portfolio with this year's growth before trading.
        timeline.remark(year, frames_by_year[year], metric=growth_metric)

        if year in rebalancing_years:
            factor_df = liquid_frames_by_year[year]
            top_areas = factor_df.nlargest(asset_count, factor).index
            bottom_areas = factor_df.nsmallest(asset_count, factor).index
            timeline.rebalance(top_areas, bottom_areas if shorting_enabled else [])

In [299]:
growth_metric = f"{investment_asset_type}_yearly_price_growth"
eval_years = range(investment_start_year, investment_eval_end_year + 1)

# The per-year slices do not depend on the factor, so build them once rather
# than once per factor.
# NOTE(review): this shares one frame per year across all timelines; it
# assumes Timeline.remark does not modify the frame it is given.
frames_by_year = {
    year: core_df[core_df["year"] == year].set_index("area_code")
    for year in eval_years
}
liquid_frames_by_year = {
    year: frame[frame.index.isin(liquid_areas)]
    for year, frame in frames_by_year.items()
    if year in rebalancing_years
}

for factor in factors:
    timeline = Timeline(investment_start_year, initial_portfolio)
    timelines[factor] = timeline
    for year in eval_years:
        # Mark the portfolio with this year's growth before trading.
        timeline.remark(year, frames_by_year[year], metric=growth_metric)

        # print(f"{year} NAV: {timeline.get_current_portfolio().nav()}")

        if year in rebalancing_years:
            factor_df = liquid_frames_by_year[year]
            top_areas = factor_df.nlargest(asset_count, factor).index
            bottom_areas = factor_df.nsmallest(asset_count, factor).index
            timeline.rebalance(top_areas, bottom_areas if shorting_enabled else [])

In [300]:
# combining_factors
def simple_linear_factor(factor_df, n_assets=None):
    """Rank areas by income growth plus net migration and pick the extremes.

    Args:
        factor_df: Frame indexed by area with ``cumu3_taxable_income_growth``
            and ``cumu3_net_migration_ratio`` columns. It is not modified.
        n_assets: Number of areas to return per side. Defaults to the
            notebook-level ``asset_count``.

    Returns:
        ``(top_areas, bottom_areas)``: index labels of the highest- and
        lowest-scoring areas, best/worst first.
    """
    if n_assets is None:
        n_assets = asset_count

    # Score on a standalone Series so the caller's frame gets no extra column.
    score = (
        factor_df["cumu3_taxable_income_growth"]
        + factor_df["cumu3_net_migration_ratio"]
    )
    top_areas = score.nlargest(n_assets).index
    bottom_areas = score.nsmallest(n_assets).index
    return top_areas, bottom_areas

def simple_linear_factor_with_mean_reversion(
    factor_df, n_assets=None, price_growth_column=None
):
    """Income growth plus net migration, minus the latest yearly price growth.

    Args:
        factor_df: Frame indexed by area with ``cumu3_taxable_income_growth``,
            ``cumu3_net_migration_ratio`` and the price growth column. It is
            not modified.
        n_assets: Number of areas to return per side. Defaults to the
            notebook-level ``asset_count``.
        price_growth_column: Column subtracted from the score. Defaults to
            ``f"{investment_asset_type}_yearly_price_growth"``.

    Returns:
        ``(top_areas, bottom_areas)``: index labels of the highest- and
        lowest-scoring areas, best/worst first.
    """
    if n_assets is None:
        n_assets = asset_count
    if price_growth_column is None:
        price_growth_column = f"{investment_asset_type}_yearly_price_growth"

    # Score on a standalone Series so the caller's frame gets no extra column.
    score = (
        factor_df["cumu3_taxable_income_growth"]
        + factor_df["cumu3_net_migration_ratio"]
        - factor_df[price_growth_column]
    )
    top_areas = score.nlargest(n_assets).index
    bottom_areas = score.nsmallest(n_assets).index
    return top_areas, bottom_areas

def income_migration_intersection(factor_df, n_assets=None, pool_size=100):
    """Pick areas that are extreme on BOTH income growth and net migration.

    An area qualifies for the long side when it is among the ``pool_size``
    largest values of both signals, and for the short side when it is among
    the ``pool_size`` smallest of both. Qualifying areas are then ranked by
    the sum of the two signals.

    Args:
        factor_df: Frame indexed by area with ``cumu3_taxable_income_growth``
            and ``cumu3_net_migration_ratio`` columns. It is not modified.
        n_assets: Maximum number of areas per side. Defaults to the
            notebook-level ``asset_count``.
        pool_size: Size of each per-signal candidate pool.

    Returns:
        ``(final_top_areas, final_bottom_areas)`` as index labels; either may
        hold fewer than ``n_assets`` entries when the intersection is small.
    """
    if n_assets is None:
        n_assets = asset_count

    income = factor_df["cumu3_taxable_income_growth"]
    migration = factor_df["cumu3_net_migration_ratio"]
    # Kept as a standalone Series so the caller's frame gets no extra column.
    score = income + migration

    top_pool = np.intersect1d(
        income.nlargest(pool_size).index, migration.nlargest(pool_size).index
    )
    bottom_pool = np.intersect1d(
        income.nsmallest(pool_size).index, migration.nsmallest(pool_size).index
    )

    final_top_areas = score[score.index.isin(top_pool)].nlargest(n_assets).index
    final_bottom_areas = (
        score[score.index.isin(bottom_pool)].nsmallest(n_assets).index
    )
    return final_top_areas, final_bottom_areas


def income_migration_intersection_with_mean_reversion(factor_df):
    """Intersect the income and migration pools, then rank by mean reversion.

    Long candidates are the areas among the 100 largest on both
    ``cumu3_taxable_income_growth`` and ``cumu3_net_migration_ratio``; short
    candidates are those among the 100 smallest on both. Within each pool the
    final ``asset_count`` picks are taken by ``inverse_cumulative_returns``
    (largest for longs, smallest for shorts).

    Returns:
        ``(final_top_areas, final_bottom_areas)`` as index labels.
    """
    pool_size = 100
    income_col = "cumu3_taxable_income_growth"
    migration_col = "cumu3_net_migration_ratio"
    ranking_col = "inverse_cumulative_returns"

    strong_on_both = np.intersect1d(
        factor_df.nlargest(pool_size, income_col).index,
        factor_df.nlargest(pool_size, migration_col).index,
    )
    weak_on_both = np.intersect1d(
        factor_df.nsmallest(pool_size, income_col).index,
        factor_df.nsmallest(pool_size, migration_col).index,
    )

    long_candidates = factor_df.loc[factor_df.index.isin(strong_on_both)]
    short_candidates = factor_df.loc[factor_df.index.isin(weak_on_both)]

    final_top_areas = long_candidates.nlargest(asset_count, ranking_col).index
    final_bottom_areas = short_candidates.nsmallest(asset_count, ranking_col).index
    return final_top_areas, final_bottom_areas

def income_migration_dwellings_intersection(factor_df, n_assets=None, pool_size=100):
    """Pick areas that are extreme on income, migration AND new dwellings.

    An area qualifies for the long side when it is among the ``pool_size``
    largest values of all three signals, and for the short side when it is
    among the ``pool_size`` smallest of all three.

    ``np.intersect1d`` returns its result sorted by value, so truncating that
    array directly would keep the areas with the smallest codes rather than
    the strongest ones. Qualifying areas are therefore ranked by the sum of
    the three signals before the cut.

    Args:
        factor_df: Frame indexed by area with ``cumu3_taxable_income_growth``,
            ``cumu3_net_migration_ratio`` and ``cumu3_new_dwellings_ratio``
            columns. It is not modified.
        n_assets: Maximum number of areas per side. Defaults to the
            notebook-level ``asset_count``.
        pool_size: Size of each per-signal candidate pool.

    Returns:
        ``(top_areas, bottom_areas)`` as index labels (like the sibling
        selectors), strongest/weakest first; either may hold fewer than
        ``n_assets`` entries when the intersection is small.
    """
    if n_assets is None:
        n_assets = asset_count

    signal_columns = [
        "cumu3_taxable_income_growth",
        "cumu3_net_migration_ratio",
        "cumu3_new_dwellings_ratio",
    ]

    # Start from every area and narrow the pools one signal at a time.
    top_pool = factor_df.index
    bottom_pool = factor_df.index
    for column in signal_columns:
        top_pool = np.intersect1d(
            top_pool, factor_df[column].nlargest(pool_size).index
        )
        bottom_pool = np.intersect1d(
            bottom_pool, factor_df[column].nsmallest(pool_size).index
        )

    score = factor_df[signal_columns].sum(axis=1)
    top_areas = score[score.index.isin(top_pool)].nlargest(n_assets).index
    bottom_areas = score[score.index.isin(bottom_pool)].nsmallest(n_assets).index
    return top_areas, bottom_areas


# Strategy name -> selector. Each selector takes the per-year frame of
# candidate areas and returns (top_areas, bottom_areas). Every strategy is
# registered under its function's own name.
combined_factor_generator = {
    selector.__name__: selector
    for selector in (
        simple_linear_factor,
        simple_linear_factor_with_mean_reversion,
        income_migration_intersection,
        income_migration_intersection_with_mean_reversion,
        income_migration_dwellings_intersection,
    )
}

In [301]:
growth_metric = f"{investment_asset_type}_yearly_price_growth"
eval_years = range(investment_start_year, investment_eval_end_year + 1)

# The per-year slices do not depend on the strategy, so build them once rather
# than once per strategy.
# NOTE(review): this shares one frame per year across all timelines; it
# assumes Timeline.remark does not modify the frame it is given.
frames_by_year = {
    year: core_df[core_df["year"] == year].set_index("area_code")
    for year in eval_years
}
liquid_frames_by_year = {
    year: frame[frame.index.isin(liquid_areas)]
    for year, frame in frames_by_year.items()
    if year in rebalancing_years
}

for factor, select_areas in combined_factor_generator.items():
    timeline = Timeline(investment_start_year, initial_portfolio)
    timelines[factor] = timeline
    for year in eval_years:
        # Mark the portfolio with this year's growth before trading.
        timeline.remark(year, frames_by_year[year], metric=growth_metric)

        if year in rebalancing_years:
            # Selectors may add columns to the frame, so hand each a copy.
            top_areas, bottom_areas = select_areas(liquid_frames_by_year[year].copy())
            timeline.rebalance(top_areas, bottom_areas if shorting_enabled else [])

In [310]:
# plot_time_series(df, title="Factor vs Noise", ylabel="NAV", xlabel="Year", filename="factor_vs_noise.png")
import plotly.express as px
import matplotlib.pyplot as plt

# Pick the strategy to compare against the noise backtests.
#
# Single factors:
#   cumu3_taxable_income_growth
#   cumu3_new_dwellings_ratio
#   cumu3_net_migration_ratio
#   cumu3_neighbour_return
#   inverse_cumulative_returns
#
# Combined factors:
#   simple_linear_factor
#   simple_linear_factor_with_mean_reversion
#   income_migration_intersection
#   income_migration_intersection_with_mean_reversion
#   income_migration_dwellings_intersection
factor = "income_migration_intersection"


def _cumulative_returns(timeline_name):
    """Cumulative returns of one backtest as a Series named after it."""
    return pd.Series(
        timelines[timeline_name].get_cumulative_returns(), name=timeline_name
    )


factor_series = _cumulative_returns(factor)
noise_series = [_cumulative_returns(f"noise_{i}") for i in range(num_noise)]

# One column per backtest: the chosen factor first, then every noise run.
df = pd.concat([factor_series, *noise_series], axis=1)

# Draw only the noise runs first, as faint dotted black lines.
noise_opacity = 0.1  # Semi-transparent
noise_columns = [name for name in df.columns if name.startswith("noise_")]
fig = px.line(df, y=noise_columns)
fig.update_traces(
    line=dict(color="black", width=2, dash="dot"), opacity=noise_opacity
)

# The factor is added last so that it is drawn on top of the noise.
fig.add_scatter(
    x=df.index,
    y=df[factor],
    mode="lines+markers",
    name=factor,
    line=dict(color="red", width=4),
    marker=dict(size=7, color="red"),
)

fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    hovermode="closest",
    showlegend=False,
)
fig.update_xaxes(title_text="year")
fig.update_yaxes(title_text="return")

fig.show()

In [311]:
# Headline statistics for the strategy selected in the plotting cell.
selected_timeline = timelines[factor]
print(f"Factor: {factor}")
print("Annualized Return: ", selected_timeline.calculate_annualized_return())
print("Sharpe Ratio:", selected_timeline.calculate_sharpe_ratio())

Factor: income_migration_intersection
Annualized Return:  0.041113538171092534
Sharpe Ratio: 0.8814092395308865


In [304]:
# 1

# Factor: simple_linear_factor
# Annualized Return:  0.03544292298060814
# Sharpe Ratio: 0.9343882481207659

# Factor: income_migration_intersection
# Annualized Return:  0.041113538171092534
# Sharpe Ratio: 0.8814092395308865

# Factor: income_migration_intersection_with_mean_reversion
# Annualized Return:  0.07124638873382394
# Sharpe Ratio: 2.2166109118190125

# 2

# Factor: simple_linear_factor
# Annualized Return:  0.04170540679481882
# Sharpe Ratio: 1.4175914027061376

# Factor: income_migration_intersection
# Annualized Return:  0.04141484482939184
# Sharpe Ratio: 1.0268308958101546

# Factor: income_migration_intersection_with_mean_reversion
# Annualized Return:  0.04600799122095989
# Sharpe Ratio: 1.0101727284825328

# 3

# Factor: simple_linear_factor
# Annualized Return:  0.04248122341008487
# Sharpe Ratio: 1.5712579641443858

# Factor: income_migration_intersection
# Annualized Return:  0.04555115927028419
# Sharpe Ratio: 1.1583924508003751

# Factor: income_migration_intersection_with_mean_reversion
# Annualized Return:  0.04703144769648371
# Sharpe Ratio: 1.3424224117353039

# 4

# Factor: simple_linear_factor
# Annualized Return:  0.04680079925544778
# Sharpe Ratio: 1.3874515803026266

# Factor: income_migration_intersection
# Annualized Return:  0.04917687739422405
# Sharpe Ratio: 1.341415415070784

# Factor: income_migration_intersection_with_mean_reversion
# Annualized Return:  0.043243824710690326
# Sharpe Ratio: 1.1912358283580067



