In [132]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    model_ready_data_paths,
    model_output_data_paths
)

from jre_utils.process import get_most_active_municipalities
from jre_utils.backtest import Portfolio, Timeline

from sklearn.metrics import r2_score

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [133]:
# --- Dataset selection -------------------------------------------------------
# These three values are combined into the dataset name used to look up the
# model-ready data path.
dataset_asset_type = "combined"
dataset_key = "transactions"
years_ahead = 3

# --- Price metric ------------------------------------------------------------
# Short key -> column name of the corresponding price metric.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
    robust="robust_price_index",
    ols="ols_price_index",
)

metric_key = "robust"
metric = metrics[metric_key]

# Column names derived from the chosen metric.
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"
log_normalized_metric_pct_chg = f"{metric_pct_chg}_log_normalized_yearly"

# --- Grouping columns --------------------------------------------------------
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

In [134]:
# Build the lookup key "sequence_<key>_<asset type>_<metric key>_<years ahead>"
# and resolve it to the CSV path of the model-ready dataset.
dataset_name = "_".join(
    ["sequence", dataset_key, dataset_asset_type, metric_key, str(years_ahead)]
)
model_ready_data_path = model_ready_data_paths[dataset_name]


In [135]:
n = 500  # passed as `n` to get_most_active_municipalities when selecting areas
num_noise = 1000  # number of random "noise_<i>" baseline factors to generate
noise_seed = 0  # fixed seed so the noise baselines are identical on every run

investment_asset_type = "building"

core_df = pd.read_csv(model_ready_data_path)

# Draw every noise factor in one (rows x num_noise) standard-normal matrix from
# a seeded generator, then attach all columns with a single concat. Inserting
# 1000 columns one by one fragments the frame (pandas PerformanceWarning) and
# an unseeded draw makes the factor-vs-noise comparison change on every run.
rng = np.random.default_rng(noise_seed)
noise_df = pd.DataFrame(
    rng.standard_normal((len(core_df), num_noise)),
    index=core_df.index,
    columns=[f"noise_{i}" for i in range(num_noise)],
)

# Drop any pre-existing noise_<i> columns first so that, like the original
# column assignment, a same-named column is replaced rather than duplicated.
core_df = pd.concat(
    [core_df.drop(columns=noise_df.columns, errors="ignore"), noise_df],
    axis=1,
)

In [136]:
# Investable universe: area codes returned by get_most_active_municipalities
# when ranking on the "population" column with the configured `n`.
#
# Alternatives that were tried (kept for reference):
#   count_column=f"{investment_asset_type}_count"
#   count_column=f"{investment_asset_type}_weight"
#   Tokyo only:
#     liquid_areas = [area_code for area_code in core_df["area_code"].unique() if area_code // 1000 == 13]
most_active_df = get_most_active_municipalities(
    core_df, count_column="population", n=n
)
liquid_areas = most_active_df["area_code"].unique()

In [137]:
# Replace every missing value in every column with 0 before backtesting.
# This runs after the liquid-area selection, so that selection saw the
# unfilled data.
core_df = core_df.fillna(value=0)

In [138]:
# --- Backtest window ---------------------------------------------------------
investment_start_year = 2010
investment_eval_end_year = 2022  # last year that is marked to market

# Last year in which a rebalance can be scheduled; it sits `years_ahead` years
# before the end of the evaluation window. (Alternative tried: 2019.)
investment_end_year = investment_eval_end_year - years_ahead

# --- Rebalancing schedule ----------------------------------------------------
# Years between rebalances. (Alternative tried: years_ahead.)
rebalancing_frequency = 3
rebalancing_years = range(
    investment_start_year,
    investment_end_year + 1,  # +1 so investment_end_year itself is eligible
    rebalancing_frequency,
)

# --- Portfolio construction --------------------------------------------------
asset_count = 100  # number of areas taken from each end of the factor ranking
shorting_enabled = True  # when False, an empty short list is passed to rebalance

In [139]:
# Starting state: 1000 in cash, no assets and no liabilities.
cash = 1000
assets = {}
liabilities = {}
# NOTE(review): this single Portfolio object (and its two dicts) is handed to
# every Timeline built later in the notebook - confirm Timeline copies it
# rather than mutating it, otherwise state would leak between backtests.
initial_portfolio = Portfolio(assets, liabilities, cash)

# Names of the random baseline columns: "noise_0" ... "noise_<num_noise - 1>".
noise_factors = [f"noise_{noise_idx}" for noise_idx in range(num_noise)]

In [140]:
# Backtest one long/short strategy per random noise factor. These timelines are
# the baseline distribution that the model-driven strategy is compared against.
timelines = {}

growth_metric = f"{investment_asset_type}_yearly_price_growth"
backtest_years = range(investment_start_year, investment_eval_end_year + 1)

# The per-year slice of core_df does not depend on the factor, so build it once
# instead of re-filtering and re-indexing the wide frame for each of the
# (num_noise x years) iterations.
# NOTE(review): the same frame object is now passed to every timeline's
# remark(); this assumes remark() does not modify the frame - confirm.
yearly_frames = {
    year: core_df[core_df["year"] == year].set_index("area_code")
    for year in backtest_years
}

# Only rebalancing years need the liquid-universe subset; it is also
# factor-independent, so it is precomputed as well.
liquid_frames = {
    year: frame[frame.index.isin(liquid_areas)]
    for year, frame in yearly_frames.items()
    if year in rebalancing_years
}

for factor in noise_factors:
    timeline = Timeline(investment_start_year, initial_portfolio)
    timelines[factor] = timeline

    for year in backtest_years:
        # Mark the portfolio with this year's price growth.
        current_year_df = yearly_frames[year]
        timeline.remark(year, current_year_df, metric=growth_metric)

        if year in liquid_frames:
            # Long the areas with the highest factor values and, if shorting is
            # enabled, short the ones with the lowest.
            factor_df = liquid_frames[year]
            top_areas = factor_df.nlargest(asset_count, factor).index
            bottom_areas = factor_df.nsmallest(asset_count, factor).index
            timeline.rebalance(top_areas, bottom_areas if shorting_enabled else [])

In [141]:
# Backtest the strategy driven by the model's predicted returns, using the same
# years, universe and portfolio size as the noise baselines.
factor = "predicted_normalized_return"
timelines[factor] = Timeline(investment_start_year, initial_portfolio)

for year in range(investment_start_year, investment_eval_end_year + 1):
    # Mark the portfolio with this year's price growth.
    current_year_df = core_df[core_df["year"] == year].set_index("area_code")
    timelines[factor].remark(
        year, current_year_df, metric=f"{investment_asset_type}_yearly_price_growth"
    )

    if year not in rebalancing_years:
        continue

    # Model outputs are keyed by dataset name plus (year + years_ahead).
    # NOTE(review): this rebinds `dataset_name` from the data-loading cell, and
    # uses investment_asset_type where that cell used dataset_asset_type.
    dataset_name = (
        f"sequence_{dataset_key}_{investment_asset_type}_{metric_key}_{years_ahead}"
    )
    output_dataset_name = f"{dataset_name}_{year + years_ahead}"
    model_output_data_path = model_output_data_paths[output_dataset_name]

    # Keep only predictions for the liquid universe, indexed by area code.
    pred_df = pd.read_csv(model_output_data_path)
    pred_df = pred_df[pred_df["area_code"].isin(liquid_areas)].set_index("area_code")

    # Long the highest predictions; short the lowest if shorting is enabled.
    top_areas = pred_df.nlargest(asset_count, factor).index
    bottom_areas = pred_df.nsmallest(asset_count, factor).index
    short_areas = bottom_areas if shorting_enabled else []

    timelines[factor].rebalance(top_areas, short_areas)

In [142]:
# plot_time_series(df, title="Factor vs Noise", ylabel="NAV", xlabel="Year", filename="factor_vs_noise.png")
import plotly.express as px
import matplotlib.pyplot as plt

factor = "predicted_normalized_return"

# Collect cumulative returns: the model-driven strategy first, then one series
# per noise baseline, all aligned side by side in a single frame.
factor_series = pd.Series(timelines[factor].get_cumulative_returns(), name=factor)
noise_series = [
    pd.Series(timelines[noise_name].get_cumulative_returns(), name=noise_name)
    for noise_name in (f"noise_{i}" for i in range(num_noise))
]
df = pd.concat([factor_series, *noise_series], axis=1)

# Start the figure with the noise baselines only.
noise_columns = [col for col in df.columns if col.startswith("noise_")]
fig = px.line(df, y=noise_columns)

# At this point the figure holds only noise traces: render them all as faint
# black dotted lines so they read as a background distribution.
noise_opacity = 0.1  # Semi-transparent
fig.update_traces(
    line=dict(color="black", width=2, dash="dot"),
    opacity=noise_opacity,
)

# Add the model-driven strategy last so it is drawn on top of the noise.
fig.add_scatter(
    x=df.index,
    y=df[factor],
    mode="lines+markers",
    name=factor,
    line=dict(color="red", width=4),
    marker=dict(size=7, color="red"),
)

# Fixed-size canvas; no legend, since it would list every noise trace.
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    hovermode="closest",
    showlegend=False,
)
fig.update_xaxes(title_text="year")
fig.update_yaxes(title_text="return")

fig.show()

In [117]:
# Rebalancing ratios for whichever timeline `factor` currently names.
# NOTE(review): `factor` is set by earlier cells, so the result depends on
# which of them ran last - confirm it is the intended strategy.
selected_timeline = timelines[factor]
rebalancing_ratios = selected_timeline.calculate_rebalancing_ratio()


2017 Rebalancing ratio: 0.54


In [116]:
# Headline performance numbers for the timeline `factor` currently names.
report_timeline = timelines[factor]
print("Annualized Return: ", report_timeline.calculate_annualized_return())
print("Sharpe Ratio:", report_timeline.calculate_sharpe_ratio())

Annualized Return:  0.018601857207712236
Sharpe Ratio: 0.5531937294352962
