In [None]:
# What do I want to see with tooling? What is the argument I want to make?

# The model outputs normalized returns for each area for each eval year
# I want to pick out one target year, e.g. 2022
# I want to sort the relative returns for that year
# I want to compare several areas
# I want to see the cumulative factors for each area; this will help me identify trends

# once this is done, write a narrative for the teammates

# target_year = 2022
# prefecture_code = 13 # (Tokyo, 13), (Hokkaido, 1)
# results_df = output_df[output_df["year"] == target_year].sort_values(by=["predicted_normalized_return"], ascending=False)
# prefectural_results_df = results_df[(results_df["area_code"].astype(int) // 1000 == prefecture_code)]
# # results_df
# print(len(prefectural_results_df))
# prefectural_results_df.head(10)
# prefectural_results_df.tail(10)

In [3]:
import warnings

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    factor_data_paths,
    model_ready_data_paths,
    model_output_data_paths,
    get_price_index_path
)

from jre_utils.visualize import plot_time_series
from jre_utils.process import get_cumulative_growth, get_cumulative_growth_from_base


# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Show all columns when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [18]:
# Which asset type / dataset / model run this notebook inspects.
investment_asset_type = "building"
dataset_key = "transactions"
years_ahead = 2
test_year = 2022

# Short metric key -> name of the corresponding price column.
metrics = dict(
    median="unit_price_median",
    gmean="unit_price_gmean",
    robust="robust_price_index",
    ols="ols_price_index",
)

# Metric used throughout the notebook, plus the derived percent-change column name.
metric_key = "gmean"
metric = metrics[metric_key]
metric_pct_chg = f"{metric}_pct_chg"

# One row per (area, area_code, year); the first two columns identify an area.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Alias for the path resolver used when loading the price data; it is called
# with `investment_asset_type` and its result is passed to `pd.read_csv`.
get_data_path = get_price_index_path


In [19]:
# Municipality-level entries of the "processed" factor paths, one per factor table.
processed_factor_paths = factor_data_paths["processed"]
population_path = processed_factor_paths["population"]["municipality"]
migration_path = processed_factor_paths["migration"]["municipality"]
taxable_income_path = processed_factor_paths["taxable_income"]["municipality"]
new_dwellings_path = processed_factor_paths["new_dwellings"]["municipality"]
lfs_revenue_path = processed_factor_paths["lfs_revenue_breakdown"]["municipality"]

# Dataset name built from the configuration above; the model-output entry is
# keyed by that name suffixed with the test year.
dataset_name = (
    f"sequence_{dataset_key}_{investment_asset_type}_{metric_key}_{years_ahead}"
)
output_dataset_name = f"{dataset_name}_{test_year}"
model_output_data_path = model_output_data_paths[output_dataset_name]

In [20]:
# Load the price data for the chosen asset type, order it by area and year,
# keep only the key columns plus the selected metric and its count / weight
# columns, and tag every row with the asset type.
df = (
    pd.read_csv(get_data_path(investment_asset_type))
    .sort_values(by=group_by_columns, ascending=True)
    .loc[:, group_by_columns + [metric, "count", "robust_weights"]]
    .assign(asset_type=investment_asset_type)
)
# output_df = pd.read_csv(model_output_data_path)

In [21]:
# Read each factor table from its CSV.
population_df = pd.read_csv(population_path)
migration_df = pd.read_csv(migration_path)
taxable_income_df = pd.read_csv(taxable_income_path)
new_dwellings_df = pd.read_csv(new_dwellings_path)
lfs_revenue_df = pd.read_csv(lfs_revenue_path)

# Left-join every factor table onto the price frame on (area, area_code, year),
# so rows of `df` without a matching factor row are kept with NaN factors.
for factor_df in (
    population_df,
    migration_df,
    taxable_income_df,
    new_dwellings_df,
    lfs_revenue_df,
):
    df = df.merge(factor_df, on=group_by_columns, how="left")

In [22]:
import plotly.express as px


def plot_time_series(
    df,
    column,
    group_by_columns,
    granularity_columns,
    title,
    visible=None,
    width=1600,
    height=800,
    showlegend=True,
    highlight=False,
    highlight_range=("2020", "2022"),
    color_discrete_map=None,
):
    """Plot one line per group of ``df`` for ``column`` and show the figure.

    NOTE: this definition replaces the ``plot_time_series`` imported from
    ``jre_utils.visualize`` earlier in the notebook.

    Args:
        df: Long-format DataFrame containing ``group_by_columns`` and ``column``.
        column: Name of the column to plot on the y-axis.
        group_by_columns: Columns used as the index before pivoting. The first
            ``len(granularity_columns)`` index levels are unstacked into
            separate lines, so the granularity columns are presumably expected
            to come first, with the remaining column (e.g. year) as the x-axis.
        granularity_columns: Columns identifying one line; only their count is
            used here.
        title: Figure title.
        visible: Forwarded to ``fig.update_traces`` (e.g. ``"legendonly"``).
        width: Figure width passed to ``update_layout``.
        height: Figure height passed to ``update_layout``.
        showlegend: Whether the legend is shown.
        highlight: If true, shade a vertical band over ``highlight_range``.
        highlight_range: ``(x0, x1)`` bounds of the shaded band.
        color_discrete_map: Optional mapping of trace name to line colour.
            Trace names are the string form of each unstacked column label.
            Traces without an entry keep their default colour.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Use a None sentinel rather than a mutable `{}` default, which would be
    # one dict object shared across every call.
    if color_discrete_map is None:
        color_discrete_map = {}

    # Pivot long -> wide: one column per group, indexed by the remaining level(s).
    plot_df = df.set_index(group_by_columns)
    plot_df = plot_df[column].unstack(list(range(len(granularity_columns))))
    # Flatten the column labels to strings so they can serve as trace names.
    plot_df.columns = [f"{a}" for a in plot_df.columns]

    fig = px.line(plot_df, y=plot_df.columns, title=title)

    # Override colours only for the traces explicitly listed in the mapping.
    for trace in fig.data:
        color = color_discrete_map.get(trace.name)
        if color is not None:
            trace.line.color = color

    fig.update_traces(mode="lines+markers", visible=visible)
    fig.update_layout(
        autosize=False,
        width=width,
        height=height,
        hovermode="closest",
        showlegend=showlegend,
    )
    if highlight:
        fig.add_vrect(
            x0=highlight_range[0],
            x1=highlight_range[1],
            fillcolor="LightSalmon",
            opacity=0.5,
            layer="below",
            line_width=0,
        )

    fig.show()

In [23]:
# Areas to compare.
area_codes = [13103, 1100, 12238]

# Rows for the selected areas in year order, with the metric divided by 100.
area_df = (
    df[df["area_code"].isin(area_codes)]
    .sort_values(by="year", ascending=True)
    .assign(**{metric: lambda frame: frame[metric] / 100})
)

# Fixed line colour per area; keys are the trace names, i.e. the string form
# of each (area, area_code) column label.
area_colors = {
    "('Tokyo-to Minato-ku', 13103)": "#EF553B",
    "('Hokkaido Sapporo-shi', 1100)": "#636efa",
    "('Chiba-ken Isumi-shi', 12238)": "#00cc96",
}

# Optional extras: visible="legendonly", highlight=True, highlight_range=(2020, 2022)
plot_time_series(
    area_df,
    metric,
    group_by_columns,
    granularity_columns,
    "Price Index over time",
    width=1200,
    height=600,
    color_discrete_map=area_colors,
)

# Set 13103 to red, 1100 to blue, 12238 to green


In [24]:
# Areas to compare, rebuilt from `df` so the metric is on its original scale.
area_codes = [13103, 1100, 12238]
area_df = df[df["area_code"].isin(area_codes)].sort_values(by="year", ascending=True)

# Plot options shared by every figure in this cell. The colour-map keys are
# the trace names, i.e. the string form of each (area, area_code) column label.
area_colors = {
    "('Tokyo-to Minato-ku', 13103)": "#EF553B",
    "('Hokkaido Sapporo-shi', 1100)": "#636efa",
    "('Chiba-ken Isumi-shi', 12238)": "#00cc96",
}
plot_options = dict(width=1200, height=600, color_discrete_map=area_colors)
# Optional extras: visible="legendonly", highlight=True, highlight_range=(...)

# --- Cumulative growth of the price metric itself -------------------------
price_growth_column = f"cumulative_{metric}_growth"
area_df[price_growth_column] = get_cumulative_growth_from_base(area_df.copy(), metric)

plot_time_series(
    area_df,
    price_growth_column,
    group_by_columns,
    granularity_columns,
    f"{price_growth_column} over time",
    **plot_options,
)

# --- Cumulative growth of the explanatory factors -------------------------
# Factors passed to `get_cumulative_growth`.
growth_factors = [
    "taxable_income_growth",
    "total_tax_growth",
    "net_migration_ratio",
    "new_dwellings_ratio",
]

# Factors passed to `get_cumulative_growth_from_base`.
base_factors = [
    "taxpayer_count",
    "count",
    "population",
    "taxable_income",
    "existing_dwellings",
]

# (helper, source column, output column) in the order the columns are created.
cumulative_specs = [
    (get_cumulative_growth, factor, f"cumulative_{factor}")
    for factor in growth_factors
]
cumulative_specs += [
    (get_cumulative_growth_from_base, factor, f"cumulative_{factor}_growth")
    for factor in base_factors
]
cumulative_factors = [output_column for _, _, output_column in cumulative_specs]

# Replace exact zeros with NaN before accumulating.
for factor_group in (growth_factors, base_factors):
    area_df[factor_group] = area_df[factor_group].replace({0: np.nan})

# Each helper receives a copy of the frame, including the columns added so far.
for compute_cumulative, factor, output_column in cumulative_specs:
    area_df[output_column] = compute_cumulative(area_df.copy(), factor)

# Factor plots only cover years up to and including 2021.
factor_plot_df = area_df[area_df["year"] <= 2021]
for cumulative_factor in cumulative_factors:
    plot_time_series(
        factor_plot_df,
        cumulative_factor,
        group_by_columns,
        granularity_columns,
        f"{cumulative_factor} over time",
        **plot_options,
    )



In [11]:
# use population as the error scaling parameter instead

# autocorrelation over time series
# cross time series lead lag for pairs

# Forward looking smoothing for prediction column
# backward looking smoothing for factor column

# time series of all transactions


In [11]:
# area_df