In [1]:
import warnings
import json

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    factor_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
    DATA_DIRECTORY_PATH
)
from jre_utils.config import asset_types
from jre_utils.visualize import plot_time_series

from jp_prefecture.jp_cities import jp_cities as jp

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
# Mapping of area code (string key) -> human-readable area name, loaded from the scraped core data.
area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path, encoding="utf-8") as fd:
    area_code_to_area = json.load(fd)

# Single quotes inside the replacement field: reusing the outer double quotes is
# only valid syntax from Python 3.12 onwards.
print(f"E.g. Maps 1100 to {area_code_to_area['1100']}")

E.g. Maps 1100 to Hokkaido Sapporo-shi


In [3]:
# help(jp)

In [4]:
# Helpers

def get_geocode(area_code):
    """Look up the coordinates for an area code via ``jp.citycode2geodetic``.

    Args:
        area_code: Area code; converted with ``str()`` before the lookup.

    Returns:
        tuple: The lookup result as a tuple, intended as (latitude, longitude).
        If the lookup raises, a message is printed and the sentinel ``(0, 0)``
        is returned instead.
    """
    area_code = str(area_code)
    try:
        return tuple(jp.citycode2geodetic(area_code))
    except Exception:
        # `except Exception` (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate and a long-running lookup loop can be interrupted.
        print(f"Could not find geocode for {area_code}")
        # NOTE(review): (0, 0) is a sentinel, not a real location; callers that
        # compute distances from it will get meaningless neighbours.
        return (0, 0)

def get_euclidian_distance(geocode1, geocode2):
    """Return the straight-line distance between two 2-component coordinate pairs.

    Both arguments are unpacked as ``(lat, lon)``; the result is expressed in
    the same units as the inputs (no geodesic correction is applied).
    """
    (lat_a, lon_a), (lat_b, lon_b) = geocode1, geocode2
    lat_delta = lat_a - lat_b
    lon_delta = lon_a - lon_b
    squared_distance = lat_delta**2 + lon_delta**2
    return squared_distance**0.5

def find_neighbours(area_code: str, area_to_geocode, n):
    """Return the codes of the ``n`` areas closest to ``area_code``, nearest first.

    Args:
        area_code: Code of the reference area (converted with ``str()``); must be
            a key of ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair.
        n: Maximum number of neighbours to return.

    Returns:
        list: Up to ``n`` area codes ordered by increasing distance; the
        reference area itself is never included.
    """
    origin_code = str(area_code)
    origin = area_to_geocode[origin_code]

    distance_by_area = {}
    for other_code, other_geocode in area_to_geocode.items():
        if other_code == origin_code:
            continue
        distance_by_area[other_code] = get_euclidian_distance(origin, other_geocode)

    ranked = list(distance_by_area)
    ranked.sort(key=distance_by_area.__getitem__)
    return ranked[:n]

def find_neighbours_with_distance(area_code: str, area_to_geocode, n):
    """Return the ``n`` areas closest to ``area_code`` together with their distances.

    Args:
        area_code: Code of the reference area (converted with ``str()``); must be
            a key of ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair.
        n: Maximum number of neighbours to return.

    Returns:
        dict: area code -> distance, inserted in order of increasing distance;
        the reference area itself is never included.
    """
    origin_code = str(area_code)
    origin = area_to_geocode[origin_code]

    distance_by_area = {}
    for other_code, other_geocode in area_to_geocode.items():
        if other_code == origin_code:
            continue
        distance_by_area[other_code] = get_euclidian_distance(origin, other_geocode)

    # sorted() is stable, so ties keep the mapping's iteration order.
    nearest_first = sorted(distance_by_area.items(), key=lambda item: item[1])
    return dict(nearest_first[:n])

# get_euclidian_distance(geocode(13101), geocode(13102))

In [5]:
# Smoothing parameters

n_neighbors = 5  # how many nearest areas are pre-computed for each area below

all_area_codes = list(area_code_to_area.keys())

# area code -> coordinate pair returned by get_geocode.
# NOTE(review): get_geocode prints a message and returns (0, 0) for codes it
# cannot resolve, so those areas end up with meaningless neighbours.
area_to_geocode = {area_code: get_geocode(area_code) for area_code in all_area_codes}

# area code -> its n_neighbors nearest area codes (nearest first).
# Uses the n_neighbors parameter rather than a duplicated literal so the
# setting above actually controls the result.
area_to_neighbours = {
    area_code: find_neighbours(area_code, area_to_geocode, n_neighbors)
    for area_code in all_area_codes
}

Could not find geocode for 13362
Could not find geocode for 43506


In [6]:
# Sanity check on the geocode lookup: show the 5 areas closest to area code
# "13101" together with their distances (as computed by get_euclidian_distance
# on the coordinate pairs stored in area_to_geocode).

# area_to_neighbours
find_neighbours_with_distance("13101", area_to_geocode, 5)

{'13103': 0.00832056392317546,
 '13102': 0.032463657480348386,
 '13104': 0.034864796987784656,
 '13113': 0.04489803650493661,
 '13105': 0.05288099209545874}

In [7]:
"""
for each area_code:
    for each year:
        get n neighbours

        data[area_code][year] = value

        # BASIC KERNEL
        # Can do spatial smoothing for each year, then temporal smoothing for the rest
        # or
        # Can do temporal smoothing for each year, then spatial smoothing for the rest

        # Spatial Smoothing
        0.1 for each neighbour
        0.5 for main area

        # Temporal Smoothing
        0.1 (-2)
        0.2 (-1)
        0.5 (0)
        0.2 (+1)
        0.1 (+2)

        
        '''  area_code, n1, n2, n3
        T-2: 
        T-1:
        T0:
        T1:
        T2:
        '''

"""

"""
kennedy_town = 20 -> 30
SYP = 10 -> 15

kennedy_town_smooth_2 = .75 * 20 + .25 * 10 = 17.5
kennedy_town_smooth_2 = .75 * 30 + .25 * 15 = 26.25

kennedy_return_smooth = .75 * 0.5 + .25 * 0.5 = 0.5

"""

'\nkennedy_town = 20 -> 30\nSYP = 10 -> 15\n\nkennedy_town_smooth_2 = .75 * 20 + .25 * 10 = 17.5\nkennedy_town_smooth_2 = .75 * 30 + .25 * 15 = 26.25\n\nkennedy_return_smooth = .75 * 0.5 + .25 * 0.5 = 0.5\n\n'

In [8]:
# Asset type whose derived dataset is analysed in this notebook.
asset_type = "building"

# Readable metric name -> column name in the derived dataframe.
metrics = dict(
    weighted_mean="unit_price_wmean",
    weighted_median="unit_price_wmedian",
    mean="unit_price_mean",
    median="unit_price_median",
)

# Dataset label -> path of the derived file.
dataset_paths = dict(
    main=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

In [25]:
# Columns identifying an area, and the same columns plus the year.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric column that is smoothed and plotted below, and the name of its
# percent-change companion column.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"

In [13]:
# Load the main derived dataset. Area codes are cast to strings after reading
# because the geocode mapping and the lookups below use string area codes.
core_df_path = dataset_paths["main"]
df = pd.read_csv(core_df_path).astype({"area_code": str})

In [34]:
# Maybe do a cap on distance
def get_neighbours(year_df, area_code, n_neighbors=5):
    """Return the nearest neighbours of ``area_code`` that have data in ``year_df``.

    The ``n_neighbors`` geographically closest areas are looked up first (via
    ``find_neighbours`` on the module-level ``area_to_geocode``) and then
    filtered, so fewer than ``n_neighbors`` codes may be returned.

    Args:
        year_df: DataFrame with an ``area_code`` column, typically one year's rows.
        area_code: Code of the reference area.
        n_neighbors: Number of nearest areas to consider before filtering.

    Returns:
        list: Neighbouring area codes present in ``year_df``, nearest first.
    """
    # Build the membership set once instead of calling .unique() per candidate.
    available_codes = set(year_df["area_code"].unique())
    candidates = find_neighbours(area_code, area_to_geocode, n_neighbors)
    return [candidate for candidate in candidates if candidate in available_codes]


# Should we smooth on future?
def get_years(area_df, year, y_backward, y_forward):
    """Return the years around ``year`` for which ``area_df`` has rows.

    Args:
        area_df: DataFrame with a ``year`` column, typically one area's rows.
        year: Reference year (never included in the result).
        y_backward: Number of preceding years to consider.
        y_forward: Number of following years to consider.

    Returns:
        list: Backward years (closest first) followed by forward years (closest
        first), restricted to years present in ``area_df``.
    """
    # Build the membership set once instead of calling .unique() per candidate.
    available_years = set(area_df["year"].unique())

    # range(1, k + 1) so that a window of k really covers k years; the previous
    # range(1, k) silently dropped the furthest year (and everything for k == 1).
    candidates = [year - offset for offset in range(1, y_backward + 1)] + [
        year + offset for offset in range(1, y_forward + 1)
    ]

    return [candidate for candidate in candidates if candidate in available_years]


def get_price(df, area_code, year, metric):
    """Return the value of ``metric`` for the first row matching area and year.

    Args:
        df: DataFrame with ``area_code`` and ``year`` columns.
        area_code: Area code to match against ``df["area_code"]``.
        year: Year to match against ``df["year"]``.
        metric: Name of the column to read.

    Returns:
        The metric value of the first matching row, or ``None`` (after printing
        a message) when no row matches or a required column is missing.
    """
    try:
        row_mask = (df["area_code"] == area_code) & (df["year"] == year)
        return df.loc[row_mask, metric].values[0]
    except (KeyError, IndexError):
        # KeyError: missing column; IndexError: no matching row. Anything else
        # is unexpected and is allowed to propagate.
        print(f"Could not find price for {area_code} in {year}")
        return None


def scale_distance_to_weight(distance, min_distance=1e-9):
    """Convert a distance into an inverse-distance weight.

    Args:
        distance: Non-negative distance between two areas.
        min_distance: Floor applied to ``distance`` before inverting, so that
            co-located areas (distance 0) yield a very large finite weight
            instead of raising ``ZeroDivisionError``.

    Returns:
        float: ``1 / max(distance, min_distance)``.
    """
    return 1 / max(distance, min_distance)


def scale_year_to_weight(year_diff):
    """Convert a gap in years into a weight: the reciprocal of ``year_diff``."""
    reciprocal = 1 / year_diff
    return reciprocal


def normalize_weights(weights, scale=1):
    """Rescale ``weights`` so that they sum to ``scale``.

    Args:
        weights: Iterable of numeric weights.
        scale: Target sum of the returned weights.

    Returns:
        list: ``scale * w / sum(weights)`` for each weight, in input order. An
        empty input yields an empty list.
    """
    # Materialise once so any iterable works, and compute the total a single
    # time instead of once per element.
    weights = list(weights)
    total = sum(weights)
    return [scale * (weight / total) for weight in weights]


def _blend_with_base(base_value, base_weight, other_values, other_weights):
    """Weighted average of ``other_values`` plus ``base_value``; the base value alone if there is nothing to blend."""
    if len(other_values) == 0:
        # With no contributors the average is just the base value. Returning it
        # directly also avoids np.average raising ZeroDivisionError when
        # base_weight is 0 (smoothing factor of 1).
        return base_value
    return np.average(
        list(other_values) + [base_value],
        weights=list(other_weights) + [base_weight],
    )


def smooth(
    df,
    year,
    area_code,
    metric,
    n_neighbors=5,
    n_years_forward=1,
    n_years_backward=1,
    area_smoothing_factor=0.5,
    year_smoothing_factor=0.5,
    distance_scaler=scale_distance_to_weight,
    year_scaler=scale_year_to_weight,
):
    """Smooth one (area, year) value of ``metric`` spatially, then temporally.

    Spatial step: the area's own price gets weight ``1 - area_smoothing_factor``
    and the neighbours found by ``get_neighbours`` share ``area_smoothing_factor``
    in proportion to ``distance_scaler(distance)``.

    Temporal step: the spatially smoothed price gets weight
    ``1 - year_smoothing_factor`` and the area's unsmoothed prices in the years
    returned by ``get_years`` share ``year_smoothing_factor`` in proportion to
    ``year_scaler(abs(year difference))``.

    Args:
        df: DataFrame with ``area_code``, ``year`` and ``metric`` columns.
        year: Year of the value to smooth.
        area_code: Area of the value to smooth; also used as a key into the
            module-level ``area_to_geocode``.
        metric: Name of the column to smooth.
        n_neighbors: Number of nearest areas considered for the spatial step.
        n_years_forward: Forward window passed to ``get_years``.
        n_years_backward: Backward window passed to ``get_years``.
        area_smoothing_factor: Total weight given to neighbouring areas.
        year_smoothing_factor: Total weight given to neighbouring years.
        distance_scaler: Callable mapping a distance to an unnormalised weight.
        year_scaler: Callable mapping a year gap to an unnormalised weight.

    Returns:
        The smoothed price. When there are neither neighbours nor neighbouring
        years, this is the value returned by ``get_price`` for the area itself.
    """
    # Unsmoothed price of the area itself
    og_price = get_price(df, area_code, year, metric)

    # --- Spatial smoothing -------------------------------------------------
    neighbours = get_neighbours(df[df["year"] == year], area_code, n_neighbors)
    neighbour_prices = [
        get_price(df, neighbour, year, metric) for neighbour in neighbours
    ]
    distances = [
        get_euclidian_distance(area_to_geocode[area_code], area_to_geocode[neighbour])
        for neighbour in neighbours
    ]
    distance_weights = normalize_weights(
        [distance_scaler(distance) for distance in distances],
        scale=area_smoothing_factor,
    )
    area_smoothed_price = _blend_with_base(
        og_price, 1 - area_smoothing_factor, neighbour_prices, distance_weights
    )

    # --- Temporal smoothing ------------------------------------------------
    years = get_years(
        df[df["area_code"] == area_code], year, n_years_backward, n_years_forward
    )
    # `other_year` keeps the comprehension variable distinct from the `year`
    # parameter it is compared against.
    year_prices = [get_price(df, area_code, other_year, metric) for other_year in years]
    year_diffs = [abs(year - other_year) for other_year in years]
    year_diff_weights = normalize_weights(
        [year_scaler(year_diff) for year_diff in year_diffs],
        scale=year_smoothing_factor,
    )
    area_year_smoothed_price = _blend_with_base(
        area_smoothed_price, 1 - year_smoothing_factor, year_prices, year_diff_weights
    )

    return area_year_smoothed_price

In [35]:
# Example of a thinly traded area: the rows shown for this code have only 1-2
# transactions per year and several missing years, so the unit price jumps
# around from one row to the next.
# TODO: eliminate cases like these.
df[(df["area_code"] == "20602")]

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,unit_price_wmedian_smoothed
5327,2022,20602,Nagano-ken Sakae-mura,3193.91635,3193.91635,2371.825397,2371.825397,2630.0,2.0,3193.91635
5397,2021,20602,Nagano-ken Sakae-mura,925.925926,925.925926,925.925926,925.925926,540.0,1.0,925.925926
5537,2019,20602,Nagano-ken Sakae-mura,20000.0,20000.0,20000.0,20000.0,300.0,1.0,20000.0
5675,2017,20602,Nagano-ken Sakae-mura,3750.0,3750.0,3750.0,3750.0,80.0,1.0,3750.0
6089,2011,20602,Nagano-ken Sakae-mura,7407.407407,7407.407407,7407.407407,7407.407407,270.0,1.0,7407.407407
6287,2008,20602,Nagano-ken Sakae-mura,5814.43299,5814.43299,6875.675676,6875.675676,485.0,2.0,5814.43299


In [36]:
# Add a "<metric>_smoothed" column by running smooth() on every (area, year) row.
# year_smoothing_factor=0 gives neighbouring years zero weight, so only the
# spatial blend affects the result here.
df[f"{metric}_smoothed"] = df.apply(
    lambda row: smooth(
        df,
        row["year"],
        row["area_code"],
        metric,
        # Use the notebook-level smoothing parameter instead of repeating the literal.
        n_neighbors=n_neighbors,
        n_years_forward=1,
        n_years_backward=1,
        area_smoothing_factor=0.5,
        year_smoothing_factor=0,
        distance_scaler=scale_distance_to_weight,
        year_scaler=scale_year_to_weight,
    ),
    axis=1,
)

In [53]:
# Area to inspect in the table and plots below.
area_code = "13108"
plot_df = df.loc[df["area_code"] == area_code]

In [54]:
plot_df

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,unit_price_wmedian_smoothed
2306,2023,13108,Tokyo-to Koto-ku,1052062.0,937500.0,1125485.0,937500.0,9700.0,61.0,2603249.0
2362,2022,13108,Tokyo-to Koto-ku,2089601.0,865093.240093,1215150.0,950000.0,27455.0,193.0,1782537.0
2419,2021,13108,Tokyo-to Koto-ku,1181048.0,990000.0,1061225.0,909545.454545,24910.0,196.0,1652762.0
2477,2020,13108,Tokyo-to Koto-ku,1590672.0,961334.573355,1011893.0,820855.614973,21130.0,172.0,1714374.0
2536,2019,13108,Tokyo-to Koto-ku,1213500.0,875000.0,1043429.0,854545.454545,29370.0,203.0,1840463.0
2594,2018,13108,Tokyo-to Koto-ku,923716.6,763095.238095,905452.1,800000.0,23570.0,149.0,1671550.0
2651,2017,13108,Tokyo-to Koto-ku,1111368.0,888888.888889,987716.1,782307.692308,25590.0,160.0,1662666.0
2709,2016,13108,Tokyo-to Koto-ku,1324150.0,855197.634609,920320.8,800000.0,28385.0,196.0,1418183.0
2767,2015,13108,Tokyo-to Koto-ku,870493.3,777222.222222,746373.3,711111.111111,29800.0,177.0,1347239.0
2824,2014,13108,Tokyo-to Koto-ku,721975.1,579941.604812,646160.0,588888.888889,23290.0,161.0,1195392.0


In [55]:
# Raw (unsmoothed) metric for the selected area.
plot_time_series(
    plot_df,
    metric,
    group_by_columns,
    granularity_columns,
    "Unit Price over time",  # plain string: there are no placeholders to format
    # visible="legendonly",
)

In [56]:
# Smoothed metric for the selected area; titled differently from the raw plot
# so the two figures can be told apart.
plot_time_series(
    plot_df,
    f"{metric}_smoothed",
    group_by_columns,
    granularity_columns,
    "Smoothed Unit Price over time",
    # visible="legendonly",
)