In [1]:
import json
import os
import warnings

import numpy as np
import pandas as pd

from jp_prefecture.jp_cities import jp_cities as jp

from jre_utils.config import asset_types
from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
    model_ready_data_paths
)
from jre_utils.visualize import plot_time_series

# Notebook-wide settings: silence every warning and let pandas display all
# columns of a DataFrame instead of truncating wide frames.
warnings.filterwarnings(action="ignore")
pd.options.display.max_columns = None

In [11]:
# Helpers
def get_geocode(area_code):
    """Look up the geodetic coordinates of an area code.

    Args:
        area_code: Code passed to ``jp.citycode2geodetic``. It is converted
            with ``str()`` first, so integer codes are accepted.

    Returns:
        The lookup result as a tuple, i.e. ``(latitude, longitude)``, or the
        sentinel ``(0, 0)`` when the lookup raises.
    """
    area_code = str(area_code)
    try:
        return tuple(jp.citycode2geodetic(area_code))
    except Exception:
        # Best-effort lookup: report the miss and fall back to a sentinel.
        # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
        # still propagate and a kernel interrupt is not turned into (0, 0).
        print(f"Could not find geocode for {area_code}")
        return (0, 0)

def get_euclidian_distance(geocode1, geocode2):
    """Return the straight-line distance between two coordinate pairs.

    Each argument must unpack into exactly two numbers (used here as
    latitude and longitude). The distance is computed directly on those raw
    values, so the result is in the same units as the inputs; no geodesic
    correction is applied.
    """
    (lat1, lon1), (lat2, lon2) = geocode1, geocode2
    d_lat = lat1 - lat2
    d_lon = lon1 - lon2
    return (d_lat**2 + d_lon**2) ** 0.5

def find_neighbours(area_code: str, area_to_geocode, n):
    """Return the codes of the ``n`` areas closest to ``area_code``.

    Args:
        area_code: Code of the origin area; converted with ``str()`` before
            being used as a key into ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair. It must
            contain ``area_code`` (a missing key raises ``KeyError``).
        n: Upper bound on the number of neighbours returned.

    Returns:
        A list of area codes ordered from closest to farthest, excluding
        ``area_code`` itself.
    """
    # Delegate to the distance-returning variant so the ranking logic lives in
    # one place. Dicts preserve insertion order, so listing the keys yields
    # the same closest-first ordering that variant produced.
    return list(find_neighbours_with_distance(area_code, area_to_geocode, n))

def find_neighbours_with_distance(area_code: str, area_to_geocode, n):
    """Return the ``n`` areas closest to ``area_code`` with their distances.

    Args:
        area_code: Code of the origin area; converted with ``str()`` before
            being used as a key into ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair. It must
            contain ``area_code`` (a missing key raises ``KeyError``).
        n: Upper bound on the number of neighbours returned.

    Returns:
        A dict of area code -> distance from the origin, inserted from
        closest to farthest and excluding ``area_code`` itself.
    """
    origin_code = str(area_code)
    origin = area_to_geocode[origin_code]

    # Distance from the origin to every other known area.
    distances = {}
    for other_code, other_geocode in area_to_geocode.items():
        if other_code != origin_code:
            distances[other_code] = get_euclidian_distance(origin, other_geocode)

    # Stable sort by distance, then keep the first n (code, distance) pairs.
    ranked = sorted(distances.items(), key=lambda pair: pair[1])
    return dict(ranked[:n])

# Sanity check: distance between the geocodes looked up for area codes 13101
# and 13102 (the cell displays the resulting value).
get_euclidian_distance(get_geocode(13101), get_geocode(13102))

0.032463657480348386

In [12]:
# Which model-ready dataset to load.
asset_type = "combined"
dataset_key = "transactions"
years_ahead = 2

# Metric selection: short key -> full metric name.
metrics = {"median": "unit_price_median", "gmean": "unit_price_gmean"}
metric_key = "gmean"
metric = metrics[metric_key]

# Names derived from the selected metric.
metric_pct_chg = f"{metric}_pct_chg"
normalized_metric_pct_chg = f"{metric_pct_chg}_normalized_yearly"

# Grouping keys: per area, and per area per year.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]

# Resolve the registered path for this dataset name and read it.
dataset_name = f"sequence_{dataset_key}_{asset_type}_{metric_key}_{years_ahead}"
model_ready_data_path = model_ready_data_paths[dataset_name]
df = pd.read_csv(model_ready_data_path)


In [27]:
# Number of nearest areas to keep for each area.
n_neighbours = 5

# Area codes are handled as strings from here on.
all_area_codes = list(df["area_code"].astype(str).unique())
area_to_geocode = {area_code: get_geocode(area_code) for area_code in all_area_codes}
# NOTE(review): areas whose lookup failed carry get_geocode's (0, 0) sentinel
# and still take part in the ranking below -- consider filtering them out.

# Run the all-pairs distance search once ...
area_to_neighbours_with_distance = {
    area_code: find_neighbours_with_distance(area_code, area_to_geocode, n_neighbours)
    for area_code in all_area_codes
}
# ... and derive the plain neighbour lists from it instead of searching again.
# Dicts preserve insertion order, so each list stays ordered closest-first,
# exactly as find_neighbours() would return it.
area_to_neighbours = {
    area_code: list(neighbours)
    for area_code, neighbours in area_to_neighbours_with_distance.items()
}

In [28]:
# Spot-check: the n_neighbours closest areas to area code 13101, together
# with their distances (the cell displays the resulting dict).
find_neighbours_with_distance("13101", area_to_geocode, n_neighbours)

{'13103': 0.00832056392317546,
 '13102': 0.032463657480348386,
 '13104': 0.034864796987784656,
 '13113': 0.04489803650493661,
 '13105': 0.05288099209545874}

In [32]:
# Persist the neighbour/distance map as JSON, one file per neighbour count.
neighbours_path = f"{DATA_DIRECTORY_PATH}/neighbours/{n_neighbours}.json"
# open(..., "w") does not create missing parent directories, so make sure the
# target folder exists (a fresh checkout may not have it yet).
os.makedirs(os.path.dirname(neighbours_path), exist_ok=True)
with open(neighbours_path, "w", encoding="utf-8") as fd:
    json.dump(area_to_neighbours_with_distance, fd)