In [24]:
import warnings
import json

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    factor_data_paths,
    get_derived_csv_path,
    get_derived_lpa_path,
    get_derived_plps_path,
    DATA_DIRECTORY_PATH
)
from jre_utils.config import asset_types
from jre_utils.visualize import plot_time_series

from jp_prefecture.jp_cities import jp_cities as jp

# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so pandas/numpy deprecation and runtime
# warnings are hidden as well — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Do not truncate wide DataFrames when they are displayed.
pd.set_option("display.max_columns", None)

In [2]:
# Load the area-code -> area-name lookup stored under the data directory.
area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open(area_code_to_area_path, encoding="utf-8") as fd:
    area_code_to_area = json.load(fd)

# Single quotes inside the replacement field: reusing the outer double quotes
# within an f-string is only valid syntax on Python 3.12+.
print(f"E.g. Maps 1100 to {area_code_to_area['1100']}")

E.g. Maps 1100 to Hokkaido Sapporo-shi


In [3]:
# help(jp)

In [17]:
# Helpers

def get_geocode(area_code):
    """Look up the coordinates of an area code via ``jp.citycode2geodetic``.

    Args:
        area_code: Area/city code to look up.

    Returns:
        The lookup result converted to a tuple; the rest of this notebook
        treats it as ``(latitude, longitude)``. If the lookup raises, a
        message is printed and the sentinel ``(0, 0)`` is returned instead.
    """
    try:
        return tuple(jp.citycode2geodetic(area_code))
    except Exception:
        # Best-effort lookup: unknown codes must not stop the notebook, but
        # unlike a bare ``except`` this does not swallow KeyboardInterrupt or
        # SystemExit.
        # NOTE(review): (0, 0) is itself a valid coordinate, so areas that get
        # the sentinel still take part in distance calculations and all of
        # them coincide with each other.
        print(f"Could not find geocode for {area_code}")
        return (0, 0)

def get_euclidian_distance(geocode1, geocode2):
    """Return the straight-line distance between two coordinate pairs.

    Each argument is a 2-item sequence (used in this notebook as
    ``(latitude, longitude)``). The result is expressed in the same units as
    the inputs; no great-circle correction is applied.
    """
    (lat_a, lon_a), (lat_b, lon_b) = geocode1, geocode2
    squared_lat_gap = (lat_a - lat_b) ** 2
    squared_lon_gap = (lon_a - lon_b) ** 2
    return (squared_lat_gap + squared_lon_gap) ** 0.5

def find_neighbours(area_code, area_to_geocode, n):
    """Return the codes of the ``n`` areas closest to ``area_code``.

    Args:
        area_code: Key of the reference area in ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair.
        n: Maximum number of neighbours to return.

    Returns:
        Area codes ordered from nearest to farthest; the reference area itself
        is never included. Ties keep the mapping's iteration order.
    """
    origin = area_to_geocode[area_code]

    ranked = []
    for other_code, other_geocode in area_to_geocode.items():
        if other_code == area_code:
            continue
        ranked.append((get_euclidian_distance(origin, other_geocode), other_code))

    # Sort on the distance only so that equal distances stay in mapping order.
    ranked.sort(key=lambda entry: entry[0])
    return [other_code for _, other_code in ranked[:n]]

def find_neighbours_with_distance(area_code, area_to_geocode, n):
    """Return the ``n`` closest areas to ``area_code`` with their distances.

    Args:
        area_code: Key of the reference area in ``area_to_geocode``.
        area_to_geocode: Mapping of area code -> coordinate pair.
        n: Maximum number of neighbours to return.

    Returns:
        Dict of area code -> distance, inserted from nearest to farthest. The
        reference area itself is excluded.
    """
    origin = area_to_geocode[area_code]

    distance_by_area = {}
    for other_code, other_geocode in area_to_geocode.items():
        if other_code != area_code:
            distance_by_area[other_code] = get_euclidian_distance(origin, other_geocode)

    # Stable sort on the distance keeps tied areas in mapping order.
    nearest_first = sorted(distance_by_area.items(), key=lambda item: item[1])
    return dict(nearest_first[:n])

# e.g. get_euclidian_distance(get_geocode("13101"), get_geocode("13102"))

In [18]:
# Smoothing parameters

# Number of nearest areas kept for every area code.
n_neighbors = 5

# Pre-compute coordinates and nearest neighbours for every known area code.
all_area_codes = list(area_code_to_area.keys())
area_to_geocode = {area_code: get_geocode(area_code) for area_code in all_area_codes}
# Use the parameter above instead of a second hard-coded 5 so the two cannot drift apart.
area_to_neighbours = {
    area_code: find_neighbours(area_code, area_to_geocode, n_neighbors)
    for area_code in all_area_codes
}

Could not find geocode for 13362
Could not find geocode for 43506


In [20]:
# Spot check of the neighbour search: show the 5 areas closest to area code
# "13101" together with their distances (same units as the stored coordinates).

# area_to_neighbours  # uncomment to inspect the full area -> neighbours mapping
find_neighbours_with_distance("13101", area_to_geocode, 5)

{'13103': 0.00832056392317546,
 '13102': 0.032463657480348386,
 '13104': 0.034864796987784656,
 '13113': 0.04489803650493661,
 '13105': 0.05288099209545874}

In [None]:
"""
for each area_code:
    for each year:
        get n neighbours

        data[area_code][year] = value

        # BASIC KERNEL
        # Can do spatial smoothing for each year, then temporal smoothing for the rest
        # or
        # Can do temporal smoothing for each year, then spatial smoothing for the rest

        # Spatial Smoothing
        0.1 for each neighbour
        0.5 for main area

        # Temporal Smoothing
        0.1 (-2)
        0.2 (-1)
        0.5 (0)
        0.2 (+1)
        0.1 (+2)

        
        '''  area_code, n1, n2, n3
        T-2: 
        T-1:
        T0:
        T1:
        T2:
        '''

"""

"""
kennedy_town = 20 -> 30
SYP = 10 -> 15

kennedy_town_smooth_1 = .75 * 20 + .25 * 10 = 17.5
kennedy_town_smooth_2 = .75 * 30 + .25 * 15 = 26.25

kennedy_return_smooth = .75 * 0.5 + .25 * 0.5 = 0.5

"""

In [9]:
# Asset type whose derived dataset is loaded below.
asset_type = "building"

# Short metric name -> name of the price column it refers to.
metrics = dict(
    weighted_mean="unit_price_wmean",
    weighted_median="unit_price_wmedian",
    mean="unit_price_mean",
    median="unit_price_median",
)

# Dataset label -> path returned by the jre_utils path helpers.
dataset_paths = dict(
    main=get_derived_csv_path(asset_type),
    lpa=get_derived_lpa_path(),
    plps=get_derived_plps_path(),
)

In [None]:
# Column groups used when slicing the dataset.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Price column to smooth, plus the name of its percentage-change counterpart.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"

In [13]:
core_df_path = dataset_paths["main"]
# Area codes are identifiers, not quantities: read them as strings so they match
# the string keys of area_code_to_area / area_to_geocode (e.g. "13101"). Without
# an explicit dtype pandas infers an integer column for all-digit codes, and the
# dictionary lookups and equality filters used for smoothing would never match.
# NOTE(review): assumes the CSV stores codes in the same form as the JSON keys — confirm.
df = pd.read_csv(core_df_path, dtype={"area_code": str})

In [16]:
def get_years(area_df, year, y_backward, y_forward):
    """Return the years around ``year`` for which ``area_df`` has rows.

    Args:
        area_df: DataFrame with a ``year`` column (rows of a single area).
        year: Centre year; it is never part of the result.
        y_backward: How many years to look back
            (``year - 1`` .. ``year - y_backward``, inclusive).
        y_forward: How many years to look ahead
            (``year + 1`` .. ``year + y_forward``, inclusive).

    Returns:
        Backward years (nearest first) followed by forward years (nearest
        first), restricted to years present in ``area_df["year"]``.
    """
    # Computed once instead of once per candidate year.
    available_years = set(area_df["year"].unique().tolist())

    # ``+ 1`` makes the window inclusive: a look-back of 1 must yield ``year - 1``.
    candidate_years = [year - offset for offset in range(1, y_backward + 1)] + [
        year + offset for offset in range(1, y_forward + 1)
    ]

    return [candidate for candidate in candidate_years if candidate in available_years]


def get_price(df, area_code, year, metric):
    """Return the ``metric`` value for one area in one year.

    Args:
        df: DataFrame with ``area_code`` and ``year`` columns plus the
            ``metric`` column.
        area_code: Value to match against ``df["area_code"]``.
        year: Value to match against ``df["year"]``.
        metric: Name of the column to read.

    Returns:
        The value from the first matching row, or ``np.nan`` when no row
        matches (instead of failing with an opaque ``IndexError``).
    """
    is_match = (df["area_code"] == area_code) & (df["year"] == year)
    matches = df[is_match][metric].values
    if len(matches) == 0:
        # Not every area has data for every year.
        return np.nan
    return matches[0]


def scale_distance_to_weight(distance, min_distance=1e-9):
    """Convert a distance into an inverse-distance weight.

    Args:
        distance: Non-negative distance between two areas.
        min_distance: Floor applied to ``distance`` before inverting, so that
            coincident points (distance 0, e.g. two areas that share the same
            fallback geocode) yield a large finite weight instead of raising
            ``ZeroDivisionError``.

    Returns:
        ``1 / max(distance, min_distance)``.
    """
    return 1 / max(distance, min_distance)


def scale_year_to_weight(year_diff):
    """Convert a gap in years into a weight that shrinks as the gap grows.

    Returns ``1 / year_diff``; a gap of zero raises ``ZeroDivisionError``.
    """
    reciprocal = 1 / year_diff
    return reciprocal


def sum_to_one(weights, scale=1):
    """Normalise ``weights`` so that they sum to ``scale``.

    Args:
        weights: Iterable of numeric weights.
        scale: Target total of the returned weights (1 by default).

    Returns:
        A list with ``weight / sum(weights) * scale`` for every input weight;
        an empty list for empty input.

    Raises:
        ZeroDivisionError: If the weights are non-empty but sum to zero.
    """
    weights = list(weights)
    # Hoisted: the total is the same for every element.
    total = sum(weights)
    # Scale each element. Multiplying the list itself would repeat it for an
    # integer scale and raise TypeError for a float scale.
    return [weight / total * scale for weight in weights]


# Open question: should smoothing also use future years (n_years_forward)?


def _blend_with_anchor(anchor_price, anchor_weight, weighted_prices):
    """Blend an anchor price with a set of other prices.

    The anchor receives ``anchor_weight``; the raw weights in
    ``weighted_prices`` (a list of ``(raw_weight, price)`` pairs) are rescaled
    so that together they receive the remaining ``1 - anchor_weight``. When
    there are no other prices the anchor is returned unchanged.
    """
    if not weighted_prices:
        return anchor_price

    raw_weights = [raw_weight for raw_weight, _ in weighted_prices]
    remaining_share = 1 - anchor_weight
    # sum_to_one normalises the raw weights to 1; the remaining share is then
    # applied to each weight individually.
    weights = [anchor_weight] + [
        weight * remaining_share for weight in sum_to_one(raw_weights)
    ]
    prices = [anchor_price] + [price for _, price in weighted_prices]
    return np.average(prices, weights=weights)


def smooth(
    df,
    year,
    area_code,
    metric,
    n_neighbors=5,
    n_years_forward=1,
    n_years_backward=1,
    area_smoothing_factor=0.5,
    year_smoothing_factor=0.5,
    distance_scaler=scale_distance_to_weight,
    year_scaler=scale_year_to_weight,
):
    """Spatially, then temporally, smooth one area's price for one year.

    Step 1 (space): weighted average of the area's own price (weight
    ``area_smoothing_factor``) and the same-year prices of its nearest
    neighbours, whose ``distance_scaler`` weights are rescaled to share
    ``1 - area_smoothing_factor``.

    Step 2 (time): weighted average of the step-1 result (weight
    ``year_smoothing_factor``) and the area's own raw prices in the
    surrounding years, whose ``year_scaler`` weights are rescaled to share
    ``1 - year_smoothing_factor``.

    Neighbours or years without a price are skipped and the remaining weights
    are renormalised; with nothing to blend a step returns its input.

    Args:
        df: DataFrame with ``area_code``, ``year`` and ``metric`` columns.
        year: Year to smooth.
        area_code: Area to smooth; must be a key of the module-level
            ``area_to_geocode`` mapping.
        metric: Name of the price column.
        n_neighbors: Number of nearest areas considered in step 1.
        n_years_forward: Years after ``year`` considered in step 2.
        n_years_backward: Years before ``year`` considered in step 2.
        area_smoothing_factor: Weight kept by the area's own price in step 1.
        year_smoothing_factor: Weight kept by the step-1 result in step 2.
        distance_scaler: Callable mapping a distance to a raw weight.
        year_scaler: Callable mapping a gap in years to a raw weight.

    Returns:
        The smoothed price, or ``np.nan`` when the area has no price for
        ``year``.

    Raises:
        KeyError: If ``area_code`` is not in ``area_to_geocode``.
    """

    def price_or_none(code, price_year):
        # Treat "no matching row" (IndexError or NaN, depending on how
        # get_price reports it) and missing values uniformly as "no price".
        try:
            price = get_price(df, code, price_year, metric)
        except IndexError:
            return None
        return None if pd.isna(price) else price

    og_price = price_or_none(area_code, year)
    if og_price is None:
        return np.nan

    # Step 1: spatial smoothing across the nearest neighbours in the same year.
    origin = area_to_geocode[area_code]
    neighbour_terms = []
    for neighbour in find_neighbours(area_code, area_to_geocode, n_neighbors):
        neighbour_price = price_or_none(neighbour, year)
        if neighbour_price is None:
            continue
        distance = get_euclidian_distance(origin, area_to_geocode[neighbour])
        neighbour_terms.append((distance_scaler(distance), neighbour_price))

    area_smoothed_price = _blend_with_anchor(
        og_price, area_smoothing_factor, neighbour_terms
    )

    # Step 2: temporal smoothing across the surrounding years of the same area.
    # Lists (not one-shot zip iterators) are used throughout because the
    # weighted terms are read more than once.
    area_df = df[df["area_code"] == area_code]
    year_terms = []
    for other_year in get_years(area_df, year, n_years_backward, n_years_forward):
        other_price = price_or_none(area_code, other_year)
        if other_price is None:
            continue
        year_terms.append((year_scaler(abs(year - other_year)), other_price))

    # Applied exactly once: blending the already-blended value a second time
    # would double-count the neighbouring years.
    return _blend_with_anchor(area_smoothed_price, year_smoothing_factor, year_terms)

In [26]:
# Minimal check of how np.average combines values with explicit weights.
values, weights = [3.5, 1, 2], [0.5, 0.3, 0.2]

smoothed_price = np.average(a=values, weights=weights)
smoothed_price

2.4499999999999997

In [None]:
# Run smooth() once per row and store the result next to the raw metric.
smoothed_values = df.apply(
    lambda row: smooth(df, row["year"], row["area_code"], metric),
    axis=1,
)
df[f"{metric}_smoothed"] = smoothed_values

In [23]:
# Interactive API lookup used while writing smooth(): prints the np.average
# docstring (see the `weights` parameter). It has no effect on the analysis.
help(np.average)

Help on _ArrayFunctionDispatcher in module numpy:

average(a, axis=None, weights=None, returned=False, *, keepdims=<no value>)
    Compute the weighted average along the specified axis.

    Parameters
    ----------
    a : array_like
        Array containing data to be averaged. If `a` is not an array, a
        conversion is attempted.
    axis : None or int or tuple of ints, optional
        Axis or axes along which to average `a`.  The default,
        axis=None, will average over all of the elements of the input array.
        If axis is negative it counts from the last to the first axis.

        .. versionadded:: 1.7.0

        If axis is a tuple of ints, averaging is performed on all of the axes
        specified in the tuple instead of a single axis or all the axes as
        before.
    weights : array_like, optional
        An array of weights associated with the values in `a`. Each value in
        `a` contributes to the average according to its associated weight.
        

In [None]:
# NOTE(review): scratch cell, not executed in this export. The module-level
# `year` is not read by any other cell visible here, and the bare `[]` below is
# a no-op expression.
year = 2013
[]