In [8]:
import warnings
import json

import pandas as pd
import numpy as np

from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
    model_ready_data_paths
)
from jre_utils.config import asset_types
from jre_utils.visualize import plot_time_series

from jp_prefecture.jp_cities import jp_cities as jp

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Never truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [9]:
# Load the area-code -> area-name lookup from the core_scraped data directory.
area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
# JSON is UTF-8 by specification, so do not depend on the platform's default encoding.
with open(area_code_to_area_path, encoding="utf-8") as fd:
    area_code_to_area = json.load(fd)

print(f"E.g. Maps 1100 to {area_code_to_area['1100']}")

E.g. Maps 1100 to Hokkaido Sapporo-shi


In [10]:
# help(jp)

In [7]:
# Helpers

def get_geocode(area_code):
    """Return the (latitude, longitude) tuple for an area code.

    Args:
        area_code: Area code as a string or anything `str()` can convert
            (e.g. an int); it is stringified before the lookup.

    Returns:
        The coordinates reported by `jp.citycode2geodetic` as a tuple, or the
        placeholder `(0, 0)` when the lookup fails (a message is printed).
        Callers that compute distances will see every failed area at the same
        point, so two failed areas are at distance 0 from each other.
    """
    area_code = str(area_code)
    try:
        return tuple(jp.citycode2geodetic(area_code))
    # Catch lookup errors only: a bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and make the cell impossible to interrupt.
    except Exception:
        print(f"Could not find geocode for {area_code}")
        return (0, 0)

def get_euclidian_distance(geocode1, geocode2):
    """Return the planar (Euclidean) distance between two coordinate pairs.

    Both arguments are 2-item sequences, e.g. (latitude, longitude). The
    coordinates are treated as plain Cartesian points, so the result is in the
    same units as the inputs.
    """
    (lat_a, lon_a), (lat_b, lon_b) = geocode1, geocode2
    d_lat = lat_a - lat_b
    d_lon = lon_a - lon_b
    return (d_lat**2 + d_lon**2) ** 0.5

def find_neighbours(area_code: str, area_to_geocode, n):
    """Return the codes of the `n` areas closest to `area_code`, nearest first.

    Thin wrapper around `find_neighbours_with_distance` that drops the
    distances, so both functions share a single ranking implementation.

    Raises:
        KeyError: if `area_code` is not a key of `area_to_geocode`.
    """
    return list(find_neighbours_with_distance(area_code, area_to_geocode, n))


def find_neighbours_with_distance(area_code: str, area_to_geocode, n):
    """Return `{neighbour_code: distance}` for the `n` areas closest to `area_code`.

    Args:
        area_code: Code of the area of interest (stringified before lookup).
        area_to_geocode: Mapping of area code -> coordinate pair.
        n: Maximum number of neighbours to return.

    Returns:
        A dict ordered from nearest to farthest. The area itself is never
        included. Distances come from `get_euclidian_distance`.

    Raises:
        KeyError: if `area_code` is not a key of `area_to_geocode`.
    """
    area_code = str(area_code)
    geocode = area_to_geocode[area_code]
    area_to_distances = {
        area: get_euclidian_distance(geocode, area_geocode)
        for area, area_geocode in area_to_geocode.items()
        if area != area_code
    }
    closest = sorted(area_to_distances, key=area_to_distances.get)[:n]
    return {area: area_to_distances[area] for area in closest}

# Sanity check: distance between the geocodes of area codes 13101 and 13102
# (same units as the geocodes); the value is shown as the cell output.
get_euclidian_distance(get_geocode(13101), get_geocode(13102))

0.032463657480348386

In [5]:
# Smoothing parameters

n_neighbors = 5

# Pre-compute, for every known area code, its geocode and its nearest neighbours.
# The neighbour count comes from `n_neighbors` so the parameter above is actually used.
all_area_codes = list(area_code_to_area)
area_to_geocode = {area_code: get_geocode(area_code) for area_code in all_area_codes}
area_to_neighbours = {
    area_code: find_neighbours(area_code, area_to_geocode, n_neighbors)
    for area_code in all_area_codes
}

Could not find geocode for 13362
Could not find geocode for 43506


In [6]:
# Example: look up the 5 closest neighbours of area 13101 from the geocodes,
# together with their distances (nearest first).

# (`area_to_neighbours` holds the same neighbour lists without distances.)
find_neighbours_with_distance("13101", area_to_geocode, 5)

{'13103': 0.00832056392317546,
 '13102': 0.032463657480348386,
 '13104': 0.034864796987784656,
 '13113': 0.04489803650493661,
 '13105': 0.05288099209545874}

In [7]:
"""
for each area_code:
    for each year:
        get n neighbours

        data[area_code][year] = value

        # BASIC KERNEL
        # Can do spatial smoothing for each year, then temporal smoothing for the rest
        # or
        # Can do temporal smoothing for each year, then spatial smoothing for the rest

        # Spatial Smoothing
        0.1 for each neighbour
        0.5 for main area

        # Temporal Smoothing
        0.1 (-2)
        0.2 (-1)
        0.5 (0)
        0.2 (+1)
        0.1 (+2)

        
        '''  area_code, n1, n2, n3
        T-2: 
        T-1:
        T0:
        T1:
        T2:
        '''

"""

"""
kennedy_town = 20 -> 30
SYP = 10 -> 15

kennedy_town_smooth_2 = .75 * 20 + .25 * 10 = 17.5
kennedy_town_smooth_2 = .75 * 30 + .25 * 15 = 26.25

kennedy_return_smooth = .75 * 0.5 + .25 * 0.5 = 0.5

"""

'\nkennedy_town = 20 -> 30\nSYP = 10 -> 15\n\nkennedy_town_smooth_2 = .75 * 20 + .25 * 10 = 17.5\nkennedy_town_smooth_2 = .75 * 30 + .25 * 15 = 26.25\n\nkennedy_return_smooth = .75 * 0.5 + .25 * 0.5 = 0.5\n\n'

In [8]:
# Asset type passed to the main dataset path helper below.
asset_type = "building"

# Short metric name -> name of the DataFrame column that holds it.
metrics = {
    "weighted_mean": "unit_price_wmean",
    "weighted_median": "unit_price_wmedian",
    "mean": "unit_price_mean",
    "median": "unit_price_median",
}

# NOTE(review): get_derived_csv_path / get_derived_lpa_path / get_derived_plps_path
# are not among the names imported from jre_utils.datapath in this notebook's
# import cell — confirm they are in scope on a fresh kernel (Restart & Run All).
dataset_paths = {
    "main": get_derived_csv_path(asset_type),
    "lpa": get_derived_lpa_path(),
    "plps": get_derived_plps_path()
}

In [25]:
# Columns identifying an area, and the per-year grouping built on top of them.
granularity_columns = ["area", "area_code"]
group_by_columns = [*granularity_columns, "year"]
display_columns = ["unit_price", "total_traded_area", "count"]

# Metric column used by default, plus the name of its percentage-change column.
metric = metrics["weighted_median"]
metric_pct_chg = f"{metric}_pct_chg"

In [13]:
# Load the main dataset; area codes are cast to strings after reading.
core_df_path = dataset_paths["main"]
df = pd.read_csv(core_df_path).astype({"area_code": str})

In [68]:
# Maybe do a cap on distance
def get_neighbours(year_df, area_code, n_neighbors=5):
    """Return the nearest neighbours of `area_code` that have rows in `year_df`.

    The `n_neighbors` geographically closest areas are taken from the global
    `area_to_geocode` first and only then filtered, so fewer than
    `n_neighbors` codes may come back.

    Args:
        year_df: DataFrame with an "area_code" column.
        area_code: Area whose neighbours are wanted.
        n_neighbors: Number of closest areas considered before filtering.

    Returns:
        Neighbour area codes, nearest first.
    """
    neighbours = find_neighbours(area_code, area_to_geocode, n_neighbors)
    # Build the membership set once instead of calling `.unique()` per candidate.
    available_areas = set(year_df["area_code"].unique())
    return [neighbour for neighbour in neighbours if neighbour in available_areas]


# Should we smooth on future?
def get_years(area_df, year, y_backward, y_forward):
    """Return the years around `year` that are present in `area_df`.

    Args:
        area_df: DataFrame with a "year" column.
        year: The reference year (never included in the result).
        y_backward: How many years before `year` to consider.
        y_forward: How many years after `year` to consider.

    Returns:
        Candidate years that occur in `area_df["year"]`: the backward years
        first (nearest first), then the forward years (nearest first).
    """
    # Build the membership set once instead of calling `.unique()` per candidate.
    available_years = set(area_df["year"].unique())
    # Ranges are inclusive of the requested span: `range(1, n)` stopped one year
    # short, so a span of 1 selected no years at all.
    candidates = [year - offset for offset in range(1, y_backward + 1)] + [
        year + offset for offset in range(1, y_forward + 1)
    ]
    return [candidate for candidate in candidates if candidate in available_years]


def get_price(df, area_code, year, metric):
    """Return the value of column `metric` for one (area_code, year) row.

    Args:
        df: DataFrame with "area_code" and "year" columns.
        area_code: Area code to match.
        year: Year to match.
        metric: Name of the column to read.

    Returns:
        The value from the first matching row, or None (after printing a
        message) when no row matches or a required column is missing.
    """
    try:
        matches = df.loc[(df["area_code"] == area_code) & (df["year"] == year), metric]
        return matches.values[0]
    # IndexError: no matching row; KeyError: a column is missing. Anything else
    # is a genuine bug and is allowed to propagate instead of being hidden.
    except (KeyError, IndexError):
        print(f"Could not find price for {area_code} in {year}")
        return None


def scale_distance_to_weight(distance, min_distance=1e-9):
    """Convert a distance into an inverse-distance weight.

    Args:
        distance: Non-negative distance between two areas.
        min_distance: Lower bound applied to `distance` before inverting, so
            that coincident points (distance 0, e.g. two areas that share the
            same placeholder geocode) get a very large weight instead of a
            division by zero.

    Returns:
        `1 / distance` for any distance above `min_distance`.
    """
    return 1 / max(distance, min_distance)


def scale_year_to_weight(year_diff):
    """Convert a gap in years into a weight: the reciprocal of the gap.

    Raises ZeroDivisionError for a plain-number gap of 0.
    """
    weight = 1 / year_diff
    return weight


def normalize_weights(weights, scale=1):
    """Rescale `weights` so that they sum to `scale`.

    Args:
        weights: Iterable of numeric weights (any iterable, including a
            generator; it is materialised once).
        scale: Target sum of the returned weights.

    Returns:
        A list with one rescaled weight per input weight; an empty list for
        empty input.

    Raises:
        ZeroDivisionError: if plain-number weights sum to zero.
    """
    weights = list(weights)
    if not weights:
        return []
    # Sum once up front; summing inside the comprehension was quadratic.
    total = sum(weights)
    return [scale * (weight / total) for weight in weights]


def smooth(
    df,
    year,
    area_code,
    metric,
    n_neighbors=5,
    n_years_forward=1,
    n_years_backward=1,
    area_smoothing_factor=0.5,
    year_smoothing_factor=0.5,
    distance_scaler=scale_distance_to_weight,
    year_scaler=scale_year_to_weight,
):
    """Smooth one (area, year) value of `metric` across space, then across time.

    Step 1 (space): the area's own value gets weight `1 - area_smoothing_factor`;
    its neighbours from `get_neighbours` (same year) share
    `area_smoothing_factor` in proportion to `distance_scaler(distance)`.

    Step 2 (time): the spatially smoothed value gets weight
    `1 - year_smoothing_factor`; the area's own raw values in the years
    returned by `get_years` share `year_smoothing_factor` in proportion to
    `year_scaler(|year gap|)`.

    Both steps use `np.average`, which divides by the sum of the weights, so
    the own-value weight still applies when there are no neighbours or no
    other years.

    Args:
        df: DataFrame with "area_code", "year" and `metric` columns.
        year: Year of the observation to smooth.
        area_code: Area of the observation to smooth.
        metric: Column holding the value to smooth.
        n_neighbors: Number of closest areas considered as neighbours.
        n_years_forward: Forward span passed to `get_years`.
        n_years_backward: Backward span passed to `get_years`.
        area_smoothing_factor: Total weight given to the neighbours.
        year_smoothing_factor: Total weight given to the other years.
        distance_scaler: Maps a distance to an unnormalised weight.
        year_scaler: Maps a year gap to an unnormalised weight.

    Returns:
        The spatially and temporally smoothed value.
    """

    def weighted_mean(weight_value_pairs):
        # Split (weight, value) pairs and delegate to np.average.
        pair_weights, pair_values = zip(*weight_value_pairs)
        return np.average(list(pair_values), weights=list(pair_weights))

    # ---- spatial step -------------------------------------------------------
    raw_price = get_price(df, area_code, year, metric)

    same_year_df = df[df["year"] == year]
    nearby_areas = get_neighbours(same_year_df, area_code, n_neighbors)
    nearby_prices = [get_price(df, nearby, year, metric) for nearby in nearby_areas]

    nearby_distances = [
        get_euclidian_distance(area_to_geocode[area_code], area_to_geocode[nearby])
        for nearby in nearby_areas
    ]
    nearby_weights = normalize_weights(
        [distance_scaler(gap) for gap in nearby_distances],
        scale=area_smoothing_factor,
    )

    spatial_pairs = list(zip(nearby_weights, nearby_prices))
    spatial_pairs.append((1 - area_smoothing_factor, raw_price))
    area_smoothed_price = weighted_mean(spatial_pairs)

    # ---- temporal step ------------------------------------------------------
    own_history = df[df["area_code"] == area_code]
    other_years = get_years(own_history, year, n_years_backward, n_years_forward)

    other_year_prices = [
        get_price(df, area_code, other_year, metric) for other_year in other_years
    ]
    year_gaps = [abs(year - other_year) for other_year in other_years]
    other_year_weights = normalize_weights(
        [year_scaler(gap) for gap in year_gaps],
        scale=year_smoothing_factor,
    )

    temporal_pairs = list(zip(other_year_weights, other_year_prices))
    temporal_pairs.append((1 - year_smoothing_factor, area_smoothed_price))
    return weighted_mean(temporal_pairs)

In [35]:
# Example of a sparsely traded area: every year shown has a `count` of only
# 1-2, so the unit price swings widely from year to year.
df[(df["area_code"] == "20602")] # eliminate cases like these

Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,unit_price_wmedian_smoothed
5327,2022,20602,Nagano-ken Sakae-mura,3193.91635,3193.91635,2371.825397,2371.825397,2630.0,2.0,3193.91635
5397,2021,20602,Nagano-ken Sakae-mura,925.925926,925.925926,925.925926,925.925926,540.0,1.0,925.925926
5537,2019,20602,Nagano-ken Sakae-mura,20000.0,20000.0,20000.0,20000.0,300.0,1.0,20000.0
5675,2017,20602,Nagano-ken Sakae-mura,3750.0,3750.0,3750.0,3750.0,80.0,1.0,3750.0
6089,2011,20602,Nagano-ken Sakae-mura,7407.407407,7407.407407,7407.407407,7407.407407,270.0,1.0,7407.407407
6287,2008,20602,Nagano-ken Sakae-mura,5814.43299,5814.43299,6875.675676,6875.675676,485.0,2.0,5814.43299


In [69]:

# Add a "<metric>_smoothed" column for the weighted-median and median metrics.
# year_smoothing_factor=0 gives the other years zero weight, so only the
# spatial (neighbour) part of `smooth` contributes here.
# The column name is built from a plain variable because reusing the same
# quote type inside an f-string is a SyntaxError before Python 3.12.
for smoothed_metric in (metrics["weighted_median"], metrics["median"]):
    df[f"{smoothed_metric}_smoothed"] = df.apply(
        lambda row: smooth(
            df,
            row["year"],
            row["area_code"],
            smoothed_metric,
            n_neighbors=5,
            n_years_forward=1,
            n_years_backward=1,
            area_smoothing_factor=0.5,
            year_smoothing_factor=0,
            distance_scaler=scale_distance_to_weight,
            year_scaler=scale_year_to_weight,
        ),
        axis=1,
    )

In [None]:
# TODO: compute the year-smoothed median column, i.e.
#     df[f"{metrics['median']}_smoothed_year"] = ...
# Kept as a comment: an assignment without a right-hand side is a SyntaxError
# and would stop "Restart & Run All" at this cell.

In [74]:
# Sample 16 distinct areas reproducibly and always append the hand-picked comparables.
areas = df['area_code'].unique()
comparables = [
    '13102',  # Tokyo-to Chuo-ku
    '13103',  # Tokyo-to Minato-ku
    '15461',  # Niigata-ken Yuzawa-Machi
    '20321',  # Nagano-ken Karuisawa-machi
]
# Seeded generator -> same sample on every run; replace=False -> no area is drawn twice.
rng = np.random.default_rng(42)
randomely_selected_areas = list(rng.choice(areas, 16, replace=False)) + comparables
randomely_selected_areas

['5214',
 '6213',
 '1217',
 '23205',
 '35211',
 '12205',
 '41208',
 '40448',
 '28223',
 '11369',
 '23232',
 '43204',
 '27223',
 '1202',
 '13106',
 '27211',
 '13102',
 '13103',
 '15461',
 '20321']

In [196]:
# Restrict the data to a single area and display it.
area_code = "5214"
plot_df = df.loc[df["area_code"] == area_code]
plot_df


Unnamed: 0,year,area_code,area,unit_price_wmean,unit_price_wmedian,unit_price_mean,unit_price_median,total_traded_area,count,unit_price_wmedian_smoothed,unit_price_median_smoothed
19770,2023,5214,Akita-ken Nikaho-shi,13365.384615,12240.317411,13558.152535,12127.659574,1040.0,3.0,11535.689225,14809.834852
19786,2022,5214,Akita-ken Nikaho-shi,12117.982873,8297.101449,17373.741013,12046.130952,5255.0,14.0,17074.505206,27047.731256
19809,2021,5214,Akita-ken Nikaho-shi,39234.116623,28324.573977,44711.868765,28695.652174,5745.0,19.0,22877.110224,25690.950836
19831,2020,5214,Akita-ken Nikaho-shi,31251.172137,15071.479501,45808.046326,13818.181818,7465.0,17.0,14898.440691,18689.913115
19854,2019,5214,Akita-ken Nikaho-shi,18888.654354,14496.348441,28223.203293,14500.0,9475.0,17.0,13176.148622,16113.942373
19876,2018,5214,Akita-ken Nikaho-shi,25333.333333,8984.375,50408.548361,22417.582418,7350.0,16.0,12012.457352,20625.471482
19897,2017,5214,Akita-ken Nikaho-shi,30060.459492,9883.49071,47738.665324,15789.473684,4135.0,9.0,11086.780548,16708.080534
19921,2016,5214,Akita-ken Nikaho-shi,22928.240741,17866.250731,34750.622565,22727.272727,8640.0,17.0,15846.194662,20155.968725
19945,2015,5214,Akita-ken Nikaho-shi,47010.309278,35218.677318,65602.435894,49163.879599,3395.0,14.0,25595.023515,35291.511083
19967,2014,5214,Akita-ken Nikaho-shi,29635.157546,18175.563081,44406.196511,24388.888889,6030.0,16.0,15991.039323,23549.309089


In [197]:
# Plot the raw median metric of `plot_df`.
plot_time_series(
    plot_df,
    metrics["median"],
    group_by_columns,
    granularity_columns,
    "Unit Price over time",
    # visible="legendonly",
)

In [200]:
# Plot the raw weighted-median metric of `plot_df`.
plot_time_series(
    plot_df,
    metrics["weighted_median"],
    group_by_columns,
    granularity_columns,
    "Unit Price over time",
    # visible="legendonly",
)

In [198]:
# Plot the "<median metric>_smoothed" column of `plot_df`.
# Single quotes inside the f-string: reusing double quotes there is a
# SyntaxError on Python versions before 3.12.
plot_time_series(
    plot_df,
    f"{metrics['median']}_smoothed",
    group_by_columns,
    granularity_columns,
    "Unit Price over time",
    # visible="legendonly",
)

In [205]:
# we want a framework
area_code = "5214"
metric = metrics["median"]

# 3-row rolling mean of the metric for one area, ordered by year.
# pandas defaults apply: the window is trailing and the first two rows are NaN.
# The window counts rows, not calendar years.
temporal_smoothed_metric = f"{metric}_smoothed_temporal"
area_df = (
    df[df["area_code"] == area_code]
    .sort_values(by="year", ascending=True)
    .assign(
        **{
            temporal_smoothed_metric: lambda frame: frame[metric]
            .rolling(window=3)
            .mean()
        }
    )
)

In [206]:
# Plot the raw metric of `area_df`.
plot_time_series(
    area_df,
    metric,
    group_by_columns,
    granularity_columns,
    "Unit Price over time",
    # visible="legendonly",
)

In [207]:
# Plot the rolling-mean (temporally smoothed) column of `area_df`.
plot_time_series(
    area_df,
    temporal_smoothed_metric,
    group_by_columns,
    granularity_columns,
    "Unit Price over time",
    # visible="legendonly",
)

In [199]:
# Nikhil
# This would be amazing for our pitches
# Removing fog

In [None]:
# [1, 2, 3, 4, 5, 6, 7, 8, 9]
# [1, 2, 3] = 2
# [2, 3, 4] = 3
# [3, 4, 5] = 4 
# [4, 5, 6] = 5