In [49]:
import os
import warnings

import pandas as pd
import numpy as np


from jre_utils.datapath import (
    DATA_DIRECTORY_PATH,
)
from jp_prefecture.jp_cities import JpCity, jp_cities
from geopy.geocoders import Nominatim
from tqdm import tqdm

import statsmodels.api as sm

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation warnings raised by the
# cells below - consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [54]:
import json
from pprint import pprint

# Lookup table: sub-city code -> city code (e.g. '1101' -> '1100').
sub_city_to_city_path = f"{DATA_DIRECTORY_PATH}/core_scraped/sub_city_to_city.json"
# Read as UTF-8 explicitly: JSON is UTF-8 by specification, whereas the
# platform default encoding used by open() is not guaranteed to be.
with open(sub_city_to_city_path, encoding="utf-8") as fd:
    sub_city_to_city = json.load(fd)
pprint(f"E.g. Maps 1101 to {sub_city_to_city['1101']}")

# Lookup table: area code -> area name (e.g. '1100' -> 'Hokkaido Sapporo-shi').
area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path, encoding="utf-8") as fd:
    area_code_to_area = json.load(fd)
pprint(f"E.g. Maps 1100 to {area_code_to_area['1100']}")

def get_city_code(area_code):
    """Return the city code that `area_code` maps to.

    The code is looked up in the module-level `sub_city_to_city` mapping;
    codes that are not keys of that mapping are returned unchanged.
    """
    if area_code in sub_city_to_city:
        return sub_city_to_city[area_code]
    return area_code

def get_area_from_area_code(area_code):
    """Return the area name stored for `area_code`.

    The name is read from the module-level `area_code_to_area` mapping; the
    placeholder string "na" is returned for codes that are not in it.
    """
    if area_code in area_code_to_area:
        return area_code_to_area[area_code]
    return "na"

def get_city_geocode(area_code):
    """Best-effort lookup of a city's coordinates from its city code.

    Args:
        area_code: City code; converted to `str` before the lookup.

    Returns:
        The result of `jp_cities.citycode2geodetic` as a tuple, or
        `(nan, nan)` when the lookup raises.
    """
    area_code = str(area_code)
    try:
        # NOTE(review): callers store this result as (longitude, latitude);
        # confirm that matches the element order citycode2geodetic returns.
        return tuple(jp_cities.citycode2geodetic(area_code))
    except Exception:
        # Deliberately best-effort: an unknown code yields NaN coordinates.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0,
        # and `Exception` (not a bare except) lets Ctrl-C still interrupt.
        return np.nan, np.nan
    
def find_town_jp(all_towns_df, city_code, town_name, log=False):
    """Look up a town's coordinates in the towns table.

    Args:
        all_towns_df: DataFrame with `cityCode`, `townAlphabet`, `longitude`
            and `latitude` columns.
        city_code: City code; converted with `int()` before being compared
            against the `cityCode` column.
        town_name: Text searched for, as a literal substring, in `townAlphabet`.
        log: When True, print a message if no row matches.

    Returns:
        `(longitude, latitude)` averaged over every matching row, or
        `(None, None)` when no row matches.
    """
    city_df = all_towns_df[all_towns_df["cityCode"] == int(city_code)]

    # regex=False: town names are plain text, so characters such as "(" or "."
    # must not be interpreted as a pattern.
    # na=False: rows with a missing name simply do not match, instead of
    # putting NaN into the boolean mask.
    is_match = city_df["townAlphabet"].str.contains(town_name, regex=False, na=False)
    town_df = city_df[is_match]

    if town_df.empty:
        if log:
            print(f"JP could not find {town_name} in {city_code}")
        return None, None

    return town_df["longitude"].mean(), town_df["latitude"].mean()

def find_town_geopy(geolocator, address, log=False):
    """Geocode `address` with the supplied geolocator.

    Returns `(longitude, latitude)` taken from the geocoder's result, or
    `(None, None)` when the geocoder returns nothing. In the latter case a
    message is printed if `log` is true.
    """
    hit = geolocator.geocode(address)

    if hit:
        return hit.longitude, hit.latitude

    if log:
        print(f"Geopy could not find {address}")
    return None, None

def get_town_coordinates_jp(
    all_towns_df, city_name, city_code, town_name, log=False, geolocator=None
):
    """Resolve a town's coordinates, trying progressively coarser sources.

    Lookup order:
      1. the towns table via `find_town_jp` (fast),
      2. geopy via `find_town_geopy` (slow),
      3. the city-level coordinates from `get_city_geocode`.

    Args:
        all_towns_df: Towns table passed through to `find_town_jp`.
        city_name: City name, used to build the geopy address string.
        city_code: City code, used for the table lookup and the city fallback.
        town_name: Town name to look up.
        log: Forwarded to the lookup helpers to print misses.
        geolocator: Optional geocoder to reuse across calls. When omitted, a
            `Nominatim` client is created only if the geopy fallback is needed.

    Returns:
        A 2-tuple of coordinates from the first source that yields a result.
    """
    # Try to locate in the towns table first - this is fast
    jp_lon, jp_lat = find_town_jp(all_towns_df, city_code, town_name, log)
    # Compare against None explicitly: a legitimate 0.0 coordinate is falsy.
    if jp_lon is not None and jp_lat is not None:
        return jp_lon, jp_lat

    # Fall back and try to locate with geopy - this is slow. The client is
    # built lazily so the fast path above never pays for it.
    if geolocator is None:
        geolocator = Nominatim(user_agent="my_app")
    address = f"{town_name}, {city_name}, Japan"
    geopy_lon, geopy_lat = find_town_geopy(geolocator, address, log)
    if geopy_lon is not None and geopy_lat is not None:
        return geopy_lon, geopy_lat

    # If nothing works, just return the city coordinates
    return get_city_geocode(city_code)

def get_town_coordinates_jp_simple(all_towns_df, city_code, town_name, log=False):
    """Resolve a town's coordinates from the towns table, else use the city's.

    Args:
        all_towns_df: Towns table passed through to `find_town_jp`.
        city_code: City code, used for the table lookup and the city fallback.
        town_name: Town name to look up.
        log: Forwarded to `find_town_jp` to print misses.

    Returns:
        The `(longitude, latitude)` found by `find_town_jp`, or the result of
        `get_city_geocode(city_code)` when the table has no match.
    """
    jp_lon, jp_lat = find_town_jp(all_towns_df, city_code, town_name, log)

    # Compare against None explicitly: find_town_jp signals "not found" with
    # (None, None), while a legitimate 0.0 coordinate would be falsy.
    if jp_lon is None or jp_lat is None:
        return get_city_geocode(city_code)

    return jp_lon, jp_lat



'E.g. Maps 1101 to 1100'
'E.g. Maps 1100 to Hokkaido Sapporo-shi'


In [55]:
trade_prices_data_path = f"{DATA_DIRECTORY_PATH}/core"

# os.listdir returns entries in arbitrary order and includes hidden files and
# sub-directories. Sort for a reproducible processing order and keep only
# regular, non-hidden files so every path can be handed to pd.read_csv.
trade_prices_data_paths = [
    f"{trade_prices_data_path}/{filename}"
    for filename in sorted(os.listdir(trade_prices_data_path))
    if not filename.startswith(".")
    and os.path.isfile(f"{trade_prices_data_path}/{filename}")
]

In [56]:
# Collect the city codes present in each trade-price file. Codes are
# de-duplicated within a file but not across files.
area_codes = []
for path in trade_prices_data_paths:
    df = pd.read_csv(path, encoding="unicode_escape")
    # Normalise the raw code to a string, roll it up with get_city_code, and
    # cast to string again.
    df["area_code"] = (
        df["City,Town,Ward,Village code"]
        .astype(str)
        .apply(get_city_code)
        .astype(str)
    )
    area_codes.extend(df["area_code"].unique().tolist())

In [57]:
# Number of collected city codes; the same value is used as the progress-bar
# total in the coordinate-building loop.
len(area_codes)

1738

In [58]:
# Towns table used for the coordinate lookups. Missing romanised names are
# replaced with "" so the substring search on `townAlphabet` never sees NaN.
jp_towns = JpCity(enable_town=True)
towns_df = jp_towns.towns
towns_df["townAlphabet"] = towns_df["townAlphabet"].fillna("")
# Not used by the loop below (it only calls the table-based lookup); retained
# in case other cells rely on this name.
geolocator = Nominatim(user_agent="japan_locations_2")

coordinate_columns = ["area", "area_code", "subarea", "long", "lat"]
# Set membership is O(1); the original list lookup was linear per area code.
known_area_codes = set(area_codes)
# Per-area frames are collected here and concatenated once at the end, instead
# of re-copying an ever-growing DataFrame on every iteration.
coordinate_frames = []

# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=len(area_codes)) as pbar:
    for path in trade_prices_data_paths:
        df = pd.read_csv(path, encoding="unicode_escape")
        df["area_code"] = df["City,Town,Ward,Village code"].astype(str)

        # we may want to skip the following step in the future
        df["area_code"] = df["area_code"].apply(get_city_code).astype(str)
        df["area"] = df["area_code"].apply(get_area_from_area_code)
        df["subarea"] = df["Area"].fillna("")

        for area_code in df["area_code"].unique():
            if area_code not in known_area_codes:
                continue

            area_df = df[df["area_code"] == area_code]
            # One row per distinct (area, area_code, subarea) combination.
            towns_list_df = area_df[["area", "area_code", "subarea"]].drop_duplicates()

            # NOTE(review): the lookup result is stored positionally as
            # (long, lat); confirm the city-level fallback returns that order.
            towns_list_df[["long", "lat"]] = towns_list_df.apply(
                lambda x: pd.Series(
                    get_town_coordinates_jp_simple(
                        towns_df,
                        x["area_code"],
                        x["subarea"],
                        log=False,
                    )
                ),
                axis=1,
            )

            # Subareas still without coordinates get the mean of the other
            # subareas in the same area code.
            towns_list_df["long"] = towns_list_df["long"].fillna(
                towns_list_df["long"].mean()
            )
            towns_list_df["lat"] = towns_list_df["lat"].fillna(
                towns_list_df["lat"].mean()
            )

            coordinate_frames.append(towns_list_df)

            pbar.update()

# pd.concat raises on an empty list, so fall back to an empty, correctly
# labelled frame when nothing was processed.
coordinates_df = (
    pd.concat(coordinate_frames, axis=0)
    if coordinate_frames
    else pd.DataFrame(columns=coordinate_columns)
)

100%|██████████| 1738/1738 [01:40<00:00, 17.27it/s]


In [60]:
# Preview the assembled subarea coordinates. The index values shown are the
# row positions carried over from the source CSVs, not a fresh 0..n range.
coordinates_df

Unnamed: 0,area,area_code,subarea,long,lat
0,Wakayama-ken Wakayama-shi,30201,Aoicho,135.163085,34.213480
4,Wakayama-ken Wakayama-shi,30201,Akizuki,135.203712,34.230374
42,Wakayama-ken Wakayama-shi,30201,Akibacho,135.171405,34.200466
47,Wakayama-ken Wakayama-shi,30201,Asahi,135.218759,34.192106
64,Wakayama-ken Wakayama-shi,30201,Azushima,135.229079,34.252869
...,...,...,...,...,...
10199,Tokushima-ken Higashimiyoshi-cho,36489,Keta,133.986142,34.008828
10204,Tokushima-ken Higashimiyoshi-cho,36489,Nakasho,133.972373,33.988013
10210,Tokushima-ken Higashimiyoshi-cho,36489,Nishisho,133.940244,34.008941
10239,Tokushima-ken Higashimiyoshi-cho,36489,Higashiyama,133.855753,34.078389


In [61]:
subarea_coords_simple_path = f"{DATA_DIRECTORY_PATH}/coordinates/subarea_simple.csv"
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs(os.path.dirname(subarea_coords_simple_path), exist_ok=True)
# index=False: the carried-over source row index is not written to the file.
coordinates_df.to_csv(subarea_coords_simple_path, index=False)


In [42]:
# NOTE(review): leftover manual cleanup cell - its execution count (42) predates
# the cells above, and the progress bar is already closed right after the main
# loop. This looks redundant; confirm and remove.
pbar.close()