In [131]:
import os
import sys
import warnings

from tqdm import tqdm
import pandas as pd
import numpy as np

from jre_utils.datapath import DATA_DIRECTORY_PATH, factor_data_paths

from jp_prefecture.jp_cities import jp_cities as jp
from jp_prefecture.address import JpAddressParser, JpAddress

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment and
# deprecation notices) raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed instead of truncating them.
pd.set_option("display.max_columns", None)

In [44]:
"""
Goal: 
For the PLPS data, 
index by year, prefecture and municipality, then
get the weighted average price per unit area
"""

# Years covered by the PLPS files: 1997 through 2023 inclusive. This list is
# the single source of truth; the path mapping below is derived from it so the
# two can never disagree.
years = list(range(1997, 2024))
plps_data_path = f"{DATA_DIRECTORY_PATH}/core_scraped/plps"
# One CSV per year, keyed by the integer year.
plps_data_paths = {year: f"{plps_data_path}/{year}.csv" for year in years}
plps_data_paths

{1997: '../../data/core_scraped/plps/1997.csv',
 1998: '../../data/core_scraped/plps/1998.csv',
 1999: '../../data/core_scraped/plps/1999.csv',
 2000: '../../data/core_scraped/plps/2000.csv',
 2001: '../../data/core_scraped/plps/2001.csv',
 2002: '../../data/core_scraped/plps/2002.csv',
 2003: '../../data/core_scraped/plps/2003.csv',
 2004: '../../data/core_scraped/plps/2004.csv',
 2005: '../../data/core_scraped/plps/2005.csv',
 2006: '../../data/core_scraped/plps/2006.csv',
 2007: '../../data/core_scraped/plps/2007.csv',
 2008: '../../data/core_scraped/plps/2008.csv',
 2009: '../../data/core_scraped/plps/2009.csv',
 2010: '../../data/core_scraped/plps/2010.csv',
 2011: '../../data/core_scraped/plps/2011.csv',
 2012: '../../data/core_scraped/plps/2012.csv',
 2013: '../../data/core_scraped/plps/2013.csv',
 2014: '../../data/core_scraped/plps/2014.csv',
 2015: '../../data/core_scraped/plps/2015.csv',
 2016: '../../data/core_scraped/plps/2016.csv',
 2017: '../../data/core_scraped/plps/201

In [64]:
# Load one year's PLPS file and tag every row with that year.
# The label is taken from the same variable that selects the file, so loading
# a different year cannot leave rows labelled with a stale value.
year = 2022
main_df = pd.read_csv(plps_data_paths[year]).assign(year=year)

In [86]:
# Convert address to city code.
# Names of the source columns used from the PLPS CSV.
addr_column = "所在及び地番"
price_per_unit_area_column = "価格(円/m²)"
area_column = "地積(m²)"

# Rows flagged 2 in jp.cities are parent cities (e.g. 1100 Sapporo-shi).
big_cities_df = jp.cities.loc[jp.cities["bigCityFlag"] == 2, ["cityCode", "cityAlphabet"]]

# Rows flagged 1 are wards whose romanised name is "<parent city> <ward>"
# (e.g. "Sapporo-shi Chuo-ku"). Work on an explicit copy so adding columns
# does not write onto a filtered view of jp.cities.
sub_cities_df = jp.cities.loc[jp.cities["bigCityFlag"] == 1].copy()
# n=1 splits on the first space only, so the result always fits the two
# target columns even if a ward name itself contains a space.
sub_cities_df[["Municipality", "Submunicipality"]] = sub_cities_df["cityAlphabet"].str.split(
    " ", n=1, expand=True
)

# Pair every ward code with the code of the parent city sharing its name prefix.
city_code_map = sub_cities_df.merge(
    big_cities_df,
    left_on="Municipality",
    right_on="cityAlphabet",
    suffixes=("", "_big"),
)[["cityCode", "cityCode_big"]].astype(str)

# Lookup: ward city code (str) -> parent city code (str).
sub_city_to_city = city_code_map.set_index("cityCode")["cityCode_big"].to_dict()

In [196]:
%%capture
# Addresses the parser could not resolve; reset every time this cell runs.
errors = []


def get_city_code(parser, addr):
    """Resolve an address to a city code string.

    Args:
        parser: Object exposing ``parse_address(addr)`` whose result has a
            ``cityCode`` attribute.
        addr: Address value taken from the address column.

    Returns:
        The city code as a string. Codes present in ``sub_city_to_city`` are
        replaced by the mapped parent-city code. If parsing fails, ``addr``
        itself is returned and also appended to the module-level ``errors``.
    """
    try:
        city_code = str(parser.parse_address(addr).cityCode)
    except Exception:
        # Best-effort: remember the failure and fall back to the raw address.
        # Catching Exception (not a bare except) keeps Ctrl-C working during
        # a long .apply over the whole frame.
        # NOTE(review): the fallback puts a raw address into the area_code
        # column, where it becomes its own group downstream — confirm intended.
        errors.append(addr)
        return addr
    return sub_city_to_city.get(city_code, city_code)

# Build the address parser once and reuse it for every row.
parser = JpAddressParser(enable_town=True)

# Resolve each address in the address column to its (parent) city code.
main_df["area_code"] = main_df[addr_column].map(
    lambda address: get_city_code(parser, address)
)

In [197]:
def prepare_area_column(area):
    """Parse a numeric cell such as ``"1,015"`` or ``"237(12)"`` into an int.

    Everything from the first ``"("`` onward is discarded and thousands
    separators (``","``) are removed before converting to ``int``.

    Args:
        area: Raw cell value; expected to be a string.

    Returns:
        The parsed integer, or ``np.nan`` when the value is not a string
        (e.g. a missing cell) or the remaining text is not a valid integer.
        Count failures afterwards with ``Series.isna().sum()``.
    """
    try:
        return int(area.split("(")[0].replace(",", ""))
    except (AttributeError, TypeError, ValueError):
        # np.nan rather than np.NaN: the capitalised alias was removed in
        # NumPy 2.0, which would make this error path itself raise.
        return np.nan
    
# Parse the raw text columns into numeric ones: (new column, source column).
for parsed_name, raw_name in (
    ("price_per_unit_area", price_per_unit_area_column),
    ("area", area_column),
):
    main_df[parsed_name] = main_df[raw_name].apply(prepare_area_column)

In [198]:
# Keep only the columns the aggregation needs.
required_columns = [
    "year",
    "area_code",
    "price_per_unit_area",
    "area",
]
df = main_df.loc[:, required_columns]

In [202]:
# Area-weighted average price per (year, area_code):
#     sum(price_per_unit_area * area) / sum(area)
# Rows whose price or area failed to parse are dropped first, so one bad row
# does not turn the whole group's average into NaN.
priced_df = df.dropna(subset=["price_per_unit_area", "area"])
priced_df = priced_df.assign(
    weighted_price=priced_df["price_per_unit_area"] * priced_df["area"]
)

# Summing then dividing is equivalent to np.average(..., weights=area) per
# group, but a zero total area yields NaN/inf instead of raising.
group_totals = priced_df.groupby(["year", "area_code"])[["weighted_price", "area"]].sum()
final_df = (
    (group_totals["weighted_price"] / group_totals["area"])
    .rename("weighted_average_price_per_area")
    .reset_index()
)

In [203]:
# Display the weighted average price per (year, area_code).
final_df

Unnamed: 0,year,area_code,weighted_average_price_per_area
0,2022,10201,46177.503815
1,2022,10202,44520.302774
2,2022,10203,26181.240745
3,2022,10204,33737.650474
4,2022,10205,23544.729718
...,...,...,...
1715,2022,9364,24776.519757
1716,2022,9384,11050.872257
1717,2022,9386,20879.404651
1718,2022,9407,15072.392793


In [201]:
# Inspect the parsed rows for area code "1100" (listed as Sapporo-shi in jp.cities).
df[df["area_code"] == "1100"]


Unnamed: 0,year,area_code,price_per_unit_area,area
0,2022,1100,235000,237
1,2022,1100,200000,146
2,2022,1100,170000,233
3,2022,1100,57000,279
4,2022,1100,355000,370
...,...,...,...,...
127,2022,1100,92000,195
128,2022,1100,80000,145
129,2022,1100,95000,1015
130,2022,1100,106000,1695


In [127]:
# Hand-computed area-weighted average used as a sanity check.
# Each entry is (price per m², area in m²). Original label: test for area code 1101.
# NOTE(review): the first five pairs match the rows shown for area_code "1100";
# confirm which code this sample is meant to represent.
array = [
    (235000, 237),
    (200000, 146),
    (170000, 233),
    (57000, 279),
    (355000, 370),
    (45000, 348),
    (4300000, 718),
    (2350000, 802),
    (1060000, 537),
    (245000, 289),
    (1650000, 463),
    (690000, 533),
    (540000, 312),
    (645000, 784),
    (500000, 462),
]

# Denominator: total area. Numerator: each price weighted by its area.
total_area = sum(area for _, area in array)
total_price = sum(price * area for price, area in array)
total_price / total_area

1218551.0517426685

In [76]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20831 entries, 0 to 20997
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   基準地番号             20831 non-null  object 
 1   調査基準日             20831 non-null  object 
 2   所在及び地番            20831 non-null  object 
 3   住居表示              6443 non-null   object 
 4   価格(円/m²)          20831 non-null  object 
 5   交通施設、距離           20831 non-null  object 
 6   地積(m²)            20831 non-null  object 
 7   形状（間口：奥行き）        20831 non-null  object 
 8   利用区分、構造           20831 non-null  object 
 9   利用現況              20831 non-null  object 
 10  給排水等状況            20831 non-null  object 
 11  周辺の土地の利用現況        20831 non-null  object 
 12  前面道路の状況           20791 non-null  object 
 13  その他の接面道路          3052 non-null   object 
 14  用途区分、高度地区、防火・準防火  15570 non-null  object 
 15  建ぺい率（%）、容積率（%）    20831 non-null  object 
 16  都市計画区域区分          20831 non-null  object 
 17

In [82]:
# Peek at the first 20 rows of the jp_prefecture city reference table.
jp.cities.head(20)

Unnamed: 0,prefCode,cityCode,cityName,cityAlphabet,latitude,longitude,bigCityFlag
0,1,1100,札幌市,Sapporo-shi,43.0351,141.2049,2
1,1,1101,札幌市中央区,Sapporo-shi Chuo-ku,43.04223,141.319722,1
2,1,1102,札幌市北区,Sapporo-shi Kita-ku,43.157077,141.39018,1
3,1,1103,札幌市東区,Sapporo-shi Higashi-ku,43.120789,141.394435,1
4,1,1104,札幌市白石区,Sapporo-shi Shiroishi-ku,43.071628,141.437011,1
5,1,1105,札幌市豊平区,Sapporo-shi Toyohira-ku,43.049506,141.365176,1
6,1,1106,札幌市南区,Sapporo-shi Minami-ku,42.946783,141.328331,1
7,1,1107,札幌市西区,Sapporo-shi Nishi-ku,43.079715,141.308758,1
8,1,1108,札幌市厚別区,Sapporo-shi Atsubetsu-ku,43.022577,141.482606,1
9,1,1109,札幌市手稲区,Sapporo-shi Teine-ku,43.145554,141.239503,1


In [170]:
# Display the parent-city lookup (jp.cities rows with bigCityFlag == 2).
big_cities_df

Unnamed: 0,cityCode,cityAlphabet
0,1100,Sapporo-shi
262,4100,Sendai-shi
525,11100,Saitama-shi
598,12100,Chiba-shi
719,14100,Yokohama-shi
738,14130,Kawasaki-shi
746,14150,Sagamihara-shi
780,15100,Niigata-shi
1015,22100,Shizuoka-shi
1019,22130,Hamamatsu-shi


In [168]:
# Show the rows with bigCityFlag == 0.
# NOTE(review): presumably municipalities that are neither a parent city nor a ward — confirm.
jp.cities[jp.cities["bigCityFlag"] == 0]

Unnamed: 0,prefCode,cityCode,cityName,cityAlphabet,latitude,longitude,bigCityFlag
11,1,1202,函館市,Hakodate-shi,41.757089,140.716562,0
12,1,1203,小樽市,Otaru-shi,43.191426,141.004925,0
13,1,1204,旭川市,Asahikawa-shi,43.774940,142.368767,0
14,1,1205,室蘭市,Muroran-shi,42.385500,140.936264,0
15,1,1206,釧路市,Kushiro-shi,43.028943,144.391342,0
...,...,...,...,...,...,...,...
1909,47,47361,島尻郡久米島町,Shimajiri-gun Kumejima-cho,26.347359,126.769730,0
1910,47,47362,島尻郡八重瀬町,Shimajiri-gun Yaese-cho,26.125995,127.747182,0
1911,47,47375,宮古郡多良間村,Miyako-gun Tarama-son,24.657759,124.685433,0
1912,47,47381,八重山郡竹富町,Yaeyama-gun Taketomi-cho,24.237087,124.011919,0
