In [5]:
import warnings

from tqdm import tqdm
import pandas as pd
import numpy as np
import weighted

from jre_utils.datapath import DATA_DIRECTORY_PATH
from jp_prefecture.address import JpAddressParser

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [6]:
"""
Goal: 
For the PLPS data, 
index by year, prefecture and municipality, then
get the weighted average price per unit area
DONE!!
"""

plps_data_path = f"{DATA_DIRECTORY_PATH}/core_scraped/plps"
plps_data_paths = {
    year: f"{plps_data_path}/{year}.csv"
    for year in range(1997, 2024)
}
print(plps_data_paths[2022])

plps_pre_derived_data_path = f"{DATA_DIRECTORY_PATH}/pre_derived_lpa"
plps_pre_derived_data_paths = {
    year: f"{plps_pre_derived_data_path}/yearly/{year}.csv"
    for year in range(1971, 2024)
}
print(plps_pre_derived_data_paths[2022])

'../../data/core_scraped/plps/2022.csv'

In [7]:
# convert address to city code
addr_column = "所在及び地番"
price_per_unit_area_column = "価格(円/m²)"
area_column = "地積(m²)"

required_columns = ["year", "area_code", "price_per_unit_area", "area"]


In [8]:
import json
from pprint import pprint

sub_city_to_city_path = f"{DATA_DIRECTORY_PATH}/core_scraped/sub_city_to_city.json"
with open(sub_city_to_city_path) as fd:
     sub_city_to_city = json.load(fd)
     pprint(f"E.g. Maps 1101 to {sub_city_to_city["1101"]}")

area_code_to_area_path = f"{DATA_DIRECTORY_PATH}/core_scraped/area_code_to_area.json"
with open(area_code_to_area_path) as fd:
     area_code_to_area = json.load(fd)
     pprint(f"E.g. Maps 1100 to {area_code_to_area["1100"]}") 

def get_area_from_area_code(area_code):
     return area_code_to_area.get(area_code, "na" )

'E.g. Maps 1101 to 1100'
'E.g. Maps 1100 to Hokkaido Sapporo-shi'


In [9]:
def prepare_area_column(area):
    try:
        return int(area.split("(")[0].replace(",", ""))
    except:
        print("error")
        return np.NaN

def get_city_code(parser, addr):
    try:
        city_code = str(parser.parse_address(addr).cityCode)
        return sub_city_to_city.get(city_code, city_code)
    except:
        return addr

def get_area_from_area_code(area_code):
     return area_code_to_area.get(area_code, "na" )

parser = JpAddressParser(enable_town=True)

In [10]:
def custom_aggregate(x):
    d = {}
    d["unit_price_wmean"] = np.average(x["unit_price"], weights=x["trade_area"])
    d["unit_price_wmedian"] = weighted.median(x["unit_price"], weights=x["trade_area"])
    d["unit_price_mean"] = x["unit_price"].mean()
    d["unit_price_median"] = x["unit_price"].median()
    d["total_traded_area"] = x["trade_area"].sum()
    d["count"] = x["count"].count()
    return pd.Series(
        d,
        index=[
            "unit_price_wmean",
            "unit_price_wmedian",
            "unit_price_mean",
            "unit_price_median",
            "total_traded_area",
            "count",
        ],
    )

In [11]:
%%capture

for year, data_path in plps_data_paths.items():
    main_df = pd.read_csv(data_path)
    main_df = main_df.assign(year=year, count=1)
    main_df["area_code"] = main_df[addr_column].apply(lambda x: get_city_code(parser, x))
    main_df["area"] = main_df["area_code"].apply(get_area_from_area_code)
    main_df["unit_price"] = main_df[price_per_unit_area_column].apply(prepare_area_column)
    main_df["traded_area"] = main_df[area_column].apply(prepare_area_column)
    main_df = main_df[main_df["area_code"].apply(lambda x: x.isdigit())]
    main_df.to_csv(f"{DATA_DIRECTORY_PATH}/pre_derived_plps/yearly/{year}.csv", index=False)

In [242]:
final_df = pd.DataFrame()

for year, data_path in plps_pre_derived_data_paths.items():
    main_df = pd.read_csv(data_path)
    df = main_df[required_columns]
    df = (
        df.groupby(["year", "area_code", "area"])
        .apply(custom_aggregate)
        .reset_index()
    )
    df = df.sort_values(by=["year", "area_code"], ascending=[False, True]).reset_index(drop=True)
    df.to_csv(f"{DATA_DIRECTORY_PATH}/derived_plps/yearly/{year}.csv", index=False)
    final_df = pd.concat([final_df, df])
    

In [244]:
final_df.to_csv(f"{DATA_DIRECTORY_PATH}/derived_plps/wappa.csv", index=False)

In [234]:
final_df = final_df[final_df["area_code"].apply(lambda x: x.isdigit())]

Unnamed: 0,year,area_code,weighted_average_price_per_area
0,1997,10201,230336.239614
1,1997,10202,265522.621710
2,1997,10203,82306.227348
3,1997,10204,76610.715450
4,1997,10205,91248.543596
...,...,...,...
1715,2023,9364,24645.326748
1716,2023,9384,10839.251548
1717,2023,9386,20849.116279
1718,2023,9407,14788.670270


In [239]:
final_df[final_df["area_code"] == "1202"]

Unnamed: 0,year,area_code,weighted_average_price_per_area
89,1997,1202,99735.121195
89,1998,1202,89807.769595
90,1999,1202,82595.293694
90,2000,1202,76001.382797
91,2001,1202,60550.427403
91,2002,1202,57585.972864
92,2003,1202,53632.958141
92,2004,1202,50354.981007
92,2005,1202,48616.739856
97,2006,1202,47553.138943
