In [2]:
import json
from tqdm import tqdm
from pprint import pprint
from bs4 import BeautifulSoup
import pandas as pd

from jre_utils.datapath import DATA_DIRECTORY_PATH

# selenium 4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Do not rerun accidentally
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

In [4]:
def get_mlit_url(mod, year, page=0):
    return f"https://www.land.mlit.go.jp/landPrice/SearchServlet?nccharset=953DFDC5&MOD={mod}&TDK=&SKC=&CHI=&YFR={year}&YTO={year}&YOU=0%2C3%2C5%2C7%2C9&PFR=&PTO=&PG={page}&LATEST_YEAR="


def get_lpa_url(year, page=0):
    return get_mlit_url(0, year, page)


def get_plps_url(year, page=0):
    return get_mlit_url(1, year, page)

In [5]:
plps_year_to_page_count_path = f"{DATA_DIRECTORY_PATH}/core_scraped/plps_year_to_page_count.json"
lpa_year_to_page_count_path = f"{DATA_DIRECTORY_PATH}/core_scraped/lpa_year_to_page_count.json"

with open(plps_year_to_page_count_path) as fd:
     plps_year_to_page_count = json.load(fd)
    #  pprint(plps_year_to_page_count)

with open(lpa_year_to_page_count_path) as fd:
     lpa_year_to_page_count = json.load(fd)
    #  pprint(lpa_year_to_page_count)
     

In [6]:
num_plps_pages = sum(plps_year_to_page_count.values())
num_lpa_pages = sum(lpa_year_to_page_count.values())
num_plps_pages, num_lpa_pages

(30732, 55977)

In [7]:
thought = "Also, I’m about to make 80,000 requests to a server. By my estimate, it’ll take 20 or so hours. This seems borderline illegal haha"

In [8]:
def datalist_element_to_dict(datalist_element):
    """
    This works because each datalist element is similar in structure to the following:
    <div class="datalist">
        <div class="datalistline">
            <div class="datalistkey">Key</div>
            <div class="datalistvalue">Value</div>
            <div class="datalistkey">Key</div>
            <div class="datalistvalue2">Value</div>
        </div>
        ...
    </div>
    """
    
    key_vals = []
    html = datalist_element.get_attribute('innerHTML')
    soup = BeautifulSoup(html, 'html.parser')
    datalist_lines = soup.find_all(class_='datalistline')

    for datalist_line in datalist_lines:
        children = [" ".join(div.stripped_strings) for div in datalist_line.find_all("div", recursive=False)]
        key_vals += [(children[i], children[i + 1]) for i in range(0, len(children), 2)]

    return dict(key_vals)

def get_land_price_data(driver, url):
    driver.get(url)
    datalist_elements = driver.find_elements(By.CLASS_NAME, 'datalist')
    elements = [datalist_element_to_dict(datalist_element) for datalist_element in datalist_elements]
    return elements

In [13]:
# test
test_year = "X" # 2023
page_count = plps_year_to_page_count[test_year]
pbar = tqdm(total=page_count)

all_elements = []

for page in range(1, page_count + 1):
    url = get_plps_url(2023, page)
    # all_elements += get_land_price_data(driver, url)
    pbar.update()

pbar.close()
# pd.DataFrame.from_records(all_elements).to_csv(f"{DATA_DIRECTORY_PATH}/core_scraped/plps/{test_year}.csv", index=False)
    

  0%|          | 0/1047 [02:58<?, ?it/s]
100%|██████████| 1047/1047 [15:55<00:00,  1.10it/s]


In [25]:
pd.read_csv(f"{DATA_DIRECTORY_PATH}/core_scraped/plps/{1997}.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27426 entries, 0 to 27425
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   基準地番号             27426 non-null  object 
 1   調査基準日             27426 non-null  object 
 2   所在及び地番            27426 non-null  object 
 3   住居表示              7730 non-null   object 
 4   価格(円/m²)          27426 non-null  object 
 5   交通施設、距離           27426 non-null  object 
 6   地積(m²)            27426 non-null  object 
 7   形状（間口：奥行き）        27192 non-null  object 
 8   利用区分、構造           27426 non-null  object 
 9   利用現況              0 non-null      float64
 10  給排水等状況            27426 non-null  object 
 11  周辺の土地の利用現況        0 non-null      float64
 12  前面道路の状況           27339 non-null  object 
 13  その他の接面道路          3320 non-null   object 
 14  用途区分、高度地区、防火・準防火  19083 non-null  object 
 15  建ぺい率（%）、容積率（%）    27426 non-null  object 
 16  都市計画区域区分          27426 non-null  object

In [17]:
plps_completed_years = ["2023"]
lpa_completed_years = ["1971", "1972", "1973", "1974", "1975", "1976"]

In [19]:
# Scraping Prefectural Land Price Survey (PLPS) data
pbar = tqdm(total=num_plps_pages)

for year, page_count in plps_year_to_page_count.items():
    if year in plps_completed_years:
        continue

    all_elements = []
    for page in range(1, page_count + 1):
        url = get_plps_url(year, page)
        # all_elements += get_land_price_data(driver, url)
        pbar.update()

    # pd.DataFrame.from_records(all_elements).to_csv(f"{DATA_DIRECTORY_PATH}/core_scraping/plps/{year}.csv", index=False)

pbar.close()


  0%|          | 0/30732 [00:00<?, ?it/s]

 97%|█████████▋| 29685/30732 [9:17:16<19:39,  1.13s/it]  


In [18]:
# Scraping Land Price Announcement (LPA) data
pbar = tqdm(total=num_lpa_pages)

for year, page_count in lpa_year_to_page_count.items():
    if year in lpa_completed_years:
        continue

    all_elements = []
    for page in range(1, page_count + 1):
        url = get_lpa_url(year, page)
        # all_elements += get_land_price_data(driver, url)
        pbar.update()

    # pd.DataFrame.from_records(all_elements).to_csv(f"{DATA_DIRECTORY_PATH}/core_scraped/lpa/{year}.csv", index=False)

pbar.close()


  5%|▍         | 2549/55977 [1:21:19<28:24:35,  1.91s/it] 
 96%|█████████▌| 53512/55977 [15:41:47<43:22,  1.06s/it]


In [19]:
driver.quit()