Reference

# Scrape SUUMO data

In [2]:
from retry import retry # automatic retry
import requests
from bs4 import BeautifulSoup # parse HTML
import pandas as pd
import gc
import time
import os
import json
import re

In [3]:
@retry(exceptions=requests.RequestException, tries=3, delay=10, backoff=2)
def _fetch_html(url):
    """
    Download `url` and parse it with BeautifulSoup.

    Request failures are raised (not swallowed) so that the `retry` decorator
    can see them: up to 3 attempts, waiting 10 seconds and then 20 seconds
    between attempts.

    Returns None without retrying for 4xx responses other than 429, since
    repeating the same request for a missing/forbidden page will not help.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    if 400 <= response.status_code < 500 and response.status_code != 429:
        return None
    response.raise_for_status()  # Raises requests.HTTPError for remaining HTTP errors
    return BeautifulSoup(response.content, 'html.parser')


def get_html(url):
    """
    Uses BeautifulSoup to parse HTML content from url. Retries 3 times with 10 second delay that doubles each try.

    Returns:
        The parsed BeautifulSoup document, or None if the page could not be
        fetched (client error, or all retry attempts failed).
    """
    try:
        return _fetch_html(url)
    except requests.RequestException:
        # All retry attempts failed; callers treat None as "skip this URL".
        return None

## 1. Scrape URLs
- Scrape the URLs of individual for-sale property listings from the SUUMO.jp collection (search result) pages

In [4]:
# Collect the detail-page URL of every listing, following the "次へ" (next)
# pagination link of each collection page. Skipped entirely when the URL CSV
# already exists, so re-running the notebook does not re-scrape.
if not os.path.exists("/app/data/used_property_urls.csv"):

    collection_pages = {
        "used_apartments": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=011&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13106&sc=13107&sc=13108&sc=13109&sc=13110&sc=13111&sc=13112&sc=13113&sc=13114&sc=13115&sc=13116&sc=13117&sc=13118&sc=13119&sc=13120&sc=13121&sc=13122&sc=13123&ta=13&po=0&pj=1&pc=100",
        # "new_apartments": "https://suumo.jp/jj/bukken/ichiran/JJ011FC001/?ar=030&bs=010&ta=13&po=0&pj=1&initFlg=1&bknlistmodeflg=1&page=1&pc=100",
        "used_houses": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=021&cn=9999999&cnb=0&ekTjCd=&ekTjNm=&hb=0&ht=9999999&kb=1&kt=9999999&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&sc=13109&sc=13110&sc=13111&sc=13112&sc=13114&sc=13115&sc=13120&sc=13116&sc=13117&sc=13119&ta=13&tb=0&tj=0&tt=9999999&po=0&pj=1&pc=100",
        # "new_houses": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=020&ekTjCd=&ekTjNm=&hb=0&ht=9999999&kb=1&km=1&kt=9999999&kw=1&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&sc=13109&sc=13110&sc=13111&sc=13112&sc=13114&sc=13115&sc=13120&sc=13116&sc=13117&sc=13119&ta=13&tb=0&tj=0&tt=9999999&po=0&pj=1&pc=100"
    }

    # (category, property URL) pairs gathered across all categories and pages.
    list_urls = []

    for category, url in collection_pages.items():
        print(f"Category: {category}")

        while True:
            # The URLs contain no format placeholders; pagination is done by
            # following the "next" link, so the URL is requested as-is.
            soup = get_html(url)

            if not soup:
                # NOTE(review): stopping here still writes the URLs collected so
                # far, and the existence check above then prevents a re-scrape.
                print(f"No soup in {url}")
                break

            for h2 in soup.find_all('h2', class_='property_unit-title'):
                a_tag = h2.find('a', href=True)
                if a_tag and a_tag['href']:
                    property_url = "https://suumo.jp" + a_tag['href']
                    list_urls.append((category, property_url))

            # Find the 'next' page link with text "次へ"
            # (`string=` is the current name of the deprecated `text=` argument)
            next_page = soup.find('a', string="次へ")
            if next_page and next_page.get('href'):
                url = "https://suumo.jp" + next_page['href']
            else:
                print(f"No more {category} pages.")
                break

    df_urls = pd.DataFrame(list_urls, columns=['Category', 'URL'])
    df_urls.to_csv("/app/data/used_property_urls.csv", index=False)


In [5]:
def parse_price(value):
    """
    Convert a Japanese price string into an integer number of yen.

    Only the text before the first '円' is considered. Amounts followed by
    '億' are multiplied by 100,000,000 and amounts followed by '万' by 10,000;
    both may carry thousands separators or a decimal part
    (e.g. "1億2,000万円", "1.5億円"). A string with neither unit is parsed as
    a plain integer.

    Args:
        value: Price text, e.g. "5980万円" or "1億2000万円".

    Returns:
        The price in yen as an int, or None if `value` is not a string or no
        price can be parsed from it.
    """
    if not isinstance(value, str):
        return None

    # Drop thousands separators (ASCII and full-width) so that "2,000万" is
    # read as 2000万 instead of matching only the digits after the comma.
    price_str = value.strip().split('円')[0].replace(',', '').replace('，', '')

    total_price = 0
    for unit, multiplier in (('億', 100000000), ('万', 10000)):
        # Accept an optional decimal part so "1.5億" is 1.5, not 5.
        unit_match = re.search(r'(\d+(?:\.\d+)?)' + unit, price_str)
        if unit_match:
            total_price += round(float(unit_match.group(1)) * multiplier)

    if total_price:
        return total_price
    try:
        return int(price_str)
    except ValueError:
        return None

def parse_area(value):
    """
    Extract the first number (integer or decimal) that is directly followed
    by 'm' in `value` and return it as a float; None when there is none.
    """
    try:
        found = re.search(r'(\d+(\.\d+)?)m', value)   # decimal number before 'm'
        if not found:
            return None
        return float(found.group(1))
    except ValueError:
        return None

def extract_number(value, i):
    """
    Split `value` on '/' and return the first run of digits found in part `i`
    as an int.

    Returns None when `value` contains no '/' or when part `i` holds no digits.
    """
    try:
        parts = value.strip().split('/')
        if len(parts) <= 1:
            return None
        digits = re.search(r'\d+', parts[i])
        return int(digits.group()) if digits else None
    except ValueError:
        return None

def parse_year(value):
    """
    Return the text in front of the first '年' in `value` as an int, or None
    when that text is not a valid integer.
    """
    year_text, _, _ = value.strip().partition('年')
    try:
        return int(year_text)
    except ValueError:
        return None

def remove_whitespace(text):
    """ Collapse each run of whitespace (newline, tab, etc.) into one space and trim both ends. """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def parse_features(soup):
    """
    Get the text of the 特徴ピックアップ section.

    Looks up the <h3> whose string is '特徴ピックアップ', goes to its enclosing
    <div class="secTitleOuterR"> (falling back to "secTitleOuterK"), and reads
    the next sibling <div class="mt10">.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        The section's strings joined by single spaces with whitespace
        normalised, or None when the heading, its wrapper div, or the
        content div is missing.
    """
    features_section = soup.find('h3', string='特徴ピックアップ')
    if features_section is None:
        return None

    parent_div = features_section.find_parent('div', class_='secTitleOuterR')
    if parent_div is None:
        parent_div = features_section.find_parent('div', class_='secTitleOuterK')
    if parent_div is None:
        # Neither known wrapper is present; without this guard the sibling
        # lookup below would raise AttributeError on None.
        return None

    features_div = parent_div.find_next_sibling('div', class_='mt10')
    if features_div is None:
        return None

    features_text = ' '.join(features_div.stripped_strings)
    return remove_whitespace(features_text)

# --- Scrape the detail page of every used-apartment URL -----------------------
# Results are appended to the output CSV in batches, and URLs already present
# in that CSV are skipped, so an interrupted run can simply be restarted.

OUTPUT_CSV = "/app/data/used_property_data.csv"

# Fixed column layout of the output CSV. Batches are appended without a header,
# so every batch must have the same columns in the same order even when some
# listings lack a field (a missing field is written as an empty cell).
OUTPUT_COLUMNS = ['Category', 'URL', 'price', 'plan', 'area', 'balcony_area',
                  'level', 'no_floors', 'year', 'address', 'features']

pd_urls = pd.read_csv("/app/data/used_property_urls.csv", index_col=False)
df_urls = pd.DataFrame(pd_urls, columns=['Category', 'URL'])
df_apt_urls = df_urls.loc[df_urls['Category'] == 'used_apartments']

count = 0         # number of listings scraped successfully in this run
batch_size = 100  # rows collected in memory before each write to disk
list_data = []    # records (dicts) of the batch currently being collected

if os.path.exists(OUTPUT_CSV):
    df_partial = pd.read_csv(OUTPUT_CSV, index_col=False)
    # Extract URL sets
    urls_set1 = set(df_apt_urls['URL'])
    urls_set2 = set(df_partial['URL'])
    # URLs in df_apt_urls but not in df_partial
    unique_urls1 = list(urls_set1 - urls_set2)
else:
    unique_urls1 = list(df_apt_urls['URL'])

total = len(unique_urls1)


def _save_batch(records):
    """Append `records` (list of dicts keyed by 'url') to OUTPUT_CSV; no-op if empty."""
    if not records:
        return
    df_data = pd.DataFrame(records)
    df_merged = pd.merge(df_apt_urls, df_data, left_on='URL', right_on='url', how='inner')
    df_merged = df_merged.drop(columns=['url']).reindex(columns=OUTPUT_COLUMNS)
    # Write the header only when this call creates the file.
    write_header = not os.path.exists(OUTPUT_CSV)
    df_merged.to_csv(OUTPUT_CSV, mode='a', header=write_header, index=False)


try:
    for url in unique_urls1:
        soup = get_html(url)

        if not soup:
            print(f"No soup in {url}")
            continue

        target_table = soup.find('table', summary="表")
        if target_table is None:
            # Page was fetched but has no detail table; skip it instead of
            # crashing on None. It stays out of the CSV, so a later run retries it.
            print(f"No detail table in {url}")
            continue

        # Initialize dictionary to store the data
        property_data = {'url': url}

        # Iterate over each row in the table
        for row in target_table.find_all('tr'):
            # Find all th and td pairs in the row
            ths = row.find_all('th')
            tds = row.find_all('td')

            for th, td in zip(ths, tds):
                header = th.text.strip()
                value = td.text.strip()

                if "価格" in header:
                    property_data['price'] = parse_price(value)
                elif "間取り" in header:
                    property_data['plan'] = value
                elif "専有面積" in header:
                    property_data['area'] = parse_area(value)
                elif "その他面積" in header:
                    property_data['balcony_area'] = parse_area(value)
                elif "所在階/構造・階建" in header:
                    property_data['level'] = extract_number(value, 0)
                    property_data['no_floors'] = extract_number(value, 1)
                elif "完成時期（築年月）" in header:
                    property_data['year'] = parse_year(value)
                elif "住所" in header:
                    # Keep only the first line of the address cell.
                    property_data['address'] = value.split('\n')[0]

        property_data['features'] = parse_features(soup)

        list_data.append(property_data)
        count += 1

        # Periodically save the collected rows and clear the list from memory
        if count % batch_size == 0:
            print(f"Processed {count}/{total} URLs")
            _save_batch(list_data)
            list_data = []
            gc.collect()  # Manually release memory
            time.sleep(5)  # Rest 5 seconds to reduce server load
finally:
    # Write whatever is left: the last, incomplete batch at the end of the run,
    # or the rows collected so far if the run is interrupted or fails.
    _save_batch(list_data)
    list_data = []

Processed 2/25349 URLs
Processed 4/25349 URLs


KeyboardInterrupt: 

In [None]:
# Load the scraped dataset back from disk and preview its first rows.
df_merged = pd.read_csv("/app/data/used_property_data.csv", index_col=False)
df_merged.head()

Unnamed: 0,Category,URL,price,plan,area,balcony_area,level,no_floors,year,address,features
0,used_apartments,https://suumo.jp/ms/chuko/tokyo/sc_adachi/nc_7...,11900000,2DK,41.34,2.43,2,4,1979,東京都足立区花畑５-2-9,年内引渡可 / 即引渡可 / 角住戸 / 陽当り良好 / 全居室収納 / シャワー付洗面化粧...
1,used_apartments,https://suumo.jp/ms/chuko/tokyo/sc_taito/nc_75...,13800000,1K,16.45,2.69,5,9,1991,東京都台東区松が谷３-６－２,瑕疵保証付（不動産会社独自） / ２沿線以上利用可 / 内装リフォーム / 駅まで平坦 / ...
