Reference

# Scrape SUUMO data

In [3]:
from retry import retry # automatic retry
import requests
from bs4 import BeautifulSoup # parse HTML
import pandas as pd
import gc
import time
import os
import json
import re

In [4]:
@retry(tries=3, delay=10, backoff=2)
def _fetch_html(url):
    """
    Download url and parse it with BeautifulSoup.

    Raises requests.RequestException on network or HTTP errors. The exception
    must escape this function, because @retry only re-attempts a call that
    raised; it makes up to 3 attempts, waiting 10 s and then 20 s in between.
    """
    # A timeout keeps one stalled connection from blocking the whole scrape.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Will raise an exception for HTTP errors
    return BeautifulSoup(response.content, 'html.parser')


def get_html(url):
    """
    Return the parsed HTML of url, or None if it could not be fetched.

    The download is attempted up to 3 times (10 second delay that doubles
    each try). Only after every attempt has failed is the request error
    swallowed and None returned, so callers can keep testing `if not soup`.
    """
    try:
        return _fetch_html(url)
    except requests.RequestException:
        return None

## 1. Scrape URLs
- Scrape the URLs of individual for-sale property listings from the SUUMO.jp collection (search result) pages

In [5]:
# Collect the detail-page URL of every listing, one category at a time, and
# cache the result as a CSV. The scrape is skipped when the CSV already exists.
if not os.path.exists("/app/data/property_urls.csv"):

    # First result page of each category; later pages are reached by following
    # the "next" link found on each page.
    collection_pages = {
        "used_apartments": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=011&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13106&sc=13107&sc=13108&sc=13109&sc=13110&sc=13111&sc=13112&sc=13113&sc=13114&sc=13115&sc=13116&sc=13117&sc=13118&sc=13119&sc=13120&sc=13121&sc=13122&sc=13123&ta=13&po=0&pj=1&pc=100",
        # "new_apartments": "https://suumo.jp/jj/bukken/ichiran/JJ011FC001/?ar=030&bs=010&ta=13&po=0&pj=1&initFlg=1&bknlistmodeflg=1&page=1&pc=100",
        "used_houses": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=021&cn=9999999&cnb=0&ekTjCd=&ekTjNm=&hb=0&ht=9999999&kb=1&kt=9999999&sc=13101&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&sc=13109&sc=13110&sc=13111&sc=13112&sc=13114&sc=13115&sc=13120&sc=13116&sc=13117&sc=13119&ta=13&tb=0&tj=0&tt=9999999&po=0&pj=1&pc=100",
        # "new_houses": "https://suumo.jp/jj/bukken/ichiran/JJ012FC001/?ar=030&bs=020&ekTjCd=&ekTjNm=&hb=0&ht=9999999&kb=1&km=1&kt=9999999&kw=1&sc=13102&sc=13103&sc=13104&sc=13105&sc=13113&sc=13106&sc=13107&sc=13108&sc=13118&sc=13121&sc=13122&sc=13123&sc=13109&sc=13110&sc=13111&sc=13112&sc=13114&sc=13115&sc=13120&sc=13116&sc=13117&sc=13119&ta=13&tb=0&tj=0&tt=9999999&po=0&pj=1&pc=100"
    }

    list_urls = []

    for category, url in collection_pages.items():
        print(f"Category: {category}")

        # `url` is rebound to the next result page until no "next" link is left.
        while True:
            soup = get_html(url)

            if not soup:
                # Fetch failed: give up on this category but keep what was collected.
                print(f"No soup in {url}")
                break

            for h2 in soup.find_all('h2', class_='property_unit-title'):
                a_tag = h2.find('a', href=True)
                if a_tag and a_tag['href']:
                    property_url = "https://suumo.jp" + a_tag['href']
                    list_urls.append((category, property_url))

            # Find the 'next' page link with text "次へ"
            # (`string=` is the current name of the deprecated `text=` argument)
            next_page = soup.find('a', string="次へ")
            if next_page and next_page.get('href'):
                url = "https://suumo.jp" + next_page['href']
            else:
                print(f"No more {category} pages.")
                break

    df_urls = pd.DataFrame(list_urls, columns=['Category', 'URL'])
    df_urls.to_csv("/app/data/property_urls.csv", index=False)
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so print the preview explicitly.
    print(df_urls.head())


In [18]:


"""
price:          価格                           headerKakakuDisp : ["11900000"],
area:           専有面積                        senyuMensekiDisp : ["41.34"]
level:          所在階/構造・階建   BEFORE /
no_floors:      所在階/構造・階建   AFTER /
plan:           間取り                          madoriDisp : ["2DK"]
year:           完成時期（築年月）  BEFORE 年     kanseiDateDisp : "197911"
address:        住所
building-type:                                 ryoikiShuNm : '中古マンション'


area                            0.201484    area
building_type_apartment         0.064314    building_type_apartment
year                            0.036422    year
is_furnished_with_appliances    0.034590
has_delivery_box                0.031909
latitude                        0.031609    latitude
no_rooms                        0.027744    
longitude                       0.022122    longitude
no_floors                       0.021162    no_floors
layout_2LDK                     0.020122    l
has_system_kitchen              0.019310
layout_1LDK                     0.019068    l
layout_3K                       0.018329    l
has_private_garden              0.016308
layout_4K                       0.016281    l
layout_4DK                      0.014993    l
is_instrument_ok                0.014724
has_basement                    0.014499
is_office_use_allowed           0.012668
has_bathroom_dryer              0.012218
has_soundproof_room             0.011944
building_type_mansion           0.011404    building_type_mansion
has_elevator                    0.011175
is_tower_mansion                0.010824
is_pet_allowed                  0.010696
"""

# def clean_text(text):
#     return text.strip().split('（')[0].strip()
#     # return text #' '.join(text.split())


def clean_text(text):
    """Return text with its leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed


def extract_number(string):
    """
    Return the first run of digits found in string as an int.

    Returns None when string contains no digit at all.
    """
    first_digits = re.search(r'\d+', string)
    return int(first_digits.group()) if first_digits else None

def _parse_price_yen(text):
    """
    Convert a price cell such as '1190万円' or '1億2800万円' into yen.

    Only the text before the first '円' is read, so for a price range the
    lower bound is used. Returns None when that text is not made up of the
    units 億 (10^8), 万 (10^4) and plain yen.
    """
    amount = clean_text(text).split('円')[0].replace(',', '')
    match = re.fullmatch(r'(?:(\d+)億)?(?:(\d+)万)?(\d+)?', amount)
    # Every group is optional, so an empty string also "matches"; reject it.
    if not match or not any(match.groups()):
        return None
    oku, man, yen = (int(part) if part else 0 for part in match.groups())
    return oku * 100_000_000 + man * 10_000 + yen


def _parse_property_table(soup):
    """
    Extract the listing attributes from the summary table of a detail page.

    Returns a dict with whichever of these keys were found: price (yen),
    plan, area, level, no_floors, year, address. Returns an empty dict when
    the page has no <table summary="表">.
    """
    property_data = {}

    target_table = soup.find('table', summary="表")
    if target_table is None:
        return property_data

    # Iterate over each row in the table
    for row in target_table.find_all('tr'):
        # Pair up the th and td cells of the row
        for th, td in zip(row.find_all('th'), row.find_all('td')):
            header = th.text.strip()
            value = clean_text(td.text)

            if "価格" in header:
                price = _parse_price_yen(value)
                if price is None:
                    print(f"Unrecognized price format: {value!r}")
                else:
                    property_data['price'] = price
            elif "間取り" in header:
                property_data['plan'] = value
            elif "専有面積" in header:
                property_data['area'] = float(value.split('m2')[0])
            elif "所在階/構造・階建" in header:
                # Text before '/' holds the unit's floor, text after it the
                # number of floors of the building.
                level_details = value.split('/')
                property_data['level'] = extract_number(level_details[0])
                if len(level_details) > 1:
                    property_data['no_floors'] = extract_number(level_details[1])
            elif "完成時期（築年月）" in header:
                property_data['year'] = int(value.split('年')[0])
            elif "住所" in header:
                # Keep only the first line of the cell
                property_data['address'] = value.split('\n')[0]

    return property_data


pd_urls = pd.read_csv("/app/data/property_urls.csv", index_col=False)
df_urls = pd.DataFrame(pd_urls, columns=['Category', 'URL'])
df_urls.head()

list_data = []

count = 0
total = len(df_urls)
batch_size = 1000

# Bound up front so the final expression still works if the first fetch fails.
property_data = {}

for url in df_urls['URL']:
    print(f"1. URL: {url}")
    soup = get_html(url)

    if not soup:
        print(f"No soup in {url}")
        break

    property_data = _parse_property_table(soup)
    if not property_data:
        print(f"No property data found in {url}")

    # NOTE: plan, year and price are also exposed in the page's
    # `gapSuumoPcForKr` script variable (madoriDisp, kanseiDateDisp,
    # headerKakakuDisp); the summary table is parsed here instead.

    # Debug run: stop after the first URL to inspect the parsed result.
    break
property_data



1. URL: https://suumo.jp/ms/chuko/tokyo/sc_adachi/nc_75070928/


{'price': 11900000,
 'plan': '2DK',
 'area': 41.34,
 'level': 2,
 'no_floors': 4,
 'year': 1979,
 'address': '東京都足立区花畑５-2-9'}

In [None]:
# # def get_property_data(url):
    

# pd_urls = pd.read_csv("/app/data/property_urls.csv", index_col=False)
# df_urls = pd.DataFrame(pd_urls, columns=['Category', 'URL'])
# df_urls.head()

# property_data = []

# count = 0
# total = len(df_urls)
# batch_size = 1000

# for url in df_urls['URL']:
#     soup = get_html(url)

#     if not soup:
#         print(f"No soup in {url}")
#         property_data.append({'URL': url, 
#                         '向き': "掲載終了", 
#                         '間取り詳細': "掲載終了", 
#                         '物件の構造': "掲載終了", 
#                         '物件の階建': "掲載終了",
#                         '築年月': "掲載終了", 
#                         '損保': "掲載終了", 
#                         '駐車場': "掲載終了", 
#                         '取引態様': "掲載終了",
#                         '条件': "掲載終了", 
#                         '総戸数': "掲載終了", 
#                         '契約期間': "掲載終了", 
#                         '仲介手数料': "掲載終了",
#                         '保証会社': "掲載終了", 
#                         '備考': "掲載終了", 
#                         '部屋の特徴': "掲載終了"})
#         break 

#     # Find the "Direction"
#     try:
#         element = soup.find("table", {"class": "property_view_table"})
#         direction = element.find('th', string="向き").find_next_sibling('td').get_text(strip=True)
#     except AttributeError:
#         direction = 'Null'
    
#     # Find the required data table
#     tbodys = soup.find("table", {"class": "data_table table_gaiyou"})
    
#     # Define a function to safely fetch the data for each field
#     def get_text_safe(th_string):
#         try:
#             return tbodys.find('th', string=th_string).find_next_sibling('td').get_text(strip=True)
#         except AttributeError:
#             return 'Null'
    
#     # Use the function to fetch data for each field
#     layout_detail = get_text_safe('間取り詳細')
#     structure = get_text_safe('構造')
#     floor = get_text_safe('階建')
#     yearmonth = get_text_safe('築年月')
#     insurance = get_text_safe('損保')
#     parking = get_text_safe('駐車場')
#     transaction_type = get_text_safe('取引態様')
#     conditions = get_text_safe('条件')
#     total_units = get_text_safe('総戸数')
#     lease_term = get_text_safe('契約期間')
#     intermediary_fee = get_text_safe('仲介手数料')
#     guarantee_company = get_text_safe('保証会社')
#     note = get_text_safe('備考')

#     # Find room features
#     try:
#         li_element = soup.find('div', {'id': 'bkdt-option'}).find('li')
#         feature = li_element.get_text(strip=True)
#     except AttributeError:
#         feature = 'Null'

#     # Append the data to the list
#     property_data.append({
#         'URL': url,
#         '向き': direction,
#         '間取り詳細': layout_detail,
#         '物件の構造': structure,
#         '物件の階建': floor,
#         '築年月': yearmonth,
#         '損保': insurance,
#         '駐車場': parking,
#         '取引態様': transaction_type,
#         '条件': conditions,
#         '総戸数': total_units,
#         '契約期間': lease_term,
#         '仲介手数料': intermediary_fee,
#         '保証会社': guarantee_company,
#         '備考': note,
#         '部屋の特徴': feature
#     })
    
#     # Update the counter and print the current progress
#     count += 1
#     print(f"Processed {count}/{total} URLs")

#     # Periodically save property_data and clear the list from memory
#     if count % batch_size == 0:
#         partial_df = pd.DataFrame(property_data)
#         partial_df.to_csv(f'property_data_part_{count // batch_size}.csv', index=False)
#         property_data.clear()
#         gc.collect()  # Manually release memory
#         time.sleep(5)  # Rest 5 seconds to reduce server load
#     partial_df = pd.DataFrame(property_data)
#     partial_df.head()
#     break


In [None]:
# NOTE(review): within the visible notebook, `partial_df` is assigned only in
# commented-out code, so this preview presumably raises NameError unless that
# code is restored and run first — confirm.
partial_df.head()

Unnamed: 0,URL,向き,間取り詳細,物件の構造,物件の階建,築年月,損保,駐車場,取引態様,条件,総戸数,契約期間,仲介手数料,保証会社,備考,部屋の特徴
0,https://suumo.jp/ms/chuko/tokyo/sc_adachi/nc_7...,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null,Null


In [None]:
# for page in range(1, max_page + 1):
#     url = base_url.format(page) # Replace {} with page number
#     soup = get_html(url)        # Get parsed data

#     property_tags = soup.findAll("div", {"class": "cassetteitem"})  # All <div>'s with "cassetteitem" in the class attribute contain propetry info.
    
#     # print(f"Page: {page} - items: {property_tags}")
    
#     for property_tag in property_tags:
#         stations = property_tag.findAll("div", {"class": "cassetteitem_detail-text"})
        
#         # process each station
#         for station in stations:
#             # define variable 
#             base_data = {}

#             # collect base information    
#             # Name
#             base_data["名称"] = property_tag.find("div", {"class": "cassetteitem_content-title"}).getText().strip()
            
#             # category
#             base_data["カテゴリー"] = property_tag.find("div", {"class": "cassetteitem_content-label"}).getText().strip()

#             # address
#             base_data["アドレス"] = property_tag.find("li", {"class": "cassetteitem_detail-col1"}).getText().strip()

#             # nearby station
#             base_data["アクセス"] = station.getText().strip()

#             # building's age
#             base_data["築年数"] = property_tag.find("li", {"class": "cassetteitem_detail-col3"}).findAll("div")[0].getText().strip()

#             # structure
#             base_data["構造"] = property_tag.find("li", {"class": "cassetteitem_detail-col3"}).findAll("div")[1].getText().strip()
            
#             # process data for each room
#             tbodys = property_tag.find("table", {"class": "cassetteitem_other"}).findAll("tbody")
            
#             for tbody in tbodys:
#                 data = base_data.copy()

#                 # floor
#                 data["階数"] = tbody.findAll("td")[2].getText().strip()

#                 # rent
#                 data["家賃"] = tbody.findAll("td")[3].findAll("li")[0].getText().strip()

#                 # Management fee
#                 data["管理費"] = tbody.findAll("td")[3].findAll("li")[1].getText().strip()

#                 # deposit
#                 data["敷金"] = tbody.findAll("td")[4].findAll("li")[0].getText().strip()

#                 #gratuity fee
#                 data["礼金"] = tbody.findAll("td")[4].findAll("li")[1].getText().strip()

#                 #layout
#                 data["間取り"] = tbody.findAll("td")[5].findAll("li")[0].getText().strip()

#                 #area(m^2)
#                 data["面積"] = tbody.findAll("td")[5].findAll("li")[1].getText().strip()

#                 data["URL"] = "https://suumo.jp" + tbody.findAll("td")[8].find("a").get("href")
                
#                 all_data.append(data)
# df = pd.DataFrame(all_data)
# df.head()