In [2]:
import pandas as pd
from retry import retry
import requests
from bs4 import BeautifulSoup
import gc
import time

In [3]:
# Load the listing table (the input is a CSV file, not an Excel workbook)
df = pd.read_csv('output_20240617.csv')

# Deduplicate the URL column and show how many distinct URLs remain
unique_urls = pd.unique(df['URL'])
print(len(unique_urls))

135758


In [8]:
# Wrap the array of unique URLs in a one-column DataFrame named 'URL'
unique_urls_df = pd.DataFrame({'URL': unique_urls})

# Persist the deduplicated URL list, without the index column
unique_urls_df.to_csv('output_20240617_unique_URL.csv', index=False)

In [None]:
# Define crawl_url function to fetch and parse the content of one URL.
# `tries=3` is the total number of attempts: after a failed attempt the
# decorator waits 10 seconds, then 20 seconds (backoff=2), and the exception
# from the last attempt is re-raised to the caller.
@retry(tries=3, delay=10, backoff=2)
def crawl_url(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block forever and
            the retry decorator would never get a chance to run.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes with
        the 'html.parser' backend.

    Raises:
        Exception: whatever the last attempt raised (for example a
            ``requests`` connection or timeout error) once all attempts
            have failed.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status code is not checked here: error pages are parsed like
    # any other page and the caller decides what missing fields mean.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# Buffer for scraped rows; the crawl loop empties it after each saved batch
results = []

# Number of URLs handled so far, and how many there are in total
url_count = 0
total_urls = len(unique_urls)

# A partial CSV is written every time this many URLs have been processed
batch_size = 1_000

# Wall-clock reference used for the run-time report after the crawl
start_time = time.time()

# Placeholder stored when a field cannot be found on a page
MISSING_VALUE = 'Null'

# Placeholder stored in every field when a page could not be crawled at all
FAILED_VALUE = "掲載終了"

# <th> label in the overview table -> output column name.
# Keeping this mapping in one place guarantees that successful rows and
# failure rows always carry exactly the same columns, in the same order.
OVERVIEW_FIELDS = {
    '間取り詳細': '間取り詳細',
    '構造': '物件の構造',
    '階建': '物件の階建',
    '築年月': '築年月',
    '損保': '損保',
    '駐車場': '駐車場',
    '取引態様': '取引態様',
    '条件': '条件',
    '総戸数': '総戸数',
    '契約期間': '契約期間',
    '仲介手数料': '仲介手数料',
    '保証会社': '保証会社',
    '備考': '備考',
}

# Every output column except 'URL', in the order they appear in a row
OUTPUT_COLUMNS = ['向き', *OVERVIEW_FIELDS.values(), '部屋の特徴']


def _get_td_text(table, th_label):
    """Return the stripped text of the <td> that follows a given <th>.

    Args:
        table: Parsed element to search in, or ``None`` when the table was
            not found on the page.
        th_label: Exact string content of the <th> cell to look for.

    Returns:
        The text of the sibling <td>, or ``MISSING_VALUE`` when the table,
        the <th> or the <td> is absent (each of those surfaces as an
        ``AttributeError`` on ``None``).
    """
    try:
        return table.find('th', string=th_label).find_next_sibling('td').get_text(strip=True)
    except AttributeError:
        return MISSING_VALUE


def _parse_listing(soup):
    """Extract all output fields (everything except 'URL') from one page.

    Args:
        soup: Parsed page as returned by ``crawl_url``.

    Returns:
        A dict keyed by ``OUTPUT_COLUMNS``; fields that are not present on
        the page hold ``MISSING_VALUE``.
    """
    # "Direction" lives in a separate table from the other fields
    view_table = soup.find("table", {"class": "property_view_table"})
    row = {'向き': _get_td_text(view_table, "向き")}

    # All remaining tabular fields come from the overview table
    overview_table = soup.find("table", {"class": "data_table table_gaiyou"})
    for th_label, column in OVERVIEW_FIELDS.items():
        row[column] = _get_td_text(overview_table, th_label)

    # Room features: text of the first <li> inside the options <div>
    try:
        row['部屋の特徴'] = soup.find('div', {'id': 'bkdt-option'}).find('li').get_text(strip=True)
    except AttributeError:
        row['部屋の特徴'] = MISSING_VALUE

    return row


# Iterate through each unique URL to scrape
for url in unique_urls:
    try:
        # Scrape and parse the web page content, then pull out the fields
        row = {'URL': url, **_parse_listing(crawl_url(url))}
    except Exception as e:
        # NOTE: any failure (network error, parser error, ...) is recorded
        # with the same placeholder, so such rows cannot be told apart from
        # one another in the output.
        print(f"Failed to crawl {url}: {e}")
        row = {'URL': url, **dict.fromkeys(OUTPUT_COLUMNS, FAILED_VALUE)}

    results.append(row)

    # Update the counter and print the current progress
    url_count += 1
    print(f"Processed {url_count}/{total_urls} URLs")

    # Periodically save results and clear the list from memory
    if url_count % batch_size == 0:
        partial_df = pd.DataFrame(results)
        partial_df.to_csv(f'results_part_{url_count // batch_size}.csv', index=False)
        results.clear()
        gc.collect()  # Manually release memory
        time.sleep(5)  # Rest 5 seconds to reduce server load

# Write out whatever is left over from the final, partially filled batch
if len(results) > 0:
    partial_df = pd.DataFrame(results)
    partial_df.to_csv(f'results_part_{(url_count // batch_size) + 1}.csv', index=False)

# Measure how long the whole crawl took and report it in seconds and hours
end_time = time.time()
total_time = end_time - start_time
print(f"Total execution time: {total_time:.2f} seconds ({total_time/3600:.2f} hours)")

Processed 1/135758 URLs
Processed 2/135758 URLs
Processed 3/135758 URLs
Processed 4/135758 URLs
Processed 5/135758 URLs
Processed 6/135758 URLs
Processed 7/135758 URLs
Processed 8/135758 URLs
Processed 9/135758 URLs
Processed 10/135758 URLs
Processed 11/135758 URLs
Processed 12/135758 URLs
Processed 13/135758 URLs
Processed 14/135758 URLs
Processed 15/135758 URLs
Processed 16/135758 URLs
Processed 17/135758 URLs
Processed 18/135758 URLs
Processed 19/135758 URLs
Processed 20/135758 URLs
Processed 21/135758 URLs
Processed 22/135758 URLs
Processed 23/135758 URLs
Processed 24/135758 URLs
Processed 25/135758 URLs
Processed 26/135758 URLs
Processed 27/135758 URLs
Processed 28/135758 URLs
Processed 29/135758 URLs
Processed 30/135758 URLs
Processed 31/135758 URLs
Processed 32/135758 URLs
Processed 33/135758 URLs
Processed 34/135758 URLs
Processed 35/135758 URLs
Processed 36/135758 URLs
Processed 37/135758 URLs
Processed 38/135758 URLs
Processed 39/135758 URLs
Processed 40/135758 URLs
Processed