In [1]:
import pandas as pd
from retry import retry
import requests
from bs4 import BeautifulSoup
import gc
import time

In [2]:
# Lift every pandas display limit (row count, column count, cell width) so
# that printed frames and rows are shown in full instead of being truncated.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

Due to memory limitations, the scraping process in the previous step (02_Scraping_Apt_Detail_1.ipynb) only reached the 71,000th URL.
Next, we will continue the scraping process from the 71,001st URL.

In [5]:
# Load the URL list from the CSV file
df = pd.read_csv('../output_20240617_unique_URL.csv')

# iloc is 0-based, so the 1-based row N is found at position N - 1
row_70999, row_71000, row_71001 = (df.iloc[position] for position in (70998, 70999, 71000))

# Show the rows on both sides of the resume point; compare them by hand with
# results_part_71.csv to verify that the previous run really stopped here
for heading, row in (
    ("Row 70999:", row_70999),
    ("\nRow71000:", row_71000),
    ("\nRow71001:", row_71001),
):
    print(heading)
    print(row)

# Confirm data is correct

Row 70999:
URL    https://suumo.jp/chintai/jnc_000090606649/?bc=100381141802
Name: 70998, dtype: object

Row71000:
URL    https://suumo.jp/chintai/jnc_000090537256/?bc=100384549744
Name: 70999, dtype: object

Row71001:
URL    https://suumo.jp/chintai/jnc_000091253665/?bc=100385486915
Name: 71000, dtype: object


In [7]:
# Keep only the URLs that still have to be scraped: row 71001 through the end.
# Positions are 0-based, so row 71001 is position 71000.

urls_to_crawl = df.iloc[71000:]['URL'].tolist()
print(len(urls_to_crawl))

64758


In [8]:
# Define crawl_url function to scrape the content of each URL.
# Retry policy: at most 3 attempts in total; wait 10 seconds before the second
# attempt and double the wait before each later one (10 s, then 20 s).
@retry(tries=3, delay=10, backoff=2)
def crawl_url(url, timeout=30):
    """Download ``url`` and return its parsed HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the whole
            crawl forever and the retry decorator would never get to run.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        The last exception raised by ``requests.get`` (connection error,
        timeout, ...) once every attempt has failed. HTTP error statuses are
        not turned into exceptions here: the body of an error page is parsed
        like any other page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup

# (result column, <th> text in the overview table) pairs, in output order
OVERVIEW_FIELDS = (
    ('間取り詳細', '間取り詳細'),
    ('物件の構造', '構造'),
    ('物件の階建', '階建'),
    ('築年月', '築年月'),
    ('損保', '損保'),
    ('駐車場', '駐車場'),
    ('取引態様', '取引態様'),
    ('条件', '条件'),
    ('総戸数', '総戸数'),
    ('契約期間', '契約期間'),
    ('仲介手数料', '仲介手数料'),
    ('保証会社', '保証会社'),
    ('備考', '備考'),
)

# Every column of a result row except 'URL', in output order
DETAIL_COLUMNS = ['向き'] + [column for column, _ in OVERVIEW_FIELDS] + ['部屋の特徴']


def _overview_value(table, header_text):
    """Return the text of the <td> that follows the <th> reading ``header_text``.

    Returns the string 'Null' when ``table`` is None, the header is missing,
    or the header has no <td> sibling.
    """
    try:
        return table.find('th', string=header_text).find_next_sibling('td').get_text(strip=True)
    except AttributeError:
        return 'Null'


def _extract_details(soup):
    """Collect all detail columns of one parsed listing page.

    Args:
        soup: Parsed page as returned by ``crawl_url``.

    Returns:
        A dict keyed by ``DETAIL_COLUMNS`` (same order). Any value that cannot
        be located on the page is the string 'Null'.
    """
    # "Direction" is read from the table with class "property_view_table"
    try:
        view_table = soup.find("table", {"class": "property_view_table"})
        direction = view_table.find('th', string="向き").find_next_sibling('td').get_text(strip=True)
    except AttributeError:
        direction = 'Null'
    details = {'向き': direction}

    # The remaining fields all come from the same overview table
    overview_table = soup.find("table", {"class": "data_table table_gaiyou"})
    for column, header_text in OVERVIEW_FIELDS:
        details[column] = _overview_value(overview_table, header_text)

    # Room features: text of the first <li> inside the element with id "bkdt-option"
    try:
        feature = soup.find('div', {'id': 'bkdt-option'}).find('li').get_text(strip=True)
    except AttributeError:
        feature = 'Null'
    details['部屋の特徴'] = feature

    return details


def _save_part(rows, part_number):
    """Write ``rows`` to results_part_<part_number>.csv without the index column."""
    pd.DataFrame(rows).to_csv(f'results_part_{part_number}.csv', index=False)


# Rows collected since the last save
results = []

# Progress counter and batch settings
url_count = 0
total_urls = len(urls_to_crawl)
batch_size = 1000  # Batch size for periodic saving

# This run resumes after results_part_71.csv, so its first file is part 72
parts_already_written = 71

# Record the start time
start_time = time.time()

# Iterate through each unique URL to scrape
for url in urls_to_crawl:
    try:
        # Scrape and parse the page, then pull out every field
        soup = crawl_url(url)
        results.append({'URL': url, **_extract_details(soup)})
    except Exception as e:
        # Any failure, including network errors that survive the retries, is
        # recorded with the "掲載終了" placeholder in every detail column so
        # that each URL still yields exactly one row.
        print(f"Failed to crawl {url}: {e}")
        results.append({'URL': url, **dict.fromkeys(DETAIL_COLUMNS, "掲載終了")})

    # Update the counter and print the current progress
    url_count += 1
    print(f"Processed {url_count}/{total_urls} URLs")

    # Periodically save results and clear the list from memory
    if url_count % batch_size == 0:
        _save_part(results, parts_already_written + url_count // batch_size)
        results.clear()
        gc.collect()  # Manually release memory
        time.sleep(5)  # Rest 5 seconds to reduce server load

# Save remaining results: the incomplete final batch goes into the next part
if results:
    _save_part(results, parts_already_written + url_count // batch_size + 1)

# Record end time
end_time = time.time()

# Calculate total execution time
total_time = end_time - start_time
print(f"Total execution time: {total_time:.2f} seconds ({total_time/3600:.2f} hours)")

Processed 1/64758 URLs
Processed 2/64758 URLs
Processed 3/64758 URLs
Processed 4/64758 URLs
Processed 5/64758 URLs
Processed 6/64758 URLs
Processed 7/64758 URLs
Processed 8/64758 URLs
Processed 9/64758 URLs
Processed 10/64758 URLs
Processed 11/64758 URLs
Processed 12/64758 URLs
Processed 13/64758 URLs
Processed 14/64758 URLs
Processed 15/64758 URLs
Processed 16/64758 URLs
Processed 17/64758 URLs
Processed 18/64758 URLs
Processed 19/64758 URLs
Processed 20/64758 URLs
Processed 21/64758 URLs
Processed 22/64758 URLs
Processed 23/64758 URLs
Processed 24/64758 URLs
Processed 25/64758 URLs
Processed 26/64758 URLs
Processed 27/64758 URLs
Processed 28/64758 URLs
Processed 29/64758 URLs
Processed 30/64758 URLs
Processed 31/64758 URLs
Processed 32/64758 URLs
Processed 33/64758 URLs
Processed 34/64758 URLs
Processed 35/64758 URLs
Processed 36/64758 URLs
Processed 37/64758 URLs
Processed 38/64758 URLs
Processed 39/64758 URLs
Processed 40/64758 URLs
Processed 41/64758 URLs
Processed 42/64758 URLs
P

In [16]:
# Read every per-batch CSV (parts 1 through 136) and stack them into a single
# DataFrame with a fresh 0..n-1 index
all_parts = []
for part_number in range(1, 137):
    all_parts.append(pd.read_csv(f'results_part_{part_number}.csv'))
result_df = pd.concat(all_parts, ignore_index=True)

# Preview the first two rows of the combined result
result_df.head(2)

Unnamed: 0,URL,向き,間取り詳細,物件の構造,物件の階建,築年月,損保,駐車場,取引態様,条件,総戸数,契約期間,仲介手数料,保証会社,備考,部屋の特徴
0,https://suumo.jp/chintai/jnc_000090405876/?bc=100380092265,南東,洋7.2 洋6 洋5.8 D17.4K3.5,鉄筋コン,1階/地下1地上3階建,2018年12月,要,-,仲介,楽器相談,21戸,定期借家 3年,Null,保証会社利用必 初年度委託料：賃料等合計額の50％（最低保証料2万円）2年目以降：1年毎8千円,弊社ではエリア・物件問わず、他社様が掲載しているお部屋も全てご紹介出来ます。何社も回らずに１回で済むので効率よくお部屋探しが出来ます！他に気になるお部屋がございましたら、お気軽にお申し付けください。最新の空室確認を行い、まとめてご案内致します。,バストイレ別、バルコニー、エアコン、ガスコンロ対応、フローリング、浴室乾燥機、オートロック、室内洗濯置、システムキッチン、追焚機能浴室、温水洗浄便座、エレベーター、駐輪場、宅配ボックス、3口以上コンロ、防犯カメラ、分譲賃貸、ウォークインクロゼット、バイク置場、CS、メゾネット、床暖房、食器洗乾燥機、トランクルーム、ディスポーザー、楽器相談、キッズルーム、セキュリティ会社加入済、シューズWIC、BS
1,https://suumo.jp/chintai/jnc_000091350228/?bc=100386147421,-,LDK16.40,鉄筋コン,2階/地下1地上8階建,2000年9月,2万円2年,-,仲介,楽器相談,-,Null,Null,保証会社利用必 初回：月額総賃料の50％年間保証料：10，000円,通勤管理,バストイレ別、バルコニー、エアコン、フローリング、オートロック、室内洗濯置、システムキッチン、エレベーター、洗面所独立、洗面化粧台、宅配ボックス、CATV、BS・CS、対面式キッチン、グリル付、ウォークインクロゼット、保証人不要、CATVインターネット、ガスレンジ付、楽器相談、東南向き、IT重説 対応物件、初期費用カード決済可


In [17]:
# Number of rows in the combined result
print(result_df.shape[0])

135758


In [18]:
# Save the combined result to a CSV file (index column omitted)
# NOTE(review): the date in this filename is 20200617, whereas the input file
# read earlier in this notebook is dated 20240617 — possibly a typo. Confirm
# that nothing downstream expects this exact name before renaming it.
result_df.to_csv('output_details_20200617.csv', index=False)

In [19]:
# Also persist the combined result as a pickle file.
# NOTE(review): the filename date (20200617) differs from the 20240617 input
# file used earlier in this notebook — confirm before renaming.
result_df.to_pickle('output_details_20200617.pkl')

In [None]:
# Read the Excel file
# df = pd.read_excel('output_20240617.xlsx')

# # Extract the first 50 rows
# df_first_50 = df.head(50)

# # Save the extracted data to a new Excel file
# df_first_50.to_excel('output_20240617_first_50.xlsx', index=False)

# Extract the unique URLs
# unique_urls = df_first_50['URL'].unique() #[:2]
# print(len(unique_urls))

In [3]:
# df = pd.read_excel('output_20240617_first_50.xlsx')
# unique_urls = df['URL'].unique()