In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import time  # Used for time.sleep() waits while pages load and scroll
import os
import datetime

# Start the Chrome session that Selenium drives for every page request
driver = webdriver.Chrome()

# Search queries to scrape (already URL-encoded); each one is expanded into
# a Tokopedia search-result URL that the page number is later appended to.
search_terms = [
    "hoodie",
    "keyboard",
    "mac",
    "keycaps",
    "laptop%20gaming",
    "switch",
    "ps5",
    "iphone",
]
urls = [
    f"https://www.tokopedia.com/search?st=&q={term}"
    "&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&navsource="
    for term in search_terms
]

# Scraping limits and output location
timeout_minutes = 5   # minutes without new data before a query is abandoned
data_limit = 15_000   # product count at which scraping is meant to stop
pages_per_url = 150   # result pages requested per search query
output_file = 'tokopedia_data.csv'

# Helper that tolerates selectors which matched nothing
def get_text_or_na(element):
    """Return the stripped text of ``element``, or "N/A" when it is missing.

    ``element`` is expected to expose ``get_text()``; any falsy value
    (e.g. ``None`` from a failed lookup) yields the "N/A" placeholder.
    """
    if not element:
        return "N/A"
    raw_text = element.get_text()
    return raw_text.strip()


# Total number of page requests if every query runs through all of its pages
# (pages per query multiplied by the number of queries).
# NOTE(review): not referenced anywhere else in the visible cell; presumably a
# leftover total for a progress bar. Confirm before removing.
total_pages = pages_per_url * len(urls)

# Tells the scraper when a query has gone too long without yielding new rows
def has_timeout_exceeded(last_update_time):
    """Return True when more than ``timeout_minutes`` have elapsed.

    ``last_update_time`` is the ``datetime.datetime`` of the most recent
    successful update; it is compared against the current local time.
    """
    elapsed_seconds = (datetime.datetime.now() - last_update_time).total_seconds()
    allowed_seconds = timeout_minutes * 60
    return elapsed_seconds > allowed_seconds

# Output columns in CSV order, each paired with the (tag, class) selector used
# to locate that field inside a single product box.
PRODUCT_FIELDS = (
    ('Product', 'span', "OWkG6oHwAppMn1hIBsC3pQ=="),
    ('Price', 'div', "_8cR53N0JqdRc+mQCckhS0g=="),
    ('Seller', 'span', "X6c-fdwuofj6zGvLKVUaNQ== -9tiTbQgmU1vCjykywQqvA== flip"),
    ('Location', 'span', "-9tiTbQgmU1vCjykywQqvA== flip"),
    ('Sold', 'span', "eLOomHl6J3IWAcdRU8M08A=="),
    ('Rating', 'span', "nBBbPk9MrELbIUbobepKbQ=="),
)


def _extract_page_data(page_html):
    """Parse one search-result page into a DataFrame of product rows.

    Every product box contributes one row; fields whose selector matches
    nothing become missing values, and rows where every field is missing
    are dropped. Returns an empty DataFrame when no usable rows are found.
    """
    soup = BeautifulSoup(page_html, "html.parser")
    rows = [
        {
            column: get_text_or_na(box.find(tag, {"class": css_class}))
            for column, tag, css_class in PRODUCT_FIELDS
        }
        for box in soup.find_all('div', {"class": "css-5wh65g"})
    ]
    page_data = pd.DataFrame(rows, columns=[column for column, _, _ in PRODUCT_FIELDS])
    # Turn the "N/A" placeholders into real missing values so that rows with
    # no extracted field at all can be discarded.
    return page_data.replace("N/A", pd.NA).dropna(how='all')


# Running count of rows saved during this run, across all pages and URLs.
# The limit has to be checked against this cumulative total: a per-page count
# can never come close to data_limit.
total_scraped = 0
limit_reached = False

try:
    # Loop through each URL
    for base_url in urls:
        last_update_time = datetime.datetime.now()  # Start the no-data timer for this URL
        for current_page in range(1, pages_per_url + 1):
            # Load the current result page
            driver.get(f"{base_url}&page={current_page}")

            # Wait for the page to load
            time.sleep(5)

            # Scroll down in small increments so lazily loaded products render
            for _ in range(10):
                driver.execute_script("window.scrollBy(0, 500);")
                time.sleep(3)

            page_data = _extract_page_data(driver.page_source)

            # Only save (and reset the timer) when the page yielded rows
            if not page_data.empty:
                # Append to the CSV; write the header only when creating the file
                page_data.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)
                total_scraped += len(page_data)
                last_update_time = datetime.datetime.now()  # Reset the timer

            # Stop the whole run once the data limit has been reached
            if total_scraped >= data_limit:
                print(f"Data limit reached: {total_scraped} products scraped.")
                limit_reached = True
                break

            # Give up on this URL if it produced no new data for too long
            if has_timeout_exceeded(last_update_time):
                print(f"No data update for {timeout_minutes} minutes. Moving to the next URL.")
                break  # Break the page loop and move to the next URL

        # The inner break only leaves the page loop; leave the URL loop as well
        if limit_reached:
            break
finally:
    # Always close the browser, including when the run is interrupted or a
    # page request raises; otherwise the Chrome session is left running.
    driver.quit()

print(f"Data has been progressively saved to {output_file}")

No data update for 5 minutes. Moving to the next URL.


KeyboardInterrupt: 