In [21]:
# Search terms to scrape; scrape_images() runs one eBay query per entry,
# in the order listed here.
CATEGORIES = [
    "bicycle", "bus", "truck", "boat", "parking meter", "bench",
    "backpack", "umbrella", "handbag", "tie", "suitcase",
    "frisbee", "skis", "snowboard", "sports ball", "kite",
    "baseball bat", "baseball glove", "skateboard",
    "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink",
    "book", "clock", "chair", "sofa", "bed",
]



In [None]:
from PIL import Image, ImageOps
import os
import csv
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException

import time


In [None]:
# CSV file that scraped image links are appended to (relative to the
# notebook's working directory).
CSV_LINK_FILE = "image_links.csv"

# Cap on how many scraped URLs are kept per category before writing.
MAX_IMAGES_PER_CATEGORY = 500

In [24]:
def scroll_to_bottom(driver, max_scrolls=5, pause_seconds=2):
    """Scroll the page down repeatedly so lazily loaded content can render.

    Each round scrolls to the current bottom of the document, waits, and
    re-reads ``document.body.scrollHeight``. Scrolling stops as soon as the
    height stops growing, or after ``max_scrolls`` rounds, whichever comes
    first.

    Args:
        driver: Object exposing ``execute_script(js)``; the notebook passes a
            Selenium WebDriver.
        max_scrolls: Maximum number of scroll rounds (default 5).
        pause_seconds: Seconds to wait after each scroll before measuring the
            page height again (default 2).

    Returns:
        None
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to append more content before re-measuring.
        time.sleep(pause_seconds)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged: nothing more was loaded, so stop early.
            break
        last_height = new_height

In [None]:
def get_image_urls(query, classes, location, source):
    """Scrape one eBay search-results page and return the image URLs found.

    A fresh headless Chrome session loads the search page for ``query``,
    the page is scrolled with ``scroll_to_bottom``, and the resulting HTML is
    parsed with BeautifulSoup.

    Args:
        query: Search term interpolated verbatim into the URL; any escaping
            is the caller's responsibility.
        classes: Value matched against each element's ``class`` attribute.
        location: Tag name looked up inside every matched element (only the
            first such tag per element is used).
        source: Name of the attribute on that tag which holds the URL.

    Returns:
        list: Unique attribute values in order of first appearance. An empty
        list is returned (and the error printed) if any step raises an
        ``Exception``.
    """
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless=new')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        try:
            search_url = f"https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw={query}&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw={query}"
            print(f"Scraping URL: {search_url}")
            driver.get(search_url)

            scroll_to_bottom(driver)
            page_source = driver.page_source
        finally:
            # Always shut the browser down, including when navigation fails
            # or the cell is interrupted; otherwise Chrome processes pile up.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')

        image_urls = []
        for element in soup.find_all(attrs={"class": classes}):
            tag = element.find(location)
            if not tag:
                continue
            value = tag.get(source)
            # Skip missing/empty attributes and values already collected.
            if value and value not in image_urls:
                image_urls.append(value)

        print(f"Found {len(image_urls)} image URLs.")
        return image_urls
    except Exception as e:
        # Best-effort scrape: report the failure and let the caller continue.
        print(f"Failed to scrape image URLs: {str(e)}")
        return []

In [None]:
def write_links_to_csv(csv_path, data):
    """Append link rows to a CSV file, skipping URLs that are already recorded.

    Args:
        csv_path: Path of the CSV file. A header row is written first when
            the file does not exist yet or is empty.
        data: Iterable of rows shaped
            ``(category, url, source, timestamp, downloaded)``; the value at
            index 1 (the URL) is the de-duplication key.

    Raises:
        KeyError: If an existing file contains data rows but has no ``url``
            column in its header.
    """
    header = ['category', 'url', 'source', 'timestamp', 'downloaded']

    # A zero-byte file is treated like a missing one so it still gets a header.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    # Load URLs that are already stored so they are not appended again.
    seen_urls = set()
    if not needs_header:
        with open(csv_path, mode='r', newline='', encoding='utf-8') as f:
            seen_urls = {row['url'] for row in csv.DictReader(f)}

    # Keep only rows with a URL not seen before, either in the file or
    # earlier in this same batch.
    new_rows = []
    for row in data:
        url = row[1]
        if url in seen_urls:
            continue
        seen_urls.add(url)
        new_rows.append(row)

    with open(csv_path, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerows(new_rows)


def scrape_images():
    """Collect eBay image links for every entry in ``CATEGORIES``.

    For each category the name is turned into a search term (spaces become
    ``+``), image URLs are fetched with ``get_image_urls``, at most
    ``MAX_IMAGES_PER_CATEGORY`` of them are kept, and the resulting rows are
    passed to ``write_links_to_csv`` for ``CSV_LINK_FILE``. Progress is
    printed along the way.
    """
    for category in CATEGORIES:
        search_term = "+".join(category.split(" "))
        print(f'Scraping images for: {category}')

        found_urls = get_image_urls(
            search_term,
            "s-item__image-wrapper image-treatment",
            "img",
            "src",
        )

        # One shared timestamp for every link gathered in this category pass.
        scraped_at = datetime.now()

        link_rows = []
        for url in found_urls[:MAX_IMAGES_PER_CATEGORY]:
            link_rows.append((category, url, "ebay", scraped_at, False))

        write_links_to_csv(CSV_LINK_FILE, link_rows)
        print(f"Saved {len(link_rows)} links for {category}")

In [29]:
# Run the scrape over every entry in CATEGORIES. get_image_urls starts a new
# headless Chrome session per category and scroll_to_bottom sleeps between
# scrolls, so expect this cell to take a while.
scrape_images()

Scraping images for: bicycle
Scraping URL: https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=bicycle&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw=bicycle


KeyboardInterrupt: 