In [6]:
import os
import hashlib
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
import time



In [7]:
CATEGORIES = open('categories.txt').readlines()
CATEGORIES = [x.strip() for x in CATEGORIES][:4]
CATEGORIES = ['bicycle', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
print(CATEGORIES)
OUTPUT_DIR = 'images'
MAX_IMAGES_PER_CATEGORY = 500
CSV_FILE = 'image_metadata.csv'
os.makedirs(OUTPUT_DIR, exist_ok=True)


['bicycle', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [8]:
with open(CSV_FILE, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['category', 'image_url', 'download_time', 'file_path'])

In [9]:
def download_image(url, category):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            image_content = response.content
            file_hash = hashlib.md5(image_content).hexdigest()
            file_name = f'{file_hash}.jpg'
            file_path = os.path.join(OUTPUT_DIR, category, file_name)

            # Check if the file already exists
            if os.path.exists(file_path):
                print(f"Image already exists: {file_path}")
                return False

            # Save the image
            os.makedirs(os.path.join(OUTPUT_DIR, category), exist_ok=True)  # Ensure category folder exists
            image_file = BytesIO(image_content)
            image = Image.open(image_file).convert('RGB')
            image.save(file_path, 'PNG')
            print(f"Saved image: {file_path}")

            # Save metadata
            with open(CSV_FILE, mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([category, url, datetime.now(), file_path])
            return True
    except Exception as e:
        print(f"Failed to download {url}: {str(e)}")
    return False


def get_image_urls(query, classes, location, source):
    try:
        # Set up Selenium WebDriver
        options = webdriver.ChromeOptions()
        options.add_argument('--headless=new')  # Run Chrome in headless mode
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        # Construct the search URL
        search_url = f"https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw={query}&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw={query}"
        print(f"Scraping URL: {search_url}")
        driver.get(search_url)

        # Wait for the page to load
        time.sleep(5)

        # Get the page source and parse it with BeautifulSoup
        content = driver.page_source
        soup = BeautifulSoup(content, 'html.parser')

        # Close the WebDriver
        driver.quit()

        # Extract image URLs
        image_urls = []
        elements = soup.findAll(attrs={"class": classes})
        for element in elements:
            url = element.find(location)
            if url and url not in image_urls:
                image_urls.append(url.get(source))

        print(f"Found {len(image_urls)} image URLs.")
        return image_urls
    except Exception as e:
        print(f"Failed to scrape image URLs: {str(e)}")
        return []

def scrape_images():
    for category in CATEGORIES:
        query = category.replace(' ', '+')
        print(f'Scraping images for: {category}')
        image_urls = get_image_urls(query, "s-item__image-wrapper image-treatment", "img", "src")

        count = 0
        for img_url in image_urls:
            if download_image(img_url, category):
                count += 1
                if count >= MAX_IMAGES_PER_CATEGORY:
                    break

def download_image(url, category):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content))
            file_hash = hashlib.md5(response.content).hexdigest()
            file_name = f'{category}_{file_hash}.jpg'
            file_path = os.path.join(OUTPUT_DIR, file_name)
            image.save(file_path)

            with open(CSV_FILE, mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([category, url, datetime.now(), file_path])
            return True
    except Exception as e:
        print(f'Failed to download {url}: {e}')
    return False



In [10]:
scrape_images()


Scraping images for: bicycle
Scraping URL: https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=bicycle&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw=bicycle
Found 62 image URLs.
Failed to download https://ir.ebaystatic.com/rs/v/fxxj3ttftm5ltcqnto1o4baovyl.png: cannot write mode P as JPEG
Failed to download https://ir.ebaystatic.com/rs/v/fxxj3ttftm5ltcqnto1o4baovyl.png: cannot write mode P as JPEG


KeyboardInterrupt: 