In [4]:
import os
import hashlib
import requests
import pandas as pd
from PIL import Image, ImageOps
from io import BytesIO
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
import time



In [5]:
CATEGORIES = [
    "bicycle",
    "bus",
    "truck",
    "boat",
    "parking meter",
    "bench",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "book",
    "clock",
    "chair",
    "sofa",
    "bed"
]



In [6]:
CSV_LINK_FILE = 'image_links.csv'
CSV_METADATA_FILE = 'image_metadata.csv'
OUTPUT_DIR = 'scraped_images'
MAX_IMAGES_PER_CATEGORY = 500

os.makedirs(OUTPUT_DIR, exist_ok=True)  
for category in CATEGORIES:
    category_dir = os.path.join(OUTPUT_DIR, category)
    os.makedirs(category_dir, exist_ok=True)  # Create the category folder if it doesn't exist


In [7]:
def write_to_csv(csv_file, row):
    if not os.path.exists(csv_file):
        with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(['category', 'image_url', 'download_time', 'file_path'])  # Write header

    with open(csv_file, mode='r+', newline='', encoding='utf-8') as file:
        if row not in csv.reader(file):
            csv.writer(file).writerow(row)
        else:
            print(f"Row already exists in CSV: {row}")

def write_links_to_csv(csv_path, data):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    write_header = not os.path.exists(csv_path)
    with open(csv_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        if write_header:
            writer.writerow(['category', 'url', 'source', 'timestamp'])
        for row in data:
            writer.writerow(row)



In [8]:
def scroll_to_bottom(driver, max_scrolls=15):
    """Scrolls to the bottom of the page to load more content."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_count = 0

    while scroll_count < max_scrolls:  
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)  
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("Reached the bottom of the page or no new content to load.")
            break

        print(f"Scrolled {scroll_count + 1} times")
        last_height = new_height
        scroll_count += 1

def get_image_urls(query, classes, location, source):
    try:
        options = webdriver.ChromeOptions()
        options.add_argument('--headless=new')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        search_url = f"https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw={query}&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw={query}"
        print(f"Scraping URL: {search_url}")
        driver.get(search_url)

        scroll_to_bottom(driver)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        driver.quit()

        image_urls = []
        elements = soup.findAll(attrs={"class": classes})
        for element in elements:
            url = element.find(location)
            if url and url.get(source) and url.get(source) not in image_urls:
                image_urls.append(url.get(source))

        print(f"Found {len(image_urls)} image URLs.")
        return image_urls
    except Exception as e:
        print(f"Failed to scrape image URLs: {str(e)}")
        return []



In [9]:
def get_aspect_ratio(image):
    width, height = image.size
    aspect_ratio = width / height
    return aspect_ratio

def find_closest_aspect_ratio(image):
    aspect_ratios = {
        "1:1": 1.0,
        "4:3": 4 / 3,
        "16:9": 16 / 9
    }

    original_aspect_ratio = get_aspect_ratio(image)
    closest_ratio = min(aspect_ratios, key=lambda ratio: abs(aspect_ratios[ratio] - original_aspect_ratio))
    return closest_ratio

def resize_image(image, target_size):
    image = ImageOps.exif_transpose(image)
    new_image = Image.new("RGB", target_size, (255, 255, 255))
    image.thumbnail(target_size)
    x_offset = (target_size[0] - image.size[0]) // 2
    y_offset = (target_size[1] - image.size[1]) // 2
    new_image.paste(image, (x_offset, y_offset))
    return new_image

def resize_and_save(image, file_path):
    closest_ratio = find_closest_aspect_ratio(image)
    if closest_ratio == "1:1":
        resized_img = resize_image(image, (640, 640))
    elif closest_ratio == "4:3":
        resized_img = resize_image(image, (1024, 768))
    elif closest_ratio == "16:9":
        resized_img = resize_image(image, (1280, 720))
    resized_img.save(file_path.replace('.jpg', f'_{closest_ratio.replace(":", "-")}.jpg'))

def download_image_from_csv_row(category, url, output_dir):
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            image_content = response.content
            file_hash = hashlib.md5(image_content).hexdigest()
            image = Image.open(BytesIO(image_content))
            image_format = image.format

            if image_format == 'PNG':
                file_ext = 'png'
            elif image_format in ['JPEG', 'JPG', 'WEBP']:
                file_ext = 'jpg'
            else:
                print(f"Unsupported format: {image_format}")
                return

            category_path = os.path.join(output_dir, category)
            os.makedirs(category_path, exist_ok=True)
            file_path = os.path.join(category_path, f"{file_hash}.{file_ext}")

            if not os.path.exists(file_path):
                image.save(file_path, image_format)
                resize_and_save(image, file_path)
                print(f"Saved image: {file_path}")

                with open(CSV_METADATA_FILE, mode='a', newline='') as metafile:
                    meta_writer = csv.writer(metafile)
                    if os.stat(CSV_METADATA_FILE).st_size == 0:
                        meta_writer.writerow(['category', 'url', 'download_time', 'file_path'])
                    meta_writer.writerow([category, url, datetime.now(), file_path])
            else:
                print(f"Image already exists: {file_path}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

def download_images_from_csv(csv_path=CSV_LINK_FILE, output_dir=OUTPUT_DIR):
    df = pd.read_csv(csv_path)
    for _, row in df.iterrows():
        category = row['category']
        url = row['url']
        download_image_from_csv_row(category, url, output_dir)

In [None]:
def scrape_images():
    for category in CATEGORIES:
        query = category.replace(' ', '+')
        print(f'Scraping images for: {category}')
        image_urls = get_image_urls(query, "s-item__image-wrapper image-treatment", "img", "src")
        timestamp = datetime.now()
        link_rows = [(category, url, "ebay", timestamp) for url in image_urls[:MAX_IMAGES_PER_CATEGORY]]
        write_links_to_csv(category, CSV_LINK_FILE, link_rows)
        print(f"Saved {len(link_rows)} links for {category}")

In [11]:
scrape_images()


Scraping images for: bicycle
Scraping URL: https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2334524.m570.l1313&_nkw=bicycle&_sacat=0&LH_TitleDesc=0&_osacat=0&_odkw=bicycle
Scrolled 1 times
Scrolled 2 times
Reached the bottom of the page or no new content to load.
Found 72 image URLs.


FileNotFoundError: [WinError 3] The system cannot find the path specified: ''

In [None]:
download_images_from_csv()