In [None]:
import os
import hashlib
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
import time


In [20]:
# Scrape configuration.
# Categories are read one per line from categories.txt; only the first two
# lines are used. The file is opened in a context manager so the handle is
# closed as soon as the list has been read.
with open('categories.txt') as categories_file:
    CATEGORIES = categories_file.read().splitlines()[:2]
OUTPUT_DIR = 'images'              # directory that receives the saved image files
MAX_IMAGES_PER_CATEGORY = 500      # per-category cap on successful downloads
CSV_FILE = 'image_metadata.csv'    # metadata log: one row per saved image
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [21]:
# Start a fresh metadata file: mode 'w' truncates any existing CSV_FILE, then
# the header row is written. download_image appends one row per saved image.
with open(CSV_FILE, mode='w', newline='') as metadata_file:
    csv.writer(metadata_file).writerow(
        ['category', 'image_url', 'download_time', 'file_path']
    )

In [None]:
def download_image(url, category):
    """Fetch one image, store it under OUTPUT_DIR and log it in CSV_FILE.

    The image is kept only when the URL starts with 'http', the server
    answers with status 200, and the decoded image mode is "RGB" or "L".
    The file is named '<category>_<md5 of the response bytes>.jpg'.

    Args:
        url: Address of the image to fetch.
        category: Label used as the file-name prefix and in the metadata row.

    Returns:
        True when the image was saved and its metadata row appended;
        False otherwise. Any exception is printed and reported as False.
    """
    try:
        if not url.startswith('http'):
            return False

        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return False

        payload = response.content
        picture = Image.open(BytesIO(payload))
        if picture.mode not in ("RGB", "L"):
            return False

        # Name the file after the hash of the downloaded bytes.
        digest = hashlib.md5(payload).hexdigest()
        file_path = os.path.join(OUTPUT_DIR, f'{category}_{digest}.jpg')
        picture.save(file_path)

        # Append the metadata row for this image.
        with open(CSV_FILE, mode='a', newline='') as metadata_file:
            csv.writer(metadata_file).writerow(
                [category, url, datetime.now(), file_path]
            )
        return True
    except Exception as e:
        print(f'Failed to download {url}: {str(e)}')
        return False


def get_image_urls(query):
    """Collect image URLs from a Google Images results page.

    Loads the search page in headless Chrome, waits a fixed five seconds
    for it to render, then parses the page source with BeautifulSoup and
    gathers the ``src`` of every ``<img>`` tag.

    Args:
        query: Search terms, interpolated into the URL as-is (no URL
            encoding is applied here, so the caller must pass a URL-safe
            string, e.g. with spaces already replaced by '+').

    Returns:
        A list of ``src`` values that start with 'http' and do not end
        with '.svg', in document order.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    search_url = f'https://www.google.com/search?q={query}&tbm=isch'
    try:
        driver.get(search_url)
        time.sleep(5)  # give the page time to render its thumbnails
        content = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page failed,
        # so no Chrome process is left behind.
        driver.quit()

    soup = BeautifulSoup(content, 'html.parser')

    image_urls = []
    # find_all must be *called* with the tag name; BeautifulSoup tags expose
    # attributes through .get(), which returns None when 'src' is absent.
    for img in soup.find_all('img'):
        img_url = img.get('src')
        if img_url and img_url.startswith('http') and not img_url.endswith('.svg'):
            image_urls.append(img_url)
    return image_urls


def scrape_images():
    """Download images for every entry in CATEGORIES.

    For each category, the spaces in its name are replaced with '+' to
    form the search query, candidate URLs come from get_image_urls, and
    each is passed to download_image until MAX_IMAGES_PER_CATEGORY
    downloads have succeeded or the candidates run out.
    """
    for category in CATEGORIES:
        search_terms = '+'.join(category.split(' '))
        print(f'Scraping images for: {category}')

        saved = 0
        for candidate in get_image_urls(search_terms):
            if not download_image(candidate, category):
                continue
            saved += 1
            if saved >= MAX_IMAGES_PER_CATEGORY:
                break



In [23]:
# Run the scrape. No driver cleanup is needed here: the browser is created
# and quit inside get_image_urls, and no `driver` name exists at notebook
# scope, so a trailing driver.quit() would raise NameError on a fresh kernel.
scrape_images()

Scraping images for: person
Scraping images for: bicycle
