In [32]:
import os
import hashlib
import requests
import pandas as pd
from PIL import Image
from io import BytesIO
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import csv
import time



In [33]:
# --- Scraper configuration ---------------------------------------------------
# categories.txt holds one search term per line. The file is read inside a
# `with` block so the handle is closed, surrounding whitespace is stripped,
# and blank lines are skipped so they never become an empty search query.
with open('categories.txt') as categories_file:
    CATEGORIES = [line.strip() for line in categories_file if line.strip()]
print(CATEGORIES)

OUTPUT_DIR = 'images'              # root folder for downloaded images
MAX_IMAGES_PER_CATEGORY = 500      # cap on successful downloads per category
CSV_FILE = 'image_metadata.csv'    # metadata log, one row appended per download
os.makedirs(OUTPUT_DIR, exist_ok=True)


['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'trains', 'truck', 'boat', 'traffic', 'light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [34]:
from selenium.webdriver.chrome.options import Options

# Build the Chrome options: '--headless' runs the browser without a window;
# the other two switches are passed through unchanged from the original setup
# (NOTE(review): presumably needed for container/CI environments - confirm).
options = Options()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# webdriver_manager resolves (and installs if necessary) a chromedriver binary;
# its path is handed to Selenium through a Service object.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)


In [35]:
# Reset the metadata file: opening in 'w' mode truncates whatever an earlier
# run left behind, so the CSV starts with just the column header row.
with open(CSV_FILE, mode='w', newline='') as metadata_file:
    csv.writer(metadata_file).writerow(
        ['category', 'image_url', 'download_time', 'file_path']
    )

In [None]:
def click_load_more(max_scrolls=15, pause_seconds=3):
    """Scroll the current page to the bottom repeatedly so more results load.

    Each pass first tries to click the "Load more" button and then scrolls to
    the bottom of the document. The loop ends after ``max_scrolls`` passes or
    as soon as a scroll leaves ``document.body.scrollHeight`` unchanged.

    Operates on the module-level ``driver``.

    Args:
        max_scrolls: Maximum number of scroll passes (default 15).
        pause_seconds: Seconds to wait after a button click and after each
            scroll so new content can load (default 3).
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_count = 0
    while scroll_count < max_scrolls:
        try:
            # NOTE(review): these class names look build-generated and may
            # change when the site is redeployed - confirm the selector.
            load_more_button = driver.find_element(By.CSS_SELECTOR, 'button.cDy10.vbbXa.XKj1X.aZVYw.vqRTY.vGtHf.s5zyR.nwXgM.pYP1f')
            load_more_button.click()
            time.sleep(pause_seconds)  # Wait for new images to load
        except Exception:
            # Best effort: a missing or unclickable button just means we rely
            # on scrolling. `Exception` (not a bare except) is caught so that
            # Ctrl-C / KeyboardInterrupt still stops the scrape.
            pass

        # Scroll to bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Page height did not change, so nothing new was loaded.
            break
        print(f'Scrolled {scroll_count + 1} times')
        last_height = new_height
        scroll_count += 1

def extract_image_urls(driver, dump_path="page_content.txt"):
    """Collect image URLs from the ``img.czQTa`` elements on the current page.

    For every matching element two sources are considered:
      * ``src`` - kept only when it contains ``https://images.unsplash.com/``;
      * ``srcset`` - the URL of the last candidate in the list is kept
        (NOTE(review): presumably the largest rendition - confirm ordering).

    Args:
        driver: Selenium WebDriver positioned on the page to inspect.
        dump_path: File that receives a prettified copy of the page HTML for
            debugging. Pass ``None`` (or an empty string) to skip the dump.

    Returns:
        A set of URL strings (empty when no matching images are found).
    """
    if dump_path:
        with open(dump_path, "w", encoding="utf-8") as file:
            # Name the parser explicitly so the dump does not depend on which
            # optional parsers happen to be installed.
            file.write(BeautifulSoup(driver.page_source, "html.parser").prettify())

    img_urls = set()
    for img in driver.find_elements(By.CSS_SELECTOR, 'img.czQTa'):
        src = img.get_attribute('src')
        srcset = img.get_attribute('srcset')

        if src and 'https://images.unsplash.com/' in src:
            img_urls.add(src)

        if srcset:
            # Candidates look like "<url> <descriptor>" separated by commas.
            # Entries after the first begin with a space, so split on any
            # whitespace and ignore empty entries; otherwise the extracted
            # "URL" would be an empty string.
            candidates = [
                entry.split()[0] for entry in srcset.split(',') if entry.strip()
            ]
            if candidates:
                img_urls.add(candidates[-1])
    return img_urls

def download_image(url, category):
    """Download one image, save it under ``OUTPUT_DIR/<category>/`` and log it.

    The file name is the MD5 hex digest of the downloaded bytes (used only as
    a content fingerprint so identical downloads map to the same file) with a
    ``.jpg`` extension. A metadata row ``[category, url, timestamp, path]`` is
    appended to ``CSV_FILE`` for every successful call, including calls where
    the file was already on disk.

    Args:
        url: Address of the image to fetch.
        category: Sub-folder name (and CSV category value) for the image.

    Returns:
        True when the image was fetched (HTTP 200), stored or already
        present, and logged; False on any failure. Failures are printed and
        never raised.
    """
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            # Report the failure instead of returning False silently.
            print(f'Failed to download {url}: HTTP {response.status_code}')
            return False

        # Decoding also validates that the payload really is an image.
        image = Image.open(BytesIO(response.content))
        file_hash = hashlib.md5(response.content).hexdigest()
        category_dir = os.path.join(OUTPUT_DIR, category)
        file_path = os.path.join(category_dir, f'{file_hash}.jpg')

        if os.path.exists(file_path):
            print(f'Image already exists: {file_path}')
        else:
            # The per-category folder is not created anywhere else; without
            # this every save would fail with FileNotFoundError.
            os.makedirs(category_dir, exist_ok=True)
            # JPEG cannot store alpha or palette images, so convert those.
            if image.mode not in ('L', 'RGB', 'CMYK'):
                image = image.convert('RGB')
            image.save(file_path)

        with open(CSV_FILE, mode='a', newline='') as file:
            csv.writer(file).writerow([category, url, datetime.now(), file_path])
        return True
    except Exception as e:
        print(f'Failed to download {url}: {e}')
    return False


def scrape_images():
    """Run the whole scrape: one Unsplash search per entry in ``CATEGORIES``.

    For each category the search page is opened with the module-level
    ``driver``, expanded with ``click_load_more``, harvested with
    ``extract_image_urls``, and the resulting URLs are handed to
    ``download_image`` until ``MAX_IMAGES_PER_CATEGORY`` downloads have
    succeeded or the URLs run out.
    """
    for category in CATEGORIES:
        # Spaces become hyphens to form the search slug, which is also used
        # as the category name passed to download_image.
        query = '-'.join(category.strip().split(' '))
        search_url = 'https://unsplash.com/s/photos/' + query

        print(f'Scraping images for: {query}')
        driver.get(search_url)
        time.sleep(3)

        # Reveal as many results as possible before reading the page.
        click_load_more()

        found_urls = extract_image_urls(driver)
        print(f'Found {len(found_urls)} images for category: {category}')

        saved = 0
        for candidate_url in found_urls:
            if not download_image(candidate_url, query):
                continue
            # Only successful downloads count toward the per-category cap.
            saved += 1
            if saved >= MAX_IMAGES_PER_CATEGORY:
                break


In [37]:
# Always shut the browser down, even when the scrape fails or is interrupted
# (e.g. Ctrl-C), so no headless Chrome process is left running.
try:
    scrape_images()
finally:
    driver.quit()

Scraping images for: person


KeyboardInterrupt: 