In [1]:
import csv
import os
import time
from datetime import datetime
from urllib.parse import quote

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
# --- Scraper configuration ---------------------------------------------------
# One search term per line in categories.txt. The file is read inside a `with`
# block so the handle is closed, and blank lines are dropped so they do not
# turn into an empty category (empty search query / bogus folder) later on.
with open('categories.txt') as categories_file:
    CATEGORIES = [line.strip() for line in categories_file if line.strip()]

OUTPUT_DIR = 'images'               # root folder; one sub-folder per category is created below
CSV_LINK_FILE = 'image_links.csv'   # CSV that scrape_images() appends the collected links to
MAX_IMAGES_PER_CATEGORY = 500       # upper bound on links kept per category in scrape_images()

# Create the output tree up front. exist_ok=True makes this cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for category in CATEGORIES:
    category_dir = os.path.join(OUTPUT_DIR, category)
    os.makedirs(category_dir, exist_ok=True)  # Create the category folder if it doesn't exist
    print(f"Created folder: {category_dir}")


Created folder: images\person
Created folder: images\bicycle
Created folder: images\car
Created folder: images\motorbike
Created folder: images\aeroplane
Created folder: images\bus
Created folder: images\train
Created folder: images\truck
Created folder: images\boat
Created folder: images\traffic
Created folder: images\light
Created folder: images\fire hydrant
Created folder: images\stop sign
Created folder: images\parking meter
Created folder: images\bench
Created folder: images\bird
Created folder: images\cat
Created folder: images\dog
Created folder: images\horse
Created folder: images\sheep
Created folder: images\cow
Created folder: images\elephant
Created folder: images\bear
Created folder: images\zebra
Created folder: images\giraffe
Created folder: images\backpack
Created folder: images\umbrella
Created folder: images\handbag
Created folder: images\tie
Created folder: images\suitcase
Created folder: images\frisbee
Created folder: images\skis
Created folder: images\snowboard
Creat

In [None]:
def scroll_to_bottom(driver, scroll_times=15, delay=2):
    """Scroll the loaded page to the bottom repeatedly, clicking "Load more" when shown.

    Each pass scrolls to the current bottom of the document, waits ``delay``
    seconds, then looks for the "Load more" button and clicks it if it is
    displayed. Button handling is best-effort: a failure is printed and the
    loop continues with the next pass.

    Args:
        driver: Selenium WebDriver (any object exposing ``execute_script`` and
            ``find_elements``) with the target page already loaded.
        scroll_times: Number of scroll passes to perform.
        delay: Seconds to sleep after each scroll and after each click.

    Returns:
        None. Progress is reported with ``print``.
    """
    # NOTE(review): these look like auto-generated class names, so the selector
    # is presumably tied to the site's current markup -- confirm it still matches.
    load_more_selector = "button.cDy10.vbbXa.XKj1X.aZVYw.vqRTY.vGtHf.s5zyR.nwXgM.pYP1f"

    for i in range(scroll_times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)

        try:
            # find_elements returns an empty list when nothing matches, so a
            # missing button is no longer confused with a real failure
            # (stale element, intercepted click, ...).
            buttons = driver.find_elements("css selector", load_more_selector)
            if not buttons:
                print(f"No 'Load more' button after scroll {i+1} or already clicked.")
            elif buttons[0].is_displayed():
                buttons[0].click()
                print(f"Clicked 'Load more' on scroll {i+1}")
                time.sleep(delay)
        except Exception as exc:
            # Keep scrolling, but surface what actually went wrong.
            print(f"Could not use 'Load more' button after scroll {i+1}: {exc!r}")

        print(f"Scrolled {i+1} times.")


In [4]:
def get_unsplash_image_urls(query):
    """Collect image URLs from the Unsplash search-results page for ``query``.

    Starts a headless Chrome session, opens the search page, scrolls it with
    ``scroll_to_bottom`` and then parses the rendered HTML for ``<img>`` tags
    carrying ``data-testid="photo-grid-masonry-img"``.

    Args:
        query: Search term. It is percent-encoded before being placed in the
            URL path, so terms with spaces or reserved characters are safe.

    Returns:
        list[str]: Every non-empty ``src`` attribute found, in document order.
        No de-duplication is performed here.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # try/finally guarantees the browser is shut down even if loading or
    # scrolling raises or the cell is interrupted; otherwise each failed
    # category would leave a Chrome process behind.
    try:
        search_url = f"https://unsplash.com/s/photos/{quote(query, safe='')}"
        print(f"Scraping URL: {search_url}")
        driver.get(search_url)
        scroll_to_bottom(driver)
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    image_elements = soup.find_all("img", {"data-testid": "photo-grid-masonry-img"})
    image_urls = [img.get("src") for img in image_elements if img.get("src")]
    print(f"Found {len(image_urls)} image URLs.")
    return image_urls

In [5]:
def write_links_to_csv(csv_path, data):
    """Append link rows to a CSV file, skipping URLs that are already recorded.

    A URL is skipped when it is already present in the file's ``url`` column
    or when it appeared earlier in the same ``data`` batch.

    Args:
        csv_path: Path of the CSV file. A header row is written first when the
            file is missing or empty.
        data: Iterable of row sequences laid out as
            ``(category, url, source, timestamp)``; the URL is read from index 1.

    Returns:
        None.
    """
    # An existing zero-byte file is treated like a missing one, so it still
    # gets a header; otherwise the next call would fail to find a 'url' column.
    has_content = os.path.exists(csv_path) and os.path.getsize(csv_path) > 0

    seen_urls = set()
    if has_content:
        with open(csv_path, mode='r', newline='', encoding='utf-8') as f:
            for existing in csv.DictReader(f):
                # .get() tolerates a file whose header has no 'url' column.
                url = existing.get('url')
                if url:
                    seen_urls.add(url)

    # Track URLs while filtering so duplicates inside this batch are dropped too.
    new_rows = []
    for row in data:
        url = row[1]
        if url in seen_urls:
            continue
        seen_urls.add(url)
        new_rows.append(row)

    with open(csv_path, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not has_content:
            writer.writerow(['category', 'url', 'source', 'timestamp'])
        writer.writerows(new_rows)

def scrape_images():
    """Scrape Unsplash links for every entry in ``CATEGORIES`` and log them to CSV.

    For each category the URLs returned by ``get_unsplash_image_urls`` are
    truncated to ``MAX_IMAGES_PER_CATEGORY``, tagged with the category, the
    source name and a single per-category timestamp, and handed to
    ``write_links_to_csv`` for ``CSV_LINK_FILE``.
    """
    for category in CATEGORIES:
        print(f"Scraping images for: {category}")
        found_urls = get_unsplash_image_urls(category)

        # One timestamp per category, taken after scraping has finished.
        scraped_at = datetime.now()

        link_rows = []
        for url in found_urls[:MAX_IMAGES_PER_CATEGORY]:
            link_rows.append((category, url, "unsplash", scraped_at))

        write_links_to_csv(CSV_LINK_FILE, link_rows)
        print(f"Saved {len(link_rows)} links for {category}")


In [6]:
# Run the full scrape: one headless-browser session per entry in CATEGORIES,
# appending the collected links to CSV_LINK_FILE.
scrape_images()

Scraping images for: person
Scraping URL: https://unsplash.com/s/photos/person
Scrolled 1 times.
Reached end of page.
Found 20 image URLs.
Saved 20 links for person
Scraping images for: bicycle
Scraping URL: https://unsplash.com/s/photos/bicycle
Scrolled 1 times.
Scrolled 2 times.
Reached end of page.
Found 20 image URLs.
Saved 20 links for bicycle
Scraping images for: car
Scraping URL: https://unsplash.com/s/photos/car


KeyboardInterrupt: 