In [24]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime
import os
import csv
import time
import pandas as pd

In [25]:
# --- Configuration -----------------------------------------------------------
# categories.txt holds one search term per line. The file is read inside a
# `with` block so the handle is closed deterministically, and blank lines are
# skipped so they never become an empty category (which would otherwise map to
# OUTPUT_DIR itself and to an empty search query).
with open('categories.txt') as categories_file:
    CATEGORIES = [line.strip() for line in categories_file if line.strip()]

OUTPUT_DIR = 'images'              # root folder; one sub-folder per category is created below
CSV_LINK_FILE = 'image_links.csv'  # CSV that save_links_to_csv() reads and rewrites
# NOTE(review): not referenced by any cell visible here -- presumably consumed
# by a later download step; confirm before removing.
MAX_IMAGES_PER_CATEGORY = 500

# Create the output tree up front. exist_ok=True makes the cell safe to re-run.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for category in CATEGORIES:
    category_dir = os.path.join(OUTPUT_DIR, category)
    os.makedirs(category_dir, exist_ok=True)  # no-op when the folder already exists
    print(f"Created folder: {category_dir}")


Created folder: images\person
Created folder: images\bicycle
Created folder: images\car
Created folder: images\motorbike
Created folder: images\aeroplane
Created folder: images\bus
Created folder: images\train
Created folder: images\truck
Created folder: images\boat
Created folder: images\traffic
Created folder: images\light
Created folder: images\fire hydrant
Created folder: images\stop sign
Created folder: images\parking meter
Created folder: images\bench
Created folder: images\bird
Created folder: images\cat
Created folder: images\dog
Created folder: images\horse
Created folder: images\sheep
Created folder: images\cow
Created folder: images\elephant
Created folder: images\bear
Created folder: images\zebra
Created folder: images\giraffe
Created folder: images\backpack
Created folder: images\umbrella
Created folder: images\handbag
Created folder: images\tie
Created folder: images\suitcase
Created folder: images\frisbee
Created folder: images\skis
Created folder: images\snowboard
Creat

In [26]:
def scroll_to_bottom(
    driver,
    scroll_times=15,
    delay=0.25,
    load_more_selector="button.cDy10.vbbXa.XKj1X.aZVYw.vqRTY.vGtHf.s5zyR.nwXgM.pYP1f",
):
    """Scroll the page to the bottom repeatedly, clicking "Load more" when present.

    Args:
        driver: Browser driver exposing ``execute_script(script)`` and
            ``find_element(by, value)`` (a Selenium WebDriver in this notebook).
        scroll_times: Number of scroll iterations to perform.
        delay: Seconds to sleep after each scroll and after each click.
        load_more_selector: CSS selector used to locate the "Load more" button.
            The default is the class chain this notebook was written against;
            pass a different selector when the site's markup changes.

    Returns:
        None. Progress is reported with ``print``.
    """
    for i in range(scroll_times):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)

        try:
            load_more_button = driver.find_element("css selector", load_more_selector)
            if load_more_button.is_displayed():
                load_more_button.click()
                print(f"Clicked 'Load more' on scroll {i+1}")
                time.sleep(delay)
        except Exception:
            # Deliberately best-effort: a missing button or a failed click must
            # not abort the remaining scroll iterations.
            print(f"No 'Load more' button after scroll {i+1} or already clicked.")

        print(f"Scrolled {i+1} times.")


In [27]:
def get_unsplash_image_urls(query):
    """Collect image ``src`` URLs from the Unsplash search page for ``query``.

    A headless Chrome session is started, the search page is opened and
    scrolled with ``scroll_to_bottom``, and the rendered HTML is parsed for
    ``<img data-testid="photo-grid-masonry-img">`` elements.

    Args:
        query: Search term. It is percent-encoded before being placed in the
            URL path so spaces and reserved characters cannot corrupt the URL.

    Returns:
        List of the non-empty ``src`` attribute values, in document order.

    Raises:
        Any exception raised while starting the browser, loading the page or
        scrolling is propagated; the browser is shut down first.
    """
    from urllib.parse import quote  # local import: only this function needs it

    options = webdriver.ChromeOptions()
    options.add_argument('--headless=new')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    search_url = f"https://unsplash.com/s/photos/{quote(query, safe='')}"
    print(f"Scraping URL: {search_url}")
    try:
        driver.get(search_url)
        scroll_to_bottom(driver)
        page_source = driver.page_source
    finally:
        # Always release the browser, even when navigation or scrolling fails;
        # otherwise each failed category leaves a headless Chrome running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')
    image_elements = soup.find_all("img", {"data-testid": "photo-grid-masonry-img"})
    image_urls = [img.get("src") for img in image_elements if img.get("src")]
    print(f"Found {len(image_urls)} image URLs.")
    return image_urls

In [28]:
def save_links_to_csv(category, urls, source="Unsplash"):
    """Add not-yet-recorded image URLs for ``category`` to ``CSV_LINK_FILE``.

    Each new URL becomes one row with the columns ``category``, ``url``,
    ``source``, ``timestamp`` (ISO-8601 local time) and ``downloaded``
    (written as ``"False"``). A URL is skipped when it is already in the CSV
    or appeared earlier in ``urls``, so the file never gains duplicates --
    including on the very first write, when no CSV exists yet.

    Args:
        category: Category label stored with every new row.
        urls: Iterable of image URLs to record.
        source: Provenance label stored with every new row. Defaults to
            "Unsplash" because the scraper in this notebook collects its URLs
            from unsplash.com.

    Returns:
        None. The CSV is created or rewritten in place; when nothing new is
        found the file is left untouched and a message is printed.
    """
    if os.path.exists(CSV_LINK_FILE):
        existing_df = pd.read_csv(CSV_LINK_FILE)
        seen_urls = set(existing_df['url'])
    else:
        existing_df = None
        seen_urls = set()

    new_rows = []
    for url in urls:
        if url in seen_urls:
            continue
        seen_urls.add(url)  # also guards against repeats inside `urls` itself
        new_rows.append({
            "category": category,
            "url": url,
            "source": source,
            "timestamp": datetime.now().isoformat(),
            "downloaded": "False"
        })

    if not new_rows:
        print("No new unique URLs to add.")
        return

    new_df = pd.DataFrame(new_rows)
    if existing_df is None:
        combined_df = new_df
    else:
        # drop_duplicates keeps the first occurrence, so rows already on disk
        # win and any duplicate URLs previously stored in the file collapse.
        combined_df = pd.concat([existing_df, new_df]).drop_duplicates(subset=['url'])
    combined_df.to_csv(CSV_LINK_FILE, index=False)

def scrape_images():
    """Scrape every category in ``CATEGORIES`` and record the links found.

    For each category, in order, the image URLs are fetched with
    ``get_unsplash_image_urls`` and handed to ``save_links_to_csv``.
    Progress is printed before and after each fetch. Returns None.
    """
    for name in CATEGORIES:
        print(f"Scraping category: {name}")
        found = get_unsplash_image_urls(name)
        total = len(found)
        print(f"Found {total} image URLs for category: {name}")
        save_links_to_csv(name, found)


In [None]:
# Run the scrape for every entry in CATEGORIES. Each category starts its own
# headless Chrome session and scrolls the search page, so this cell is slow
# and needs network access.
scrape_images()

Scraping category: person


Scraping URL: https://unsplash.com/s/photos/person
Clicked 'Load more' on scroll 1
Scrolled 1 times.
No 'Load more' button after scroll 2 or already clicked.
Scrolled 2 times.
No 'Load more' button after scroll 3 or already clicked.
Scrolled 3 times.
No 'Load more' button after scroll 4 or already clicked.
Scrolled 4 times.
No 'Load more' button after scroll 5 or already clicked.
Scrolled 5 times.
No 'Load more' button after scroll 6 or already clicked.
Scrolled 6 times.
No 'Load more' button after scroll 7 or already clicked.
Scrolled 7 times.
No 'Load more' button after scroll 8 or already clicked.
Scrolled 8 times.
No 'Load more' button after scroll 9 or already clicked.
Scrolled 9 times.
No 'Load more' button after scroll 10 or already clicked.
Scrolled 10 times.
No 'Load more' button after scroll 11 or already clicked.
Scrolled 11 times.
No 'Load more' button after scroll 12 or already clicked.
Scrolled 12 times.
No 'Load more' button after scroll 13 or already clicked.
Scrolled 1