In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException
from datetime import datetime
import pandas as pd
import os
import time


In [7]:
# --- Configuration ---------------------------------------------------------
# One search term per line in categories.txt. Blank lines are skipped so they
# do not turn into empty search queries or stray folders. The context manager
# closes the file handle as soon as the list is built.
with open('categories.txt') as categories_file:
    CATEGORIES = [line.strip() for line in categories_file if line.strip()]

OUTPUT_DIR = 'images'
CSV_LINK_FILE = 'image_links.csv'
MAX_IMAGES_PER_CATEGORY = 200
LOAD_WAIT = 2  # seconds passed to time.sleep() after loads, scrolls and clicks

# URLs recorded by earlier runs. On a first run the CSV does not exist yet,
# so start from an empty list instead of failing in pd.read_csv.
if os.path.exists(CSV_LINK_FILE):
    existing_urls = pd.read_csv(CSV_LINK_FILE)['url'].tolist()
else:
    existing_urls = []

# Create the output root and one sub-folder per category (no-op if present).
os.makedirs(OUTPUT_DIR, exist_ok=True)
for category in CATEGORIES:
    category_dir = os.path.join(OUTPUT_DIR, category)
    os.makedirs(category_dir, exist_ok=True)


In [8]:
def get_wikimedia_image_urls(query):
    """Collect thumbnail URLs from a Wikimedia Commons media search.

    Starts a Chrome session, opens the Special:MediaSearch results for
    ``query`` and repeatedly harvests the ``src`` (or ``data-src``) of each
    result thumbnail. Between passes it scrolls to the bottom of the page and
    clicks the "load more" button when one becomes clickable.

    Collection stops when any of the following happens:
      * ``MAX_IMAGES_PER_CATEGORY`` unique URLs have been gathered,
      * the page height did not change for 5 consecutive passes,
      * an exception is raised while scraping (it is printed, not re-raised).

    Args:
        query: Search term. It is URL-encoded before being placed in the
            query string, so spaces and characters such as ``&`` are safe.

    Returns:
        list: Unique thumbnail URLs in the order they were first seen, at
        most ``MAX_IMAGES_PER_CATEGORY`` entries.
    """
    # Imported locally so this cell works without touching the imports cell.
    from urllib.parse import quote_plus

    # dict keys serve as an insertion-ordered set: O(1) membership checks
    # while keeping discovery order for the returned list.
    image_urls = {}

    options = webdriver.ChromeOptions()
    # Optional Chrome flags, left disabled:
    # options.add_argument("--headless=new")
    # options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # try/finally guarantees the browser is shut down even when the initial
    # page load or a script call raises outside the inner handler.
    try:
        search_url = (
            "https://commons.wikimedia.org/w/index.php"
            f"?search={quote_plus(query)}&title=Special:MediaSearch&go=Go&type=image"
        )
        driver.get(search_url)
        time.sleep(LOAD_WAIT)

        last_height = driver.execute_script("return document.body.scrollHeight")
        stagnant_scrolls = 0
        max_stagnant_scrolls = 5

        while len(image_urls) < MAX_IMAGES_PER_CATEGORY:
            try:
                thumbs = driver.find_elements(By.CSS_SELECTOR, 'a.sdms-image-result > img')
                for thumb in thumbs:
                    src = thumb.get_attribute("src") or thumb.get_attribute("data-src")
                    if src and src not in image_urls:
                        image_urls[src] = None
                        print(f"Image count: {len(image_urls)}. Found image URL for {query}: {src}")
                        if len(image_urls) >= MAX_IMAGES_PER_CATEGORY:
                            break

                # Enough images: skip the scroll, sleep and button wait that
                # would otherwise run once more before the loop condition.
                if len(image_urls) >= MAX_IMAGES_PER_CATEGORY:
                    break

                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(LOAD_WAIT)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    stagnant_scrolls += 1
                else:
                    stagnant_scrolls = 0
                    last_height = new_height

                try:
                    load_more = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "button.sdms-load-more"))
                    )
                    # Click through JavaScript rather than the element's own click().
                    driver.execute_script("arguments[0].click();", load_more)
                    time.sleep(LOAD_WAIT)
                except TimeoutException:
                    print("Load more button not found within timeout.")

                if stagnant_scrolls >= max_stagnant_scrolls:
                    print("Reached page end with no more scrolling.")
                    break

            except Exception as e:
                # Best-effort scraping: report the problem and return what
                # has been collected so far.
                print("No more images or general exception:", e)
                break
    finally:
        driver.quit()

    return list(image_urls)

In [9]:
def save_links_to_csv(category, urls, csv_path=None):
    """Append previously unseen image URLs for ``category`` to the link CSV.

    Each new URL becomes one row with the columns ``category``, ``url``,
    ``source``, ``timestamp`` and ``downloaded``. URLs already present in the
    CSV, and repeats within ``urls`` itself, are skipped. If nothing new
    remains, a message is printed and the file is left untouched.

    Args:
        category: Label stored in the ``category`` column of every new row.
        urls: Iterable of URL strings to record.
        csv_path: Target CSV file. Defaults to the module-level
            ``CSV_LINK_FILE`` when omitted.

    Returns:
        None

    Raises:
        KeyError: If the existing CSV has no ``url`` column.
    """
    if csv_path is None:
        csv_path = CSV_LINK_FILE

    # Read the existing file once and reuse it for both the duplicate check
    # and the final merge.
    existing_df = None
    if os.path.exists(csv_path):
        try:
            existing_df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to parse; treat it as absent.
            existing_df = None

    seen_urls = set(existing_df['url']) if existing_df is not None else set()

    new_rows = []
    for url in urls:
        if url in seen_urls:
            continue
        # Track URLs as they are queued so repeats inside `urls` are also
        # dropped, including on the very first write.
        seen_urls.add(url)
        new_rows.append({
            "category": category,
            "url": url,
            "source": "Wikimedia Commons",
            "timestamp": datetime.now().isoformat(),
            "downloaded": "False"
        })

    if not new_rows:
        print("No new unique URLs to add.")
        return

    result_df = pd.DataFrame(new_rows)
    if existing_df is not None:
        # Keep the first occurrence so rows already on disk take precedence.
        result_df = pd.concat([existing_df, result_df], ignore_index=True).drop_duplicates(subset=['url'])
    result_df.to_csv(csv_path, index=False)

def scrape_wikimedia_links():
    """Scrape and record image links for every entry in ``CATEGORIES``.

    Categories are handled one at a time, in list order: the URLs returned by
    ``get_wikimedia_image_urls`` are handed to ``save_links_to_csv``, with a
    progress line printed before and after each search.
    """
    def _handle(category):
        # Fetch the links for one category, then persist them.
        print(f"Scraping category: {category}")
        urls = get_wikimedia_image_urls(category)
        print(f"Found {len(urls)} image URLs for category: {category}")
        save_links_to_csv(category, urls)

    for entry in CATEGORIES:
        _handle(entry)

In [10]:
# Run the scrape for every category in CATEGORIES. Each category opens its own
# Chrome session, and newly found links are written to CSV_LINK_FILE.
scrape_wikimedia_links()

Scraping category: person
Image count: 1. Found image URL for person: https://upload.wikimedia.org/wikipedia/commons/thumb/d/da/Person_Odessa.jpg/250px-Person_Odessa.jpg
Image count: 2. Found image URL for person: https://upload.wikimedia.org/wikipedia/commons/thumb/2/25/Aurora_Australis_Over_the_Tasman_Sea_from_SouthWest_National_Park.jpg/270px-Aurora_Australis_Over_the_Tasman_Sea_from_SouthWest_National_Park.jpg
Image count: 3. Found image URL for person: https://upload.wikimedia.org/wikipedia/commons/thumb/c/ca/Human_skeleton_front_en.svg/93px-Human_skeleton_front_en.svg.png
Image count: 4. Found image URL for person: https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/Dalian_Liaoning_China_Two-Chinese-at-Xinghai-Bay-01.jpg/330px-Dalian_Liaoning_China_Two-Chinese-at-Xinghai-Bay-01.jpg
Image count: 5. Found image URL for person: https://upload.wikimedia.org/wikipedia/commons/thumb/8/87/Saami_Family_1900.jpg/250px-Saami_Family_1900.jpg
Image count: 6. Found image URL for person: