In [1]:
import csv
import os
import time
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# --- Configuration -----------------------------------------------------------
# categories.txt holds one category name per line. The first line is skipped
# (presumably a header row -- confirm against the file).
with open('categories.txt') as categories_file:  # context manager closes the handle
    CATEGORIES = [line.strip() for line in categories_file][1:]
# Drop blank lines so they do not turn into an empty folder name / empty query.
CATEGORIES = [name for name in CATEGORIES if name]
CSV_LINK_FILE = 'image_links.csv'
MAX_IMAGES_PER_CATEGORY = 100
OUTPUT_DIR = 'images'

# Create the output root and one sub-folder per category (no-op if present).
os.makedirs(OUTPUT_DIR, exist_ok=True)
for category in CATEGORIES:
    category_dir = os.path.join(OUTPUT_DIR, category)
    os.makedirs(category_dir, exist_ok=True)  # Create the category folder if it doesn't exist
    print(f"Created folder: {category_dir}")


Created folder: images\bicycle
Created folder: images\car
Created folder: images\motorbike
Created folder: images\aeroplane
Created folder: images\bus
Created folder: images\train
Created folder: images\truck
Created folder: images\boat
Created folder: images\traffic
Created folder: images\light
Created folder: images\fire hydrant
Created folder: images\stop sign
Created folder: images\parking meter
Created folder: images\bench
Created folder: images\bird
Created folder: images\cat
Created folder: images\dog
Created folder: images\horse
Created folder: images\sheep
Created folder: images\cow
Created folder: images\elephant
Created folder: images\bear
Created folder: images\zebra
Created folder: images\giraffe
Created folder: images\backpack
Created folder: images\umbrella
Created folder: images\handbag
Created folder: images\tie
Created folder: images\suitcase
Created folder: images\frisbee
Created folder: images\skis
Created folder: images\snowboard
Created folder: images\sports ball


In [3]:
def dismiss_flickr_popup(driver):
    """Best-effort dismissal of the Flickr call-to-action dialog.

    Waits up to 10 seconds for an element with class ``cta-dialog`` to be
    present, then sends the ESC key to the page. Any failure along the way
    (including the wait timing out) is reported with a message and otherwise
    ignored, so this never raises to the caller.

    Args:
        driver: The Selenium WebDriver controlling the page.

    Returns:
        None.
    """
    try:
        wait = WebDriverWait(driver, 10)
        dialog_present = EC.presence_of_element_located((By.CLASS_NAME, "cta-dialog"))
        wait.until(dialog_present)
        ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    except Exception:
        print("Popup not detected or already dismissed.")
    else:
        print("Popup dismissed with ESC key.")


def scroll_and_load_flickr(driver, delay=5, max_stagnant_scrolls=1):
    """Scroll the page to the bottom repeatedly until it stops growing.

    Each round scrolls down in half-viewport steps, waits ``delay`` seconds,
    tries to dismiss the Flickr popup, and then looks for a clickable
    ``button.alt`` element (treated as the "Load more results" button). If the
    button is clicked, the stagnation counter is reset and another round
    starts. Otherwise the loop ends once the document height has been
    unchanged for ``max_stagnant_scrolls`` consecutive rounds.

    Args:
        driver: The Selenium WebDriver controlling the page.
        delay: Seconds to wait after reaching the bottom and after clicking
            the load-more button.
        max_stagnant_scrolls: Number of consecutive rounds with an unchanged
            document height after which scrolling stops.

    Returns:
        None.

    Note:
        The outer loop has no upper bound on rounds: it keeps going as long
        as the load-more button keeps being found and clicked.
    """
    print("Starting continuous scroll and load cycle...")
    screen_height = driver.execute_script("return window.innerHeight;")
    # A viewport height of 0/1 (or None) would give a step of 0 and make the
    # inner scroll loop spin forever, so always advance by at least 1 pixel.
    scroll_step = max(1, int(screen_height or 0) // 2)
    stagnant_scrolls = 0
    prev_height = driver.execute_script("return document.body.scrollHeight")
    current_scroll = 0

    while True:
        total_height = driver.execute_script("return document.body.scrollHeight")

        # Scroll incrementally, continuing from the previous round's position.
        while current_scroll < total_height:
            driver.execute_script(f"window.scrollTo(0, {current_scroll});")
            current_scroll += scroll_step
            time.sleep(0.25)
        print("Reached bottom, waiting for more images or load more button...")

        time.sleep(delay)
        dismiss_flickr_popup(driver)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == prev_height:
            stagnant_scrolls += 1
        else:
            stagnant_scrolls = 0
        prev_height = new_height

        try:
            load_more_btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.alt"))
            )
            if load_more_btn.is_displayed():
                load_more_btn.click()
                print("Clicked 'Load more results' button.")
                time.sleep(delay)
                stagnant_scrolls = 0
                continue
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit can still stop this loop.
            print("No 'Load more results' button detected.")

        if stagnant_scrolls >= max_stagnant_scrolls:
            print("Page has stopped growing. Ending scroll.")
            break


In [4]:
def get_flickr_image_urls(query):
    """Collect image URLs from a Flickr search results page.

    Launches a headless Chrome session, opens the Flickr search page for
    ``query``, dismisses the popup, scrolls until the page stops loading more
    results, and then parses the final page source for ``<img>`` tags inside
    ``div.photo-list-photo-container``.

    Args:
        query: Search text. It is URL-encoded before being placed in the
            query string, so spaces and characters such as ``&`` are safe.

    Returns:
        list[str]: Unique image URLs containing ``staticflickr.com``, in page
        order. Protocol-relative URLs (``//...``) are prefixed with
        ``https:``.

    Raises:
        Any exception raised while driving the browser propagates to the
        caller; the browser is shut down first.
    """
    options = webdriver.ChromeOptions()
    for flag in ('--headless=new', '--no-sandbox',
                 '--disable-dev-shm-usage', '--enable-javascript'):
        options.add_argument(flag)
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # quote_plus keeps multi-word or punctuated queries from corrupting the URL.
        search_url = f"https://www.flickr.com/search/?text={quote_plus(query)}&view_all=1"
        driver.get(search_url)

        dismiss_flickr_popup(driver)
        scroll_and_load_flickr(driver)

        page_source = driver.page_source
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    image_urls = []
    seen_urls = set()  # O(1) membership test; the list preserves page order
    for img in soup.select("div.photo-list-photo-container img"):
        src = img.get("src")
        if not src:
            continue
        if src.startswith("//"):
            src = "https:" + src
        if "staticflickr.com" in src and src not in seen_urls:
            seen_urls.add(src)
            image_urls.append(src)

    print(f"Found {len(image_urls)} Flickr image URLs.")
    return image_urls

In [5]:
def save_links_to_csv(category, urls, source="Flickr"):
    """Append not-yet-recorded image URLs to the CSV link file.

    Reads ``CSV_LINK_FILE`` if it exists, skips every URL already present in
    its ``url`` column (and any URL repeated within ``urls``), and writes the
    combined table back. Each new row has the columns ``category``, ``url``,
    ``source``, ``timestamp`` (ISO-format local time) and ``downloaded``
    (the string ``"False"``).

    Args:
        category: Category label stored with every new row.
        urls: Iterable of image URLs to record.
        source: Label stored in the ``source`` column. Defaults to
            ``"Flickr"``; the previous hard-coded ``"Wikimedia Commons"``
            did not match where these URLs are scraped from.

    Returns:
        None. Prints a message and leaves the file untouched when there is
        nothing new to add.
    """
    existing_df = None
    existing_urls = set()
    if os.path.exists(CSV_LINK_FILE):
        try:
            existing_df = pd.read_csv(CSV_LINK_FILE)
        except pd.errors.EmptyDataError:
            # File exists but has no header/rows: treat it as absent.
            existing_df = None
        else:
            existing_urls = set(existing_df['url'])

    new_rows = []
    for url in urls:
        if url in existing_urls:
            continue
        existing_urls.add(url)  # also de-duplicates within this batch
        new_rows.append({
            "category": category,
            "url": url,
            "source": source,
            "timestamp": datetime.now().isoformat(),
            "downloaded": "False"
        })

    if not new_rows:
        print("No new unique URLs to add.")
        return

    new_df = pd.DataFrame(new_rows)
    if existing_df is None:
        combined_df = new_df
    else:
        # Reuse the frame read above instead of reading the CSV a second time.
        combined_df = pd.concat([existing_df, new_df], ignore_index=True)
        combined_df = combined_df.drop_duplicates(subset=['url'])
    combined_df.to_csv(CSV_LINK_FILE, index=False)

In [6]:
def scrape_images():
    """Scrape and record image links for every configured category.

    For each name in ``CATEGORIES``, fetches the Flickr image URLs for that
    name and hands them to ``save_links_to_csv``, printing progress before
    and after the fetch.

    Returns:
        None.
    """
    for name in CATEGORIES:
        print(f"Scraping category: {name}")
        found_urls = get_flickr_image_urls(name)
        url_count = len(found_urls)
        print(f"Found {url_count} image URLs for category: {name}")
        save_links_to_csv(name, found_urls)

In [7]:
# Run the scrape for every entry in CATEGORIES. Each category opens its own
# headless Chrome session and appends any new links to CSV_LINK_FILE.
scrape_images()

Scraping category: bicycle


Popup not detected or already dismissed.
Starting continuous scroll and load cycle...
Reached bottom, waiting for more images or load more button...
Popup dismissed with ESC key.
No 'Load more results' button detected.
Reached bottom, waiting for more images or load more button...
Popup not detected or already dismissed.
No 'Load more results' button detected.
Page has stopped growing. Ending scroll.
Found 45 Flickr image URLs.
Found 45 image URLs for category: bicycle
Scraping category: car
Popup not detected or already dismissed.
Starting continuous scroll and load cycle...
Reached bottom, waiting for more images or load more button...
Popup dismissed with ESC key.
No 'Load more results' button detected.
Reached bottom, waiting for more images or load more button...
Popup not detected or already dismissed.
No 'Load more results' button detected.
Page has stopped growing. Ending scroll.
Found 47 Flickr image URLs.
Found 47 image URLs for category: car
Scraping category: motorbike
Popu