# In [None]:
import csv
import json
import os
import time
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image

# The 20 search phrases to scrape; each one is also used as a sub-folder name.
categories = [
    "Dogs",
    "Cats",
    "Cars",
    "Bikes",
    "Chairs",
    "Phones",
    "Mountains",
    "Laptops",
    "Bridges",
    "Birds",
    "Trains",
    "Planets",
    "Monuments",
    "Flowers",
    "Street Food",
    "Waterfalls",
    "Ancient Temples",
    "Futuristic Cities",
    "Wildlife",
    "Space Exploration",
]

# Root directory for downloads; created up front so later joins always resolve.
output_folder = "downloaded_images"
os.makedirs(output_folder, exist_ok=True)

# Metadata CSV: opened once at module level and kept open for the whole run.
# newline="" lets the csv module control line endings itself.
csv_file = open("image_metadata.csv", mode="w", newline="", encoding="utf-8")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(("Category", "Image URL", "Filename", "Resolution"))

# Browser-like User-Agent sent with every HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
}

def get_bing_image_urls(category, max_images=50):
    """Return direct image URLs scraped from a Bing Image Search results page.

    Args:
        category: Search phrase, sent as the ``q`` query parameter.
        max_images: Maximum number of URLs to return.

    Returns:
        A list of at most ``max_images`` URLs starting with ``http``, taken
        from the ``murl`` field of each result anchor's ``m`` attribute.
        An empty list is returned if the search request fails.
    """
    try:
        # Pass the query through ``params`` so spaces and reserved characters
        # (``&``, ``#``, ``+``) in the phrase are encoded instead of being
        # spliced raw into the URL. The timeout keeps a stalled connection
        # from hanging the whole run.
        response = requests.get(
            "https://www.bing.com/images/search",
            params={"q": category, "form": "HDRSC2"},
            headers=headers,
            timeout=15,
        )
        response.raise_for_status()  # Ensure request was successful
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error fetching search results for {category}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    img_urls = []

    # Each result anchor carries its metadata in the ``m`` attribute.
    for img_tag in soup.find_all("a", {"class": "iusc"}):
        # Cap on collected URLs (not on anchors scanned), so anchors with
        # missing or malformed metadata do not eat into the quota.
        if len(img_urls) >= max_images:
            break

        try:
            # SECURITY: this attribute comes from a remote page. It was
            # previously passed to eval(), which executes arbitrary Python
            # and also breaks on JSON literals such as true/false/null.
            # Parse it strictly as JSON instead.
            m_data = json.loads(img_tag["m"])
            img_url = m_data.get("murl")  # Direct (full-size) image URL
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            # Missing attribute, invalid JSON, or a non-object payload.
            print(f"⚠️ Error parsing image data: {e}")
            continue

        if isinstance(img_url, str) and img_url.startswith("http"):
            img_urls.append(img_url)

    return img_urls

def download_images(category):
    """Download the Bing image results for ``category`` and record metadata.

    Each image that passes validation is written to
    ``<output_folder>/<category>/<category>_<n>.jpg``, where ``n`` is the
    1-based position of its URL in the search results, and one row
    (category, URL, filename, resolution) is appended via the module-level
    ``csv_writer``. A failure on one URL is printed and skipped so the
    remaining URLs are still processed.

    Args:
        category: Search phrase; also used as the sub-folder name and the
            filename prefix.
    """
    category_folder = os.path.join(output_folder, category)
    os.makedirs(category_folder, exist_ok=True)

    img_urls = get_bing_image_urls(category, max_images=50)

    for count, img_url in enumerate(img_urls, start=1):
        try:
            # The context manager releases the streamed connection even when
            # the response is skipped before its body is read; the timeout
            # keeps one unresponsive host from stalling the whole run.
            with requests.get(
                img_url, headers=headers, stream=True, timeout=15
            ) as img_response:
                img_response.raise_for_status()

                # Validate image response
                if "image" not in img_response.headers.get("Content-Type", ""):
                    print(f"⚠️ Skipping non-image file: {img_url}")
                    continue

                image_bytes = img_response.content

            # Read the size before verify(): Pillow requires an image to be
            # reopened after verify(), and taking the size here avoids
            # reopening the saved file (which previously leaked a handle).
            with Image.open(BytesIO(image_bytes)) as img:
                width, height = img.size
                img.verify()
            resolution = f"{width}x{height}"

            # NOTE: the bytes are saved as received; the ".jpg" suffix is
            # kept for compatibility even when the source is another format.
            filename = f"{category}_{count}.jpg"
            filepath = os.path.join(category_folder, filename)

            with open(filepath, "wb") as f:
                f.write(image_bytes)

            # Save metadata
            csv_writer.writerow([category, img_url, filename, resolution])
            print(f"✅ Downloaded {filename} ({resolution})")

            time.sleep(1)  # Prevent getting blocked by Bing

        except Exception as e:
            # Deliberately broad: this is the per-image best-effort boundary
            # (network errors, undecodable images, filesystem errors), and
            # one bad URL must not abort the rest of the category.
            print(f"⚠️ Error downloading {img_url}: {e}")

# Run image scraping. The try/finally guarantees the metadata CSV is flushed
# and closed even if the run is interrupted (e.g. Ctrl-C) or a category
# raises, so rows written so far are not lost.
try:
    for category in categories:
        print(f"🔍 Searching images for {category}...")
        download_images(category)
finally:
    # Close CSV file
    csv_file.close()

print("✅ Image scraping complete!")
