In [1]:
import os
import time
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO

In [2]:
# Scrape card images from the pkmncards.com search-result pages, save a
# downsized copy of every image into `output_dir`, and write a CSV that maps
# each image URL to the local file it was saved as (or to a failure label).

# Search-results URL template; "{}" is replaced by the page number
base_url = "https://pkmncards.com/page/{}/?s=&sort=date&ord=rev"

max_pages = 19  # Adjust the number of pages you want to scrape
output_dir = "pokemon_card_images"  # Name of the folder
os.makedirs(output_dir, exist_ok=True)

page_timeout = 10  # Seconds to wait for a listing page before giving up
image_timeout = 3  # Seconds to wait for a single image before giving up
thumbnail_size = (825, 600)  # Bounding box (width, height) handed to Image.thumbnail()
save_quality = 20  # `quality` handed to Image.save(); lower means smaller files

# Nothing in this cell sets this to True; it is kept as a manual switch that
# ends the page loop before the next page is requested.
stop_scraping = False

# Parallel lists: img_filenames[i] is the outcome for img_urls[i].
# Every code path below appends to BOTH lists exactly once per card image so
# the DataFrame at the end can always be built.
img_urls = []
img_filenames = []

for page in range(1, max_pages + 1):
    if stop_scraping:
        print("Stopping scrape.")
        break

    # Wait a random 3 to 6 seconds between consecutive page requests to avoid
    # overloading the server. Sleeping at the top of the loop means the delay
    # also applies after a page that failed and hit `continue`.
    if page > 1:
        time.sleep(random.uniform(3, 6))

    # Format the URL with the current page number
    url = base_url.format(page)
    print(f"Scraping URL: {url}")

    # Fetch the page content. A timeout keeps a stalled connection from
    # hanging the notebook, and a network error skips the page instead of
    # aborting the whole run before the CSV is written.
    try:
        response = requests.get(url, timeout=page_timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve page {page}: {e}. Skipping.")
        continue
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Skipping.")
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Locate the main container
    main_div = soup.find("main", class_="content")
    if not main_div:
        print("Could not find the main content area.")
        continue

    # Extract card details
    cards = main_div.find_all("div", class_="entry-content")  # Adjust based on inspection
    for card in cards:
        # Find the card image by its class string (with or without "skip-lazy")
        img_tag = card.find(
            "img",
            class_=lambda x: x and ('card-image skip-lazy round-corners pad-15' in x or 'card-image round-corners pad-15' in x)
        )
        if not img_tag:
            continue

        img_url = img_tag.get("src")  # Extract the 'src' attribute
        if not img_url:
            # Record a placeholder URL too, so both lists stay the same length
            img_urls.append(None)
            img_filenames.append("No Image Found")
            continue

        img_urls.append(img_url)  # Save the URL

        # Download the image; `saved_as` keeps the failure label unless the
        # image is fetched, decoded and written successfully.
        saved_as = "Download Failed"
        try:
            # Use a separate name so the page-level `response` is not overwritten
            img_response = requests.get(img_url, timeout=image_timeout)
            print(f"Downloading {img_url}, Status Code: {img_response.status_code}")

            if img_response.status_code == 200:
                # Open the image using Pillow
                img = Image.open(BytesIO(img_response.content))

                # Convert to RGB, then shrink in place to fit `thumbnail_size`
                img = img.convert("RGB")
                img.thumbnail(thumbnail_size)

                # Name the local file after the last URL path segment,
                # dropping any "?query" so it cannot leak into the file name
                file_name = os.path.join(
                    output_dir, os.path.basename(img_url.split("?", 1)[0])
                )
                img.save(file_name, quality=save_quality)

                saved_as = file_name
            else:
                print(f"Failed to download image: {img_url}")

        except Exception as e:
            # Best effort: a network, decode or save error on one image must
            # not stop the rest of the scrape.
            print(f"Error downloading {img_url}: {e}")

        img_filenames.append(saved_as)

# Both columns must have one entry per card image
assert len(img_urls) == len(img_filenames), "URL and file-name lists are out of sync"

# Create DataFrame
data = pd.DataFrame({
    "Image_URL": img_urls,
    "Saved_File_Name": img_filenames
})

# Save to CSV
data.to_csv("pokemon_card_images.csv", index=False)
print("Scraping completed. Data saved to 'pokemon_card_images.csv'.")

Scraping URL: https://pkmncards.com/page/1/?s=&sort=date&ord=rev
Downloading https://pkmncards.com/wp-content/uploads/alakazam-base-set-bs-1.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/blastoise-base-set-bs-2.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/chansey-base-set-bs-3.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/charizard-base-set-bs-4.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/clefairy-base-set-bs-5.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/gyarados-base-set-bs-6.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/hitmonchan-base-set-bs-7.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/machamp-base-set-bs-8.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/uploads/magneton-base-set-bs-9.jpg, Status Code: 200
Downloading https://pkmncards.com/wp-content/upload