In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import re

# Function to sanitize filename
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows filenames
    sanitized_filename = re.sub(r'[<>:"/\\|?*\x00-\x1F\x7F]', '_', filename)
    return sanitized_filename[:255]  # Limit filename length to 255 characters

# Function to scrape images from a website
def scrape_images(url, output_folder='images'):
    try:
        # Create output folder if it doesn't exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        # Fetch HTML content
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all img tags
        img_tags = soup.find_all('img')

        # Download and save images
        for img_tag in img_tags:
            img_url = img_tag.get('src')
            if img_url:
                # Handle relative URLs
                img_url = urljoin(url, img_url)

                # Extract image filename from URL
                img_filename = os.path.basename(urlparse(img_url).path)

                # Sanitize filename
                img_filename = sanitize_filename(img_filename)

                # Download image
                img_data = requests.get(img_url).content
                with open(os.path.join(output_folder, img_filename), 'wb') as f:
                    f.write(img_data)

                print(f"Downloaded: {img_filename}")

        print("Image scraping complete.")
    except Exception as e:
        print(f"Error scraping images: {e}")

# Example usage
if __name__ == "__main__":
    url = 'https://medium.com/@shaikhrayyan123/a-comprehensive-guide-to-understanding-bert-from-beginners-to-advanced-2379699e2b51'
    scrape_images(url)


Downloaded: 1_dmbNkD5D-u45r44go_cf0g.png
Downloaded: 1_of9MweCjaJnlLr1tpXP4Lg.png
Downloaded: 1_of9MweCjaJnlLr1tpXP4Lg.png
Image scraping complete.


In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import re

# Function to sanitize filename
def sanitize_filename(filename):
    # Remove characters that are not allowed in Windows filenames
    sanitized_filename = re.sub(r'[<>:"/\\|?*\x00-\x1F\x7F]', '_', filename)
    return sanitized_filename[:255]  # Limit filename length to 255 characters

# Function to scrape images (including WebP) from a website
def scrape_images(url, output_folder='images'):
    try:
        # Create output folder if it doesn't exist
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        # Fetch HTML content
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all img tags
        img_tags = soup.find_all('img')

        # Download and save images
        for img_tag in img_tags:
            img_url = img_tag.get('src')
            if img_url:
                # Handle relative URLs
                img_url = urljoin(url, img_url)

                # Extract image filename from URL
                img_filename = os.path.basename(urlparse(img_url).path)

                # Sanitize filename
                img_filename = sanitize_filename(img_filename)

                # Determine image format
                img_format = img_url.split('.')[-1].split('?')[0].lower()

                # Download image
                img_data = requests.get(img_url, headers=headers, timeout=10).content
                with open(os.path.join(output_folder, img_filename), 'wb') as f:
                    f.write(img_data)

                print(f"Downloaded: {img_filename} ({img_format})")

        print("Image scraping complete.")
    except Exception as e:
        print(f"Error scraping images: {e}")

# Example usage
if __name__ == "__main__":
    url = 'https://medium.com/@shaikhrayyan123/a-comprehensive-guide-to-understanding-bert-from-beginners-to-advanced-2379699e2b51'
    scrape_images(url)


Downloaded: 1_dmbNkD5D-u45r44go_cf0g.png (png)
Downloaded: 1_of9MweCjaJnlLr1tpXP4Lg.png (png)
Downloaded: 1_of9MweCjaJnlLr1tpXP4Lg.png (png)
Image scraping complete.
