# Website Image Scraper

### Install & Import Dependencies

In [None]:
%pip install -q -U requests bs4 Pillow selenium

In [None]:
import html
import imghdr
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display

### Create `download_images(url, max_filename_length)` function

In [None]:
def download_images(url, max_filename_length=100):
    """Download every image referenced by ``<img>`` tags on the page at *url*.

    The images are written to a directory named after the host part of
    *url*, created relative to the current working directory if needed.

    Args:
        url: Address of the page to scrape.
        max_filename_length: Maximum number of characters kept from each
            image's base name (the extension and any de-duplication suffix
            are added afterwards).

    Returns:
        The name of the directory the images were written to.

    Raises:
        requests.RequestException: If the page itself cannot be fetched.
            Failures on individual images are reported and skipped.
    """
    # Without a timeout an unresponsive server would block the call forever.
    timeout = 30

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get the domain name from the URL
    domain = url.split('//')[-1].split('/')[0]

    # Create a directory for the images with the domain name
    image_dir = f'{domain}'
    os.makedirs(image_dir, exist_ok=True)

    seen_urls = set()    # image URLs already handled during this call
    used_paths = set()   # file paths already written during this call
    downloaded_images_count = 0
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # <img> without a src attribute (e.g. lazy-loaded placeholders)
            continue

        # Resolve relative, root-relative and protocol-relative sources
        # against the final page URL (after any redirects).
        img_url = urljoin(response.url, src)
        if urlparse(img_url).scheme not in ('http', 'https'):
            # data:, blob:, javascript: ... cannot be fetched over HTTP
            continue
        if img_url in seen_urls:
            continue
        seen_urls.add(img_url)

        try:
            img_response = requests.get(img_url, timeout=timeout)
            # Do not store an HTTP error page as if it were an image
            img_response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to download {img_url} - {e}")
            continue
        img_data = img_response.content

        # Name the file after the last path segment, ignoring any query
        # string or fragment, with up to two extensions stripped.
        last_segment = urlparse(img_url).path.split('/')[-1]
        img_name = os.path.splitext(os.path.splitext(last_segment)[0])[0]
        # Fall back to a generic name when nothing usable is left
        img_name = img_name[:max_filename_length] or 'image'

        # Use the imghdr module to get the file type of the image
        # NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
        img_type = imghdr.what(None, h=img_data)

        # If the file type couldn't be determined, use 'jpeg' as the default
        if img_type is None:
            img_type = 'jpeg'

        # Different images may share a base name; add a numeric suffix so
        # they do not overwrite each other within one run.
        img_path = os.path.join(image_dir, f'{img_name}.{img_type}')
        suffix = 1
        while img_path in used_paths:
            img_path = os.path.join(image_dir, f'{img_name}_{suffix}.{img_type}')
            suffix += 1
        used_paths.add(img_path)

        try:
            with open(img_path, 'wb') as f:
                f.write(img_data)
        except OSError as e:
            print(f"Failed to download {img_name} - {e}")
        else:
            downloaded_images_count += 1

    print(f"Downloaded {downloaded_images_count} images from {url} to /{image_dir}")

    return image_dir

### Set URL to scrape

In [None]:
# Page whose <img> elements will be downloaded.
url = 'https://engadget.com'
# The call returns the directory the images were written to.
image_dir = download_images(url=url)

### Get a list of images sorted by file-size

In [None]:
# Keep only regular files from the download directory (sub-directories are
# ignored), then order them from largest to smallest by size on disk.
img_files = [
    name
    for name in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, name))
]
img_files.sort(
    key=lambda name: os.path.getsize(os.path.join(image_dir, name)),
    reverse=True,
)

print(f"Found {len(img_files)} images in {image_dir}")

### Create an HTML table with the images as cells

In [None]:
# Build an HTML table showing the images three per row.
columns = 3

# One <td> per image. The path is HTML-escaped because file names come from
# remote URLs and may contain quotes or other markup characters.
cells = [
    f'<td><img src="{html.escape(os.path.join(image_dir, image_file))}" style="width: 200px;"></td>'
    for image_file in img_files
]

# Group the cells into rows; every row is closed, including a final row that
# holds fewer than `columns` images.
rows = [
    '<tr>' + ''.join(cells[start:start + columns]) + '</tr>'
    for start in range(0, len(cells), columns)
]

table = '<table>' + ''.join(rows) + '</table>'

### Display the image table

In [None]:
# Wrap the table markup in an IPython HTML object and show it as cell output.
display(HTML(table))