In [None]:
import json
import os
import re
from io import BytesIO

import requests
from bs4 import BeautifulSoup
from IPython.display import display
from PIL import Image, ImageOps

In [None]:
class BirdImage:
    """An image downloaded from a Wikimedia Commons file page, plus crop metadata.

    Constructing an instance fetches the file page at ``url``, follows its
    "Original file" link, downloads that image and applies the EXIF
    orientation. The image is kept in ``self.image``; everything that gets
    recorded (source URLs, attribution, crop parameters) lives in the
    JSON-serialisable dict ``self.data``.
    """

    # User-Agent header sent with the request for the original image file.
    _IMAGE_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36"
    }
    # Seconds to wait on each HTTP request before giving up.
    _TIMEOUT = 30

    def __init__(self, url, scientific_name, author, license_url):
        """Download the original image linked from a Commons file page.

        Args:
            url: Address of the file page (the page that contains the
                "Original file" link).
            scientific_name: Species name; also used as the saved file name.
            author: Attribution for the image.
            license_url: Link to the image's license.

        Raises:
            requests.HTTPError: If either HTTP request returns an error status.
            ValueError: If the page has no "Original file" link.
        """
        response = requests.get(url, timeout=self._TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Find original image
        link = soup.find("a", string="Original file")
        if link is None or not link.get("href"):
            raise ValueError(f"No 'Original file' link found on {url}")
        image_url = link["href"]

        print(image_url)

        # Fetch the image
        response = requests.get(
            image_url, headers=self._IMAGE_HEADERS, timeout=self._TIMEOUT
        )
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        # Auto-rotate using EXIF
        image = ImageOps.exif_transpose(image)

        self.image = image

        self.data = {
            "scientific_name": scientific_name,
            "url": url,
            "author": author,
            "license_url": license_url,
            "image_url": image_url,
            "crop_box": [0, 0, self.image.width, self.image.height],
            "x": 0.0,
            "y": 0.0,
            "s": 1.0,
        }

        # Replace the full-frame placeholder with the default square crop.
        self.set_crop()

    def set_crop(self, x=None, y=None, s=None):
        """Update the crop parameters and recompute the pixel crop box.

        Args:
            x: Left edge of the crop as a fraction of the image width.
            y: Top edge of the crop as a fraction of the image height.
            s: Side length of the crop as a fraction of the image *width*
                (used for both dimensions, so the crop is square).

        Any argument left as ``None`` keeps its previously stored value.
        The resulting box is not clamped to the image bounds.

        Returns:
            ``self``, so calls can be chained.
        """
        for key, value in (("x", x), ("y", y), ("s", s)):
            if value is not None:
                self.data[key] = value

        left = self.data["x"] * self.image.width
        top = self.data["y"] * self.image.height
        side = self.data["s"] * self.image.width

        self.data["crop_box"] = (
            int(left),
            int(top),
            int(left + side),
            int(top + side),
        )

        return self

    def _render(self, size):
        """Return the current crop resized to ``size`` x ``size`` pixels."""
        return self.image.crop(self.data["crop_box"]).resize((size, size))

    def display_image(self, size=150):
        """Show the current crop in the notebook at ``size`` x ``size`` pixels."""
        display(self._render(size))

    def save_image(self, size=150, path="images/"):
        """Save the current crop as ``<path><scientific_name>.jpg``.

        Args:
            size: Width and height of the saved image in pixels.
            path: Prefix prepended to the file name; the directory part is
                created if it does not exist yet.
        """
        if self.data["crop_box"] is None:
            self.set_crop()

        resized = self._render(size)
        # The JPEG writer rejects modes such as RGBA or P (e.g. PNG sources).
        if resized.mode not in ("RGB", "L", "CMYK"):
            resized = resized.convert("RGB")

        # Single quotes inside the f-string keep this valid before Python 3.12.
        target = f"{path}{self.data['scientific_name']}.jpg"
        directory = os.path.dirname(target)
        if directory:
            os.makedirs(directory, exist_ok=True)
        resized.save(target)

    def write_data(self, filepath="image_data.jsonl"):
        """Append ``self.data`` as one JSON line to ``filepath``."""
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(json.dumps(self.data) + "\n")

Select an image page on commons.wikimedia.org and supply the metadata (scientific name, author, license URL) that could not be scraped consistently.

In [None]:
# Constructing a BirdImage fetches the Commons file page and downloads the
# original image it links to, so this cell needs network access.
bi = BirdImage(
    url="https://commons.wikimedia.org/wiki/File:Myrtle_Warbler_-_Houston,_TX_2.jpg",
    scientific_name="Setophaga coronata",
    author="Dan Pancamo",
    license_url="https://creativecommons.org/licenses/by-sa/2.0/deed.en"
)

Display a small (150×150 px) preview of the image using the default crop.

In [None]:
# Show the current crop, resized to the default 150x150 pixels.
bi.display_image()

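Experiment with the crop values: `x` and `y` place the top-left corner as fractions of the image width and height, and `s` is the side of the square crop as a fraction of the image width.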

In [None]:
# Crop from the left edge, 10% down from the top, with a side of 90% of the width.
bi.set_crop(x=0.0, y=0.1, s=0.9).display_image()

Once satisfied, save the image and record the metadata.

In [None]:
# Save the cropped image as images/<scientific_name>.jpg, then append its
# metadata as one JSON line to image_data.jsonl.
bi.save_image()
bi.write_data()