## Scraping images of cetaceans from Wikipedia pages

In [1]:
import os
from urllib.parse import urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# Table of cetacean species with their names and Wikipedia page URLs
SPECIES_CSV = "../../data/PhilogeneticTree/species_tree_of_life.csv"
species = pd.read_csv(SPECIES_CSV)
species

Unnamed: 0,Scientific name,Common name,Wikipedia page URL,Genus,Genus Wikipedia URL,Family,Family Wikipedia URL,Subfamily,Subfamily Wikipedia URL,Superfamily,Superfamily Wikipedia URL
0,Stenella clymene,Clymene dolphin,https://en.wikipedia.org/wiki/Clymene_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
1,Stenella longirostris,Spinner dolphin,https://en.wikipedia.org/wiki/Spinner_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
2,Lagenodelphis hosei,Fraser's dolphin,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Lagenodelphis,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
3,Delphinus delphis,Common dolphin,https://en.wikipedia.org/wiki/Common_dolphin,Delphinus,https://en.wikipedia.org/wiki/Common_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
4,Stenella coeruleoalba,Striped dolphin,https://en.wikipedia.org/wiki/Striped_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
...,...,...,...,...,...,...,...,...,...,...,...
70,Caperea marginata,Pygmy right whale,https://en.wikipedia.org/wiki/Pygmy_right_whale,Caperea,https://en.wikipedia.org/wiki/Pygmy_right_whale,Cetotheriidae,https://en.wikipedia.org/wiki/Cetotheriidae,Neobalaenidae,https://en.wikipedia.org/wiki/Neobalaenidae,,
71,Eubalaena glacialis,North Atlantic right whale,https://en.wikipedia.org/wiki/North_Atlantic_r...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,
72,Eubalaena japonica,North Pacific right whale,https://en.wikipedia.org/wiki/North_Pacific_ri...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,
73,Eubalaena australis,Southern right whale,https://en.wikipedia.org/wiki/Southern_right_w...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,


In [26]:
# Folder where downloaded images are stored (created if it does not exist yet)
my_folder = 'wiki_images'
os.makedirs(my_folder, exist_ok=True)

# Base URL of the Wikipedia API "pageimages" query; it ends with `titles=` so that an
# article title can be appended. HTTPS is used so that requests are not sent in clear
# text and do not need to be redirected from plain HTTP first.
query = 'https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='

# User-Agent header sent with every request, to respect Wikipedia's user-agent policy
HEADERS = {
    "User-Agent": "WhereWereWhales: cetaceans images from Wikipedia"
}


def get_image_url(page_url):
    """Return the URL of the main image of a Wikipedia article.

    The article page is fetched and the ``content`` attribute of its
    ``og:image`` meta tag is read.

    Args:
        page_url: URL of the Wikipedia article page.

    Returns:
        The image URL as a string, or ``None`` when the request fails, the
        server answers with an error status, or the page has no usable
        ``og:image`` tag.
    """
    try:
        # A timeout keeps one unresponsive page from stalling the whole scrape.
        response = requests.get(page_url, headers=HEADERS, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the problem and give up on this page instead of
        # continuing with an undefined response.
        print(exc)
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    og_image = soup.find("meta", property="og:image")

    if og_image is None or not og_image.get("content"):
        print("og:image tag not found")
        return None
    return og_image["content"]


def download_image(the_url, the_page):
    """Download an image and save it in ``my_folder`` under the name ``the_page``.

    Args:
        the_url: Direct URL of the image file.
        the_page: Base name for the saved file; the extension is taken from
            the path component of ``the_url``.

    Returns:
        The path of the saved file, or ``None`` when the server answered with
        a non-error status other than 200 (nothing is written in that case).

    Raises:
        requests.HTTPError: raised by ``raise_for_status()`` for error responses.
    """
    # stream=True lets the body be written chunk by chunk instead of being
    # loaded into memory first; the context manager releases the connection.
    with requests.get(the_url, headers=HEADERS, stream=True, timeout=30) as res:
        res.raise_for_status()

        if res.status_code != 200:
            print(f"Error scraping {the_page}: {res.status_code}")
            return None

        # Take the extension from the URL path only, so query strings, fragments
        # and dots in the host name cannot leak into the file name.
        file_ext = os.path.splitext(urlsplit(the_url).path)[1].lower()
        image_file = os.path.join(my_folder, os.path.basename(the_page + file_ext))

        # Save the image
        with open(image_file, 'wb') as f:
            for chunk in res.iter_content(100000):
                f.write(chunk)

    return image_file

# Collect the main-image URL of every species' Wikipedia article.
# image_urls maps scientific name -> (common name, image URL or None).
image_urls = dict()
species_pages = species[["Common name", "Scientific name", "Wikipedia page URL"]]
for co_name, sc_name, the_url in tqdm(
    species_pages.itertuples(index=False, name=None),
    total=len(species_pages),  # lets tqdm show a real progress bar
    desc="Scraping Wikipedia images",
    unit="pages",
):
    try:
        image_url = get_image_url(the_url)
    except Exception:
        # Best effort: skip this species but keep scraping the others.
        # (Unlike a bare `except:`, this still lets Ctrl-C interrupt the loop.)
        print(f"No image found for {co_name} :(")
        continue

    if image_url is None:
        # The entry is still recorded so that every scraped species stays listed.
        print(f"No image found for {co_name} :(")
    image_urls[sc_name] = (co_name, image_url)

print("All done!")


Scraping Wikipedia images: 75pages [00:18,  3.98pages/s]

All done!





In [27]:
# Inspect the scraped mapping: scientific name -> (common name, image URL)
image_urls

{'Stenella clymene': ('Clymene dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/a/aa/Clymenes.jpg'),
 'Stenella longirostris': ('Spinner dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/A_spinner_dolphin_in_the_Red_Sea.jpg/1200px-A_spinner_dolphin_in_the_Red_Sea.jpg'),
 'Lagenodelphis hosei': ("Fraser's dolphin",
  'https://upload.wikimedia.org/wikipedia/commons/2/2b/Frazer%C2%B4s_dolphin_group.jpg'),
 'Delphinus delphis': ('Common dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Common_dolphin_noaa.jpg/1200px-Common_dolphin_noaa.jpg'),
 'Stenella coeruleoalba': ('Striped dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg/1200px-Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg'),
 'Stenella attenuata': ('Pantropical spotted dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/0/06/Pantropical_spotted_dolphin_swimming_off_the_coast_of_Costa_Ric

We manually inspected all the images and found that some of them either were not the main image of the corresponding Wikipedia article (mainly because our method for fetching the images is not robust enough) or depicted the animal suffering or dying, which could be sensitive content for the users of our website. We manually searched Wikimedia Commons to replace those images with more appropriate ones.

In [28]:
# Manually selected replacement images, keyed by scientific name.
# Each value is (common name, image URL) and overwrites the scraped entry.
manual_overrides = {
    "Phocoena dioptrica": ('Spectacled porpoise', "https://upload.wikimedia.org/wikipedia/commons/c/c4/Subadult_female_spectacled_porpoise.png"),
    "Phocoena spinipinnis": ("Burmeister's porpoise", "https://upload.wikimedia.org/wikipedia/commons/1/1c/Burmeister%27s_porpoise_-_Phocoena_spinipinnis_-_2022-02-24.png"),
    "Lipotes vexillifer": ('Baiji', "https://upload.wikimedia.org/wikipedia/commons/5/5e/Lipotes_vexillifer.png"),
    "Mesoplodon peruvianus": ('Pygmy beaked whale', "https://upload.wikimedia.org/wikipedia/commons/9/93/Mesoplodon_peruvianus.jpg"),
    # Common name spelling fixed: "Blainville's", not "Blaincille's"
    "Mesoplodon densirostris": ("Blainville's beaked whale", "https://upload.wikimedia.org/wikipedia/commons/b/b9/Mesoplodon_densirostris.jpg"),
    "Mesoplodon grayi": ("Gray's beaked whale", "https://upload.wikimedia.org/wikipedia/commons/6/6a/Mesoplodon_grayi_2.jpg"),
    "Mesoplodon hectori": ("Hector's beaked whale", "https://upload.wikimedia.org/wikipedia/commons/b/b5/Mesoplodon_hectori.jpg"),
    "Berardius bairdii": ("Baird's beaked whale", "https://upload.wikimedia.org/wikipedia/commons/e/e2/Berardius_bairdii_2.jpg"),
    "Kogia breviceps": ('Pygmy sperm whale', "https://upload.wikimedia.org/wikipedia/commons/b/b6/FMIB_50832_Pygmy_Sperm_Whale.jpeg"),
}
image_urls.update(manual_overrides)

In [29]:
# Number of collected image entries next to the number of species rows
len(image_urls), len(species)

(75, 75)

In [30]:
# Final mapping after the manual replacements: scientific name -> (common name, image URL)
image_urls

{'Stenella clymene': ('Clymene dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/a/aa/Clymenes.jpg'),
 'Stenella longirostris': ('Spinner dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/A_spinner_dolphin_in_the_Red_Sea.jpg/1200px-A_spinner_dolphin_in_the_Red_Sea.jpg'),
 'Lagenodelphis hosei': ("Fraser's dolphin",
  'https://upload.wikimedia.org/wikipedia/commons/2/2b/Frazer%C2%B4s_dolphin_group.jpg'),
 'Delphinus delphis': ('Common dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Common_dolphin_noaa.jpg/1200px-Common_dolphin_noaa.jpg'),
 'Stenella coeruleoalba': ('Striped dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg/1200px-Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg'),
 'Stenella attenuata': ('Pantropical spotted dolphin',
  'https://upload.wikimedia.org/wikipedia/commons/thumb/0/06/Pantropical_spotted_dolphin_swimming_off_the_coast_of_Costa_Ric