## Scraping images of cetaceans from Wikipedia pages

In [22]:
import pandas as pd
import requests, os
from tqdm import tqdm
from bs4 import BeautifulSoup, Tag
import json

In [23]:
# Species table: one row per cetacean species with its names and Wikipedia URLs
species_csv_path = "../../data/PhylogeneticTree/species_tree_of_life.csv"
species = pd.read_csv(species_csv_path)
species

Unnamed: 0,Scientific name,Common name,Wikipedia page URL,Genus,Genus Wikipedia URL,Family,Family Wikipedia URL,Subfamily,Subfamily Wikipedia URL,Superfamily,Superfamily Wikipedia URL
0,Stenella clymene,Clymene dolphin,https://en.wikipedia.org/wiki/Clymene_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
1,Stenella longirostris,Spinner dolphin,https://en.wikipedia.org/wiki/Spinner_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
2,Lagenodelphis hosei,Fraser's dolphin,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Lagenodelphis,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
3,Delphinus delphis,Common dolphin,https://en.wikipedia.org/wiki/Common_dolphin,Delphinus,https://en.wikipedia.org/wiki/Common_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
4,Stenella coeruleoalba,Striped dolphin,https://en.wikipedia.org/wiki/Striped_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
...,...,...,...,...,...,...,...,...,...,...,...
70,Caperea marginata,Pygmy right whale,https://en.wikipedia.org/wiki/Pygmy_right_whale,Caperea,https://en.wikipedia.org/wiki/Pygmy_right_whale,Cetotheriidae,https://en.wikipedia.org/wiki/Cetotheriidae,Neobalaenidae,https://en.wikipedia.org/wiki/Neobalaenidae,,
71,Eubalaena glacialis,North Atlantic right whale,https://en.wikipedia.org/wiki/North_Atlantic_r...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,
72,Eubalaena japonica,North Pacific right whale,https://en.wikipedia.org/wiki/North_Pacific_ri...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,
73,Eubalaena australis,Southern right whale,https://en.wikipedia.org/wiki/Southern_right_w...,Eubalaena,https://en.wikipedia.org/wiki/Right_whale,Balaenidae,https://en.wikipedia.org/wiki/Balaenidae,,,,


In [24]:
from image_url_scrapper import WikiImageScrapper

In [25]:
# Local directory that receives downloaded images (created if it does not exist yet)
my_folder = 'wiki_images'
os.makedirs(my_folder, exist_ok=True)

# MediaWiki API query prefix for a page's original image; the article title is appended
query = (
    'http://en.wikipedia.org/w/api.php'
    '?action=query&prop=pageimages&format=json&piprop=original&titles='
)

# User-Agent header sent with every request, to identify this client to Wikipedia
HEADERS = {"User-Agent": "WhereWereWhales: cetaceans images from Wikipedia"}


def get_image_url(page_url, timeout=10):
    """Return the image URL advertised by a Wikipedia article page.

    The page at ``page_url`` is fetched and its ``<meta property="og:image">``
    tag is read.

    Args:
        page_url: Full URL of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``content`` attribute of the ``og:image`` meta tag, or ``None``
        when the request fails, the response status is not 200, or the tag
        (or its ``content`` attribute) is absent.
    """
    try:
        # Without a timeout a single stalled connection blocks the caller forever.
        response = requests.get(page_url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        print(exc)
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    og_image = soup.find("meta", property="og:image")
    if not isinstance(og_image, Tag):
        print("og:image tag not found")
        return None

    # Tag.get returns None (it does not raise) when the attribute is missing.
    return og_image.get("content")


def download_image(the_url, the_page, timeout=30):
    """Download the image at ``the_url`` into the ``my_folder`` directory.

    The file is named after ``the_page`` followed by the (lower-cased)
    extension found in the URL path.

    Args:
        the_url: Direct URL of the image file.
        the_page: Name used as the base of the saved file name.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlparse

    # stream=True lets iter_content read the body chunk by chunk instead of
    # buffering the whole image first; the with-block releases the connection.
    with requests.get(the_url, headers=HEADERS, stream=True, timeout=timeout) as res:
        res.raise_for_status()

        if res.status_code != 200:
            # Not an HTTP error, but not an image payload either: do not save it.
            print(f"Error scraping {the_page}: {res.status_code}")
            return

        # Take the extension from the URL *path* so that a query string or a
        # dot elsewhere in the URL cannot end up in the file name.
        file_ext = os.path.splitext(urlparse(the_url).path)[1].lower()
        image_file = os.path.join(my_folder, os.path.basename(the_page + file_ext))

        # Save the image
        with open(image_file, 'wb') as f:
            for chunk in res.iter_content(100000):
                f.write(chunk)

# Collect the main image URL of every species' Wikipedia article
# (only the URLs are gathered here; no image file is downloaded).
image_urls = dict()
species_pages = species[["Common name", "Scientific name", "Wikipedia page URL"]]
for co_name, sc_name, page_url in tqdm(
    species_pages.itertuples(index=False, name=None),
    total=len(species_pages),  # lets tqdm show a real progress bar and ETA
    desc="Scraping Wikipedia images",
    unit="pages",
):
    try:
        image_url = get_image_url(page_url)
    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C still stops the loop.
        print(exc)
        image_url = None

    # get_image_url signals failure by returning None, so report it here.
    if image_url is None:
        print(f"No image found for {co_name} :(")

    image_urls[sc_name] = {"common_name": co_name, "image": image_url}

print("All done!")


Scraping Wikipedia images: 75pages [00:14,  5.18pages/s]

All done!





In [26]:
# Run the scrapper on the name/URL columns of the species table
wiki_urls = species[["Scientific name", "Common name", "Wikipedia page URL"]].copy()
scrapper = WikiImageScrapper(wiki_urls)
scrapper.scrap_image_URLs()

# Keep only the columns needed to describe each species' image
image_urls = scrapper.wiki_urls_df[["Scientific name", "Common name", "Image URL"]].copy()

# scientific name -> {"common_name": ..., "image": ...}
dict_image_urls = {
    str(sc_name): {"common_name": co_name, "image": img_url}
    for sc_name, co_name, img_url in image_urls.itertuples(index=False, name=None)
}

Scraping Wikipedia images: 75pages [00:14,  5.11pages/s]

All done!





In [27]:
dict_image_urls

{'Stenella clymene': {'common_name': 'Clymene dolphin',
  'image': 'https://upload.wikimedia.org/wikipedia/commons/a/aa/Clymenes.jpg'},
 'Stenella longirostris': {'common_name': 'Spinner dolphin',
  'image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/6/64/A_spinner_dolphin_in_the_Red_Sea.jpg/1200px-A_spinner_dolphin_in_the_Red_Sea.jpg'},
 'Lagenodelphis hosei': {'common_name': "Fraser's dolphin",
  'image': 'https://upload.wikimedia.org/wikipedia/commons/2/2b/Frazer%C2%B4s_dolphin_group.jpg'},
 'Delphinus delphis': {'common_name': 'Common dolphin',
  'image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/Common_dolphin_noaa.jpg/1200px-Common_dolphin_noaa.jpg'},
 'Stenella coeruleoalba': {'common_name': 'Striped dolphin',
  'image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg/1200px-Stenella_coeruleoalba_Ligurian_Sea_02_-_brighter.jpg'},
 'Stenella attenuata': {'common_name': 'Pantropical spotted

We manually inspected all the images and found that some of them either did not correspond to the main image of the Wikipedia article (mainly because our method for fetching images is not robust enough) or depicted the animal suffering or dying, which could be sensitive content for users of our website. We manually searched the Creative Commons database to replace those images with more appropriate ones.

In [28]:
# Hand-picked replacement images, keyed by scientific name. Each entry replaces
# the scraped one for that species.
manual_image_overrides = {
    "Phocoena dioptrica": {
        "common_name": 'Spectacled porpoise',
        "image": "https://upload.wikimedia.org/wikipedia/commons/c/c4/Subadult_female_spectacled_porpoise.png",
    },
    "Phocoena spinipinnis": {
        "common_name": "Burmeister's porpoise",
        "image": "https://upload.wikimedia.org/wikipedia/commons/1/1c/Burmeister%27s_porpoise_-_Phocoena_spinipinnis_-_2022-02-24.png",
    },
    "Lipotes vexillifer": {
        "common_name": 'Baiji',
        "image": "https://upload.wikimedia.org/wikipedia/commons/5/5e/Lipotes_vexillifer.png",
    },
    "Mesoplodon peruvianus": {
        "common_name": 'Pygmy beaked whale',
        "image": "https://upload.wikimedia.org/wikipedia/commons/9/93/Mesoplodon_peruvianus.jpg",
    },
    "Mesoplodon densirostris": {
        # Spelling fixed: was "Blaincille's"
        "common_name": "Blainville's beaked whale",
        "image": "https://upload.wikimedia.org/wikipedia/commons/b/b9/Mesoplodon_densirostris.jpg",
    },
    "Mesoplodon grayi": {
        "common_name": "Gray's beaked whale",
        "image": "https://upload.wikimedia.org/wikipedia/commons/6/6a/Mesoplodon_grayi_2.jpg",
    },
    "Mesoplodon hectori": {
        "common_name": "Hector's beaked whale",
        "image": "https://upload.wikimedia.org/wikipedia/commons/b/b5/Mesoplodon_hectori.jpg",
    },
    "Berardius bairdii": {
        "common_name": "Baird's beaked whale",
        "image": "https://upload.wikimedia.org/wikipedia/commons/e/e2/Berardius_bairdii_2.jpg",
    },
    'Kogia breviceps': {
        "common_name": 'Pygmy sperm whale',
        "image": "https://upload.wikimedia.org/wikipedia/commons/b/b6/FMIB_50832_Pygmy_Sperm_Whale.jpeg",
    },
}
dict_image_urls.update(manual_image_overrides)

In [29]:
len(dict_image_urls), species.shape[0]

(75, 75)

In [31]:
# Load the hierarchical tree; the with-block closes the file handle once the
# JSON has been parsed (the original left it open).
with open("../../data/PhylogeneticTree/hierarchical_tree.json") as tree_json_file:
    hier_tree_dict = json.load(tree_json_file)
hier_tree_dict

{'scientific_name': 'CETACEAN',
 'common_name': None,
 'image': None,
 'parent_branch_name': None,
 'children': [{'scientific_name': 1,
   'common_name': None,
   'image': None,
   'parent_branch_name': 'Odontoceti',
   'children': [{'scientific_name': 2,
     'common_name': None,
     'image': None,
     'parent_branch_name': None,
     'children': [{'scientific_name': 4,
       'common_name': None,
       'image': None,
       'parent_branch_name': 'Kogiidae',
       'family': 'Kogiidae',
       'children': [{'scientific_name': 'Kogia sima',
         'common_name': 'Dwarf sperm whale',
         'image': 'https://upload.wikimedia.org/wikipedia/commons/5/58/Dwarf_sperm_whale_%28NOAA_Pitman%29.jpg',
         'parent_branch_name': None,
         'children': [],
         'family': 'Kogiidae',
         'genus': 'Kogia'},
        {'scientific_name': 'Kogia breviceps',
         'common_name': 'Pygmy sperm whale',
         'image': 'https://upload.wikimedia.org/wikipedia/commons/b/b6/FMIB_508

In [32]:
# Index the species table by scientific name (the column itself is kept)
species_df = species.set_index(keys="Scientific name", drop=False)
# Quick check of a lookup by scientific name
species_df.loc["Tursiops truncatus"]["Family"]

'Delphinidae'

In [33]:
def sets_common_name_and_image_url(node, image_urls=None, species_table=None):
    """Fill in common name, image URL, genus and family on every leaf of a tree.

    A node is a dict with a ``"children"`` list. A node whose list is empty is
    treated as a species leaf and is updated in place; any other node is only
    traversed.

    Args:
        node: Root of the (sub)tree to update.
        image_urls: Mapping ``scientific name -> {"common_name": ..., "image": ...}``.
            Defaults to the notebook-level ``dict_image_urls``.
        species_table: Object whose ``.loc[scientific name]`` yields a record
            with ``"Genus"`` and ``"Family"`` entries. Defaults to the
            notebook-level ``species_df``.

    Raises:
        KeyError: If a leaf's scientific name is missing from ``image_urls``
            (or, for a dict-like ``species_table``, from that table).
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if image_urls is None:
        image_urls = dict_image_urls
    if species_table is None:
        species_table = species_df

    children_list = node["children"]
    if len(children_list) > 0:
        for child in children_list:
            sets_common_name_and_image_url(child, image_urls, species_table)
        return

    sc_name = node["scientific_name"]
    # Do both lookups before mutating, so a missing key leaves the leaf untouched.
    image_entry = image_urls[sc_name]
    taxonomy = species_table.loc[sc_name]

    node["common_name"] = image_entry["common_name"]
    node["image"] = image_entry["image"]
    node["genus"] = taxonomy["Genus"]
    node["family"] = taxonomy["Family"]

In [34]:
sets_common_name_and_image_url(hier_tree_dict)

In [35]:
# Write the enriched tree (hier_tree_dict) to hierarchical_tree.json.
# Set WRITE_TREE_JSON to False to run the notebook without touching the file.
WRITE_TREE_JSON = True
if WRITE_TREE_JSON:
    # Serialise first: opening with "w" truncates the file, so a failure in
    # json.dump would otherwise leave an empty or partial JSON file behind.
    tree_json_text = json.dumps(hier_tree_dict, indent=4)
    with open("../../data/PhylogeneticTree/hierarchical_tree.json", "w") as f:
        f.write(tree_json_text)