In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import requests
from bs4 import BeautifulSoup
import re

### Wikipedia list of cetacea

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cetaceans'

response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

html_tables = soup.find_all('table', {'class': 'wikitable'})
family_names = soup.find_all('div', {'class': 'mw-heading mw-heading3'})
family_names = [name.find("h3")["id"] for name in family_names]

In [3]:
all_rows = []
for i, wiki_table in enumerate(html_tables):

    family_name = family_names[i]
    rows = wiki_table.find_all("tr")

    for row in rows:

        if 'Genus' in row.get_text():
            try:
                genus_name = row.find("i").find("b").get_text(strip=True)
            except:
                raise ValueError('problem with genus row: ', row)

        if 'Genus' not in row.get_text() and 'Scientific' not in row.get_text():
            info_row = []
            for col in row.find_all("td"):
                if col.find('i') is not None:
                    info = col.find('i').get_text(strip=True)
                elif col.find('b') is not None:
                    info = col.find('b').get_text(strip=True)
                else:
                    info = col.get_text(strip=True)
                info_row.append(info)
            row_info = [family_name, genus_name] + info_row
            all_rows.append(row_info)

first_table = html_tables[0].find_all("tr")
wiki_df = pd.DataFrame(all_rows, columns=['Family', 'Genus'] + [th.get_text(strip=True) for th in first_table[1].find_all("th")])

names = wiki_df.Family

family_latin = []
family_common = []

for name in names:
    result = re.search('Family_(.*):_', name)
    family_latin.append(result.group(1))
    result = re.search(':_(.*)', name)
    family_common.append(result.group(1))

wiki_df.drop(columns='Family', inplace=True)
wiki_df.insert(loc=0, column='Family_Greek', value=family_latin)
wiki_df.insert(loc=0, column='Family_common', value=family_common)

wiki_df.head()


Unnamed: 0,Family_common,Family_Greek,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,Range,Size,Picture
0,right_whales,Balaenidae,Balaena,Bowhead whale,Balaena mysticetus,LC,10000,,60 t (66 short tons),
1,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,Eubalaena glacialis,CR,350,,40–80 t (44–88 short tons),
2,right_whales,Balaenidae,Eubalaena,North Pacific right whale,Eubalaena japonica,EN,"404–2,108[12]",,60–80 t (66–88 short tons),
3,right_whales,Balaenidae,Eubalaena,Southern right whale,Eubalaena australis,LC,"13,600[13]",,40–80 t (44–88 short tons),
4,rorquals,Balaenopteridae,Eubalaena,Balaenopteridae,,,,,,


#### Extracting images from the table

In [4]:
rows = html_tables[0].find_all("tr")
cols = rows[2].find_all("td")

pics = [cols[i] for i in range(len(cols)) if 'upload.wikimedia' in str(cols[i])]
base_url = "https:"



headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

for td in pics:
    soup = BeautifulSoup(str(td), "html.parser")
    img_tag = soup.find("img")
    if img_tag and "src" in img_tag.attrs:
        image_url = base_url + img_tag["src"]
        image_name = image_url.split("/")[-1]  # Extract filename
        
        response = requests.get(image_url, headers=headers, stream=True)
        if response.status_code == 200:
            with open(image_name, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            print(f"Downloaded: {image_name}")
        else:
            print(f"Failed to download {image_url}, status code: {response.status_code}")

Downloaded: 150px-Cetacea_range_map_Bowhead_Whale.png
Downloaded: 250px-Bowhead_whale_size.svg.png
Downloaded: 150px-A_bowhead_whale_breaches_off_the_coast_of_western_Sea_of_Okhotsk_by_Olga_Shpak%2C_Marine_Mammal_Council%2C_IEE_RAS.jpg


### Wikipedia API

In [6]:
import requests

def get_wikipedia_summary(title):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True,
        "explaintext": True,
    }
    
    response = requests.get(url, params=params)
    data = response.json()
    return data

In [None]:
https://en.wikipedia.org/w/api.php?action=query&format=json&origin=*&exintro=&explaintext=&titles=${encodeURIComponent(species)}&prop=extracts

In [8]:
data = get_wikipedia_summary('Blue_whale')
data

{'batchcomplete': '',
 'query': {'normalized': [{'from': 'Blue_whale', 'to': 'Blue whale'}],
  'pages': {'4925': {'pageid': 4925,
    'ns': 0,
    'title': 'Blue whale',
    'extract': "The blue whale (Balaenoptera musculus) is a marine mammal and a baleen whale. Reaching a maximum confirmed length of 29.9 m (98 ft) and weighing up to 199 t (196 long tons; 219 short tons), it is the largest animal known ever to have existed. The blue whale's long and slender body can be of various shades of greyish-blue on its upper surface and somewhat lighter underneath. Four subspecies are recognized: B. m. musculus in the North Atlantic and North Pacific, B. m. intermedia in the Southern Ocean, B. m. brevicauda (the pygmy blue whale) in the Indian Ocean and South Pacific Ocean, and B. m. indica in the Northern Indian Ocean. There is a population in the waters off Chile that may constitute a fifth subspecies.\nIn general, blue whale populations migrate between their summer feeding areas near the pol

In [13]:
pages = data['query']['pages']
page_id = next(iter(pages))
pages[page_id]['extract']

"The blue whale (Balaenoptera musculus) is a marine mammal and a baleen whale. Reaching a maximum confirmed length of 29.9 m (98 ft) and weighing up to 199 t (196 long tons; 219 short tons), it is the largest animal known ever to have existed. The blue whale's long and slender body can be of various shades of greyish-blue on its upper surface and somewhat lighter underneath. Four subspecies are recognized: B. m. musculus in the North Atlantic and North Pacific, B. m. intermedia in the Southern Ocean, B. m. brevicauda (the pygmy blue whale) in the Indian Ocean and South Pacific Ocean, and B. m. indica in the Northern Indian Ocean. There is a population in the waters off Chile that may constitute a fifth subspecies.\nIn general, blue whale populations migrate between their summer feeding areas near the poles and their winter breeding grounds near the tropics. There is also evidence of year-round residencies, and partial or age/sex-based migration. Blue whales are filter feeders; their di

### Tree of life

Check if the ceataceans are in a kaggle dataset with the tree of life of most animals.
Download link for tree of life files: https://www.kaggle.com/datasets/konivat/tree-of-life

In [5]:
links_df = pd.read_csv('data//treeoflife_links.xls')
nodes_df = pd.read_csv('data//treeoflife_nodes.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data//treeoflife_links.xls'

In [None]:
nodes_df.head()

Unnamed: 0,node_id,node_name,child_nodes,leaf_node,tolorg_link,extinct,confidence,phylesis
0,1,Life on Earth,4,0,1,0,0,0
1,2,Eubacteria,24,0,1,0,0,0
2,2285,Aquificae,10,0,1,0,0,0
3,59615,Aquifex,0,0,0,0,0,0
4,59616,Calderobacterium,0,0,0,0,0,0


In [None]:
node_names = list(nodes_df['node_name'])
wiki_names = list(wiki_df['Scientific name'])
#wiki_names = [s.split()[0] for s in wiki_names if s!=None]
common = [i for i in node_names if i in wiki_names]
common

['Balaenoptera musculus',
 'Balaenoptera physalus',
 'Balaenoptera borealis',
 'Balaenoptera edeni',
 'Balaenoptera acutorostrata',
 'Megaptera novaeangliae',
 'Balaena mysticetus',
 'Eubalaena glacialis',
 'Eubalaena australis']

Since there are only 9 out of almost 100, we do not use the kaggle tree of life.