## Which species are observed in all three data sources?

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import requests
from bs4 import BeautifulSoup
import re

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cetaceans'

response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

html_tables = soup.find_all('table', {'class': 'wikitable'})
family_names = soup.find_all('div', {'class': 'mw-heading mw-heading3'})
family_names = [name.find("h3")["id"] for name in family_names]

In [3]:
all_rows = []
for i, wiki_table in enumerate(html_tables):

    family_name = family_names[i]
    rows = wiki_table.find_all("tr")

    for row in rows:

        if 'Genus' in row.get_text():
            try:
                genus_name = row.find("i").find("b").get_text(strip=True)
            except:
                raise ValueError('problem with genus row: ', row)

        if 'Genus' not in row.get_text() and 'Scientific' not in row.get_text():
            info_row = []
            for col in row.find_all("td"):
                if col.find('i') is not None:
                    info = col.find('i').get_text(strip=True)
                elif col.find('b') is not None:
                    info = col.find('b').get_text(strip=True)
                else:
                    info = col.get_text(strip=True)
                info_row.append(info)
            row_info = [family_name, genus_name] + info_row
            all_rows.append(row_info)

first_table = html_tables[0].find_all("tr")
wiki_df = pd.DataFrame(all_rows, columns=['Family', 'Genus'] + [th.get_text(strip=True) for th in first_table[1].find_all("th")])

names = wiki_df.Family

family_latin = []
family_common = []

for name in names:
    result = re.search('Family_(.*):_', name)
    family_latin.append(result.group(1))
    result = re.search(':_(.*)', name)
    family_common.append(result.group(1))

wiki_df.drop(columns='Family', inplace=True)
wiki_df.insert(loc=0, column='Family_Latin', value=family_latin)
wiki_df.insert(loc=0, column='Family_common', value=family_common)

wiki_df = wiki_df.dropna()

display(wiki_df.head(5))
wiki_df['Scientific name'].unique()

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,Range,Size,Picture
0,right_whales,Balaenidae,Balaena,Bowhead whale,Balaena mysticetus,LC,10000,,60 t (66 short tons),
1,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,Eubalaena glacialis,CR,350,,40–80 t (44–88 short tons),
2,right_whales,Balaenidae,Eubalaena,North Pacific right whale,Eubalaena japonica,EN,"404–2,108[12]",,60–80 t (66–88 short tons),
3,right_whales,Balaenidae,Eubalaena,Southern right whale,Eubalaena australis,LC,"13,600[13]",,40–80 t (44–88 short tons),
5,rorquals,Balaenopteridae,Balaenoptera,Blue whale,Balaenoptera musculus,EN,"5,000–15,000",,50–150 t (55–165 short tons),


array(['Balaena mysticetus', 'Eubalaena glacialis', 'Eubalaena japonica',
       'Eubalaena australis', 'Balaenoptera musculus',
       'Balaenoptera brydei', 'Balaenoptera edeni',
       'Balaenoptera acutorostrata', 'Balaenoptera ricei',
       'Balaenoptera physalus', 'Balaenoptera omurai',
       'Balaenoptera borealis', 'Balaenoptera bonaerensis',
       'Megaptera novaeangliae', 'Eschrichtius robustus',
       'Caperea marginata', 'Cephalorhynchus eutropia',
       'Cephalorhynchus commersonii', 'Cephalorhynchus heavisidii',
       'Cephalorhynchus hectori', 'Delphinus delphis', 'Feresa attenuata',
       'Globicephala melas', 'Globicephala macrorhynchus',
       'Grampus griseus', 'Lagenodelphis hosei',
       'Lagenorhynchus albirostris', 'Lagenorhynchus acutus',
       'Lagenorhynchus obscurus', 'Lagenorhynchus cruciger',
       'Lagenorhynchus obliquidens', 'Lagenorhynchus australis',
       'Lissodelphis borealis', 'Lissodelphis peronii',
       'Orcaella heinsohni', 'Orcael

In [4]:
map_df = pd.read_csv('../../data/grouped_species.csv')
display(map_df.head(3))
map_df['scientific_name'].unique()

Unnamed: 0,species_name,common_name,scientific_name,event_month,latitude,longitude,individual_count,IUCN_Red_List_status
0,Bowhead,Bowhead,Balaena Mysticetus,2012-08,79.5,50.0,1.0,LC
1,Bowhead,Bowhead,Balaena Mysticetus,2017-06,80.311508,54.810083,7.5,LC
2,Bowhead,Bowhead,Balaena Mysticetus,2017-06,79.998846,47.449725,2.0,LC


array(['Balaena Mysticetus', 'Balaenoptera Acutorostrata',
       'Balaenoptera Borealis', 'Balaenoptera Brydei',
       'Balaenoptera Musculus', 'Balaenoptera Omurai',
       'Balaenoptera Physalus', 'Berardius Arnuxii', 'Berardius Bairdii',
       'Cephalorhynchus Commersonii', 'Cephalorhynchus Eutropia',
       'Cephalorhynchus Heavisidii', 'Cephalorhynchus Hectori',
       'Delphinapterus Leucas', 'Delphinus Delphis',
       'Eschrichtius Robustus', 'Eubalaena Glacialis',
       'Eubalaena Japonica', 'Globicephala Melas', 'Grampus Griseus',
       'Hyperoodon Ampullatus', 'Hyperoodon Planifrons',
       'Inia Geoffrensis', 'Kogia Breviceps', 'Kogia Sima',
       'Lagenodelphis Hosei', 'Lagenorhynchus Acutus',
       'Lagenorhynchus Australis', 'Lagenorhynchus Cruciger',
       'Lagenorhynchus Obscurus', 'Lissodelphis Borealis',
       'Lissodelphis Peronii', 'Megaptera Novaeangliae',
       'Mesoplodon Densirostris', 'Mesoplodon Europaeus',
       'Mesoplodon Layardii', 'Monodon Mo

In [5]:
wiki_names = list(map(lambda x: x.lower(), list(wiki_df['Scientific name'])))

map_names = list(map(lambda x: x.lower(), list(map_df['scientific_name'])))
map_names = list(set(map_names))
print('n map names: ', len(map_names))

common = [name for name in map_names if name in wiki_names]
print('n common lower case names map and wiki: ', len(common))
print(common)

n map names:  53
n common lower case names map and wiki:  53
['lagenorhynchus obscurus', 'peponocephala electra', 'balaenoptera omurai', 'cephalorhynchus hectori', 'lagenorhynchus acutus', 'phocoenoides dalli', 'balaenoptera physalus', 'eubalaena glacialis', 'globicephala melas', 'kogia sima', 'sousa plumbea', 'tursiops truncatus', 'lagenorhynchus cruciger', 'balaenoptera borealis', 'orcaella brevirostris', 'delphinus delphis', 'lagenorhynchus australis', 'balaenoptera musculus', 'hyperoodon planifrons', 'grampus griseus', 'mesoplodon europaeus', 'ziphius cavirostris', 'berardius arnuxii', 'inia geoffrensis', 'stenella frontalis', 'kogia breviceps', 'mesoplodon densirostris', 'balaenoptera brydei', 'pseudorca crassidens', 'berardius bairdii', 'mesoplodon layardii', 'cephalorhynchus commersonii', 'delphinapterus leucas', 'orcinus orca', 'eschrichtius robustus', 'lagenodelphis hosei', 'lissodelphis borealis', 'steno bredanensis', 'hyperoodon ampullatus', 'stenella clymene', 'balaena myst

## Map names match when lower case

In [6]:
tree_df = pd.read_csv('../../data/PhylogeneticTree/species_tree_of_life.csv')
tree_df.head(3)

Unnamed: 0,Scientific name,Common name,Wikipedia page URL,Genus,Genus Wikipedia URL,Family,Family Wikipedia URL,Subfamily,Subfamily Wikipedia URL,Superfamily,Superfamily Wikipedia URL
0,Stenella clymene,Clymene dolphin,https://en.wikipedia.org/wiki/Clymene_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
1,Stenella longirostris,Spinner dolphin,https://en.wikipedia.org/wiki/Spinner_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
2,Lagenodelphis hosei,Fraser's dolphin,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Lagenodelphis,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,


In [7]:
common_rows_tree_wiki = tree_df[tree_df['Scientific name'].isin(wiki_df['Scientific name'])]
not_common_rows_tree_wiki = tree_df[~tree_df['Scientific name'].isin(wiki_df['Scientific name'])]
print('tree wiki common names (intersection): ', len(common_rows_tree_wiki['Scientific name'].unique()))
print(len(not_common_rows_tree_wiki['Scientific name'].unique()),
      ' tree names that are not in wiki: ',
      list(not_common_rows_tree_wiki['Scientific name']))
print()

tree wiki common names (intersection):  71
4  tree names that are not in wiki:  ['Sagmatias australis', 'Sagmatias obscurus', 'Sagmatias obliquidens', 'Leucopleurus acutus']



In [8]:
common_rows_tree_wiki = wiki_df[wiki_df['Scientific name'].isin(tree_df['Scientific name'])]
not_common_rows_tree_wiki = wiki_df[~wiki_df['Scientific name'].isin(tree_df['Scientific name'])]
print('intersection: ', len(common_rows_tree_wiki['Scientific name'].unique()))
print('union minus intersection', len(not_common_rows_tree_wiki['Scientific name'].unique()))
common_rows_tree_wiki['Scientific name'].unique()

intersection:  71
union minus intersection 25


array(['Balaena mysticetus', 'Eubalaena glacialis', 'Eubalaena japonica',
       'Eubalaena australis', 'Balaenoptera musculus',
       'Balaenoptera edeni', 'Balaenoptera acutorostrata',
       'Balaenoptera physalus', 'Balaenoptera borealis',
       'Balaenoptera bonaerensis', 'Megaptera novaeangliae',
       'Eschrichtius robustus', 'Caperea marginata',
       'Cephalorhynchus commersonii', 'Cephalorhynchus heavisidii',
       'Delphinus delphis', 'Feresa attenuata', 'Globicephala melas',
       'Globicephala macrorhynchus', 'Grampus griseus',
       'Lagenodelphis hosei', 'Lagenorhynchus albirostris',
       'Lissodelphis borealis', 'Lissodelphis peronii',
       'Orcaella heinsohni', 'Orcaella brevirostris', 'Orcinus orca',
       'Peponocephala electra', 'Pseudorca crassidens', 'Sousa chinensis',
       'Sotalia guianensis', 'Stenella frontalis', 'Stenella clymene',
       'Stenella attenuata', 'Stenella longirostris',
       'Stenella coeruleoalba', 'Steno bredanensis', 'Tursiop

- There are more names in wikipedia than in maps and tree
- All scientific names used for the map can be matched to scientific names from wikipedia
- There are 4 names from the tree that can not be matched to the wiki names

## unique identification for all three sources

In [9]:
wiki_df_merge = wiki_df[['Family_common', 'Family_Latin', 'Genus', 'Common name',
       'Scientific name', 'IUCN Red List status', 'Global population estimate']].copy()
wiki_df_merge['wiki'] = 1
wiki_df_merge['Scientific name'] = wiki_names

map_df_merge = pd.DataFrame({
    'Scientific name': map_names,  # column with your strings
    'map': 1        # column with all values set to True
})

tree_names = list(map(lambda x: x.lower(), list(tree_df['Scientific name'])))
tree_df_merge = pd.DataFrame({
    'Scientific name': tree_names,  # column with your strings
    'tree': 1        # column with all values set to True
})


display(wiki_df_merge.head())
display(map_df_merge.head())
display(tree_df_merge.head())

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1
1,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,eubalaena glacialis,CR,350,1
2,right_whales,Balaenidae,Eubalaena,North Pacific right whale,eubalaena japonica,EN,"404–2,108[12]",1
3,right_whales,Balaenidae,Eubalaena,Southern right whale,eubalaena australis,LC,"13,600[13]",1
5,rorquals,Balaenopteridae,Balaenoptera,Blue whale,balaenoptera musculus,EN,"5,000–15,000",1


Unnamed: 0,Scientific name,map
0,lagenorhynchus obscurus,1
1,peponocephala electra,1
2,balaenoptera omurai,1
3,cephalorhynchus hectori,1
4,lagenorhynchus acutus,1


Unnamed: 0,Scientific name,tree
0,stenella clymene,1
1,stenella longirostris,1
2,lagenodelphis hosei,1
3,delphinus delphis,1
4,stenella coeruleoalba,1


In [10]:
merged_df = wiki_df_merge.merge(map_df_merge, how='left', on='Scientific name')
merged_df = merged_df.merge(tree_df_merge, how='outer', on='Scientific name')
merged_df[['wiki', 'map', 'tree']] = merged_df[['wiki', 'map', 'tree']].fillna(0)
merged_df['unique_id'] = list(map(lambda s: s.replace(" ", "_"), list(merged_df['Scientific name'])))
merged_df

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1.0,1.0,1.0,balaena_mysticetus
1,rorquals,Balaenopteridae,Balaenoptera,Common minke whale,balaenoptera acutorostrata,LC,200000,1.0,1.0,1.0,balaenoptera_acutorostrata
2,rorquals,Balaenopteridae,Balaenoptera,Antarctic minke whale,balaenoptera bonaerensis,NT,"515,000[18]",1.0,0.0,1.0,balaenoptera_bonaerensis
3,rorquals,Balaenopteridae,Balaenoptera,Sei whale,balaenoptera borealis,EN,"80,000[17]",1.0,1.0,1.0,balaenoptera_borealis
4,rorquals,Balaenopteridae,Balaenoptera,Bryde's whale,balaenoptera brydei,LC,"90,000–100,000",1.0,1.0,0.0,balaenoptera_brydei
...,...,...,...,...,...,...,...,...,...,...,...
95,beaked_whales,Ziphiidae,Tasmacetus,Shepherd's beaked whale,tasmacetus shepherdi,DD,Unknown[af],1.0,0.0,1.0,tasmacetus_shepherdi
96,oceanic_dolphins,Delphinidae,Tursiops,Indo-Pacific bottlenose dolphin,tursiops aduncus,NT,Unknown,1.0,0.0,1.0,tursiops_aduncus
97,oceanic_dolphins,Delphinidae,Tursiops,Tamanend's bottlenose dolphin,tursiops erebennus,NE,Unknown,1.0,0.0,0.0,tursiops_erebennus
98,oceanic_dolphins,Delphinidae,Tursiops,Common bottlenose dolphin,tursiops truncatus,LC,"600,000[22]",1.0,1.0,1.0,tursiops_truncatus


In [11]:
merged_df.shape

(100, 11)

In [12]:
merged_df['Common name'].unique().shape

(97,)

In [13]:
non_unique = merged_df['Common name'].value_counts()[lambda x: x > 1].index.tolist()
non_unique

[]

In [14]:
non_unique = merged_df.loc[merged_df['Common name'].duplicated(keep=False), 'Common name'].unique()
non_unique

array([nan], dtype=object)

In [15]:
merged_df = merged_df.fillna('null')

In [16]:
merged_df[merged_df['IUCN Red List status'] == 'CR']

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id
9,rorquals,Balaenopteridae,Balaenoptera,Rice's whale[16],balaenoptera ricei,CR,30–100,1.0,0.0,0.0,balaenoptera_ricei
22,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,eubalaena glacialis,CR,350,1.0,1.0,1.0,eubalaena_glacialis
43,baiji,Lipotidae,Lipotes,Baiji,lipotes vexillifer,CR,0–13[u],1.0,0.0,1.0,lipotes_vexillifer
72,porpoises,Phocoenidae,Phocoena,Vaquita,phocoena sinus,CR,12[29],1.0,0.0,0.0,phocoena_sinus
88,oceanic_dolphins,Delphinidae,Sousa,Atlantic humpback dolphin,sousa teuszii,CR,1500,1.0,0.0,0.0,sousa_teuszii


In [17]:
tree_images = pd.read_csv("../../data/WikipediaImages/image_urls.csv")
tree_images["Common name"] = tree_images["Common name"].apply(str.lower)
tree_images

Unnamed: 0,Common name,Image URL
0,clymene dolphin,https://upload.wikimedia.org/wikipedia/commons...
1,spinner dolphin,https://upload.wikimedia.org/wikipedia/commons...
2,fraser's dolphin,https://upload.wikimedia.org/wikipedia/commons...
3,common dolphin,https://upload.wikimedia.org/wikipedia/commons...
4,striped dolphin,https://upload.wikimedia.org/wikipedia/commons...
...,...,...
70,pygmy right whale,https://upload.wikimedia.org/wikipedia/commons...
71,north atlantic right whale,https://upload.wikimedia.org/wikipedia/commons...
72,north pacific right whale,https://upload.wikimedia.org/wikipedia/commons...
73,southern right whale,https://upload.wikimedia.org/wikipedia/commons...


In [18]:
tree_wiki_pages = pd.read_csv("../../data/PhylogeneticTree/species_tree_of_life.csv", usecols=["Scientific name", "Common name", "Wikipedia page URL"])
tree_wiki_pages[["Scientific name", "Common name"]] = tree_wiki_pages[["Scientific name", "Common name"]].apply(lambda x: x.str.lower())
tree_wiki_pages

Unnamed: 0,Scientific name,Common name,Wikipedia page URL
0,stenella clymene,clymene dolphin,https://en.wikipedia.org/wiki/Clymene_dolphin
1,stenella longirostris,spinner dolphin,https://en.wikipedia.org/wiki/Spinner_dolphin
2,lagenodelphis hosei,fraser's dolphin,https://en.wikipedia.org/wiki/Fraser%27s_dolphin
3,delphinus delphis,common dolphin,https://en.wikipedia.org/wiki/Common_dolphin
4,stenella coeruleoalba,striped dolphin,https://en.wikipedia.org/wiki/Striped_dolphin
...,...,...,...
70,caperea marginata,pygmy right whale,https://en.wikipedia.org/wiki/Pygmy_right_whale
71,eubalaena glacialis,north atlantic right whale,https://en.wikipedia.org/wiki/North_Atlantic_r...
72,eubalaena japonica,north pacific right whale,https://en.wikipedia.org/wiki/North_Pacific_ri...
73,eubalaena australis,southern right whale,https://en.wikipedia.org/wiki/Southern_right_w...


In [19]:
# Checking that all names from the tree do appear in the merged_df table
untraced_names = [name for name in tree_wiki_pages["Common name"] if name not in merged_df["Common name"].apply(str.lower).values]
untraced_names # no names from the tree are untraced: we are happy

['white beaked dolphin',
 'beluga whale',
 "blaincille's beaked whale",
 "gervais's beaked whale",
 'antartic minke whale']

In [20]:
merged_tree_images = tree_images.merge(tree_wiki_pages, on="Common name", how="left").drop("Common name", axis=1)
merged_df_with_images = merged_df.merge(merged_tree_images, on="Scientific name", how="left")
display(merged_df_with_images)

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id,Image URL,Wikipedia page URL
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1.0,1.0,1.0,balaena_mysticetus,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Bowhead_whale
1,rorquals,Balaenopteridae,Balaenoptera,Common minke whale,balaenoptera acutorostrata,LC,200000,1.0,1.0,1.0,balaenoptera_acutorostrata,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Common_minke_whale
2,rorquals,Balaenopteridae,Balaenoptera,Antarctic minke whale,balaenoptera bonaerensis,NT,"515,000[18]",1.0,0.0,1.0,balaenoptera_bonaerensis,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Antarctic_minke_...
3,rorquals,Balaenopteridae,Balaenoptera,Sei whale,balaenoptera borealis,EN,"80,000[17]",1.0,1.0,1.0,balaenoptera_borealis,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Sei_whale
4,rorquals,Balaenopteridae,Balaenoptera,Bryde's whale,balaenoptera brydei,LC,"90,000–100,000",1.0,1.0,0.0,balaenoptera_brydei,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,beaked_whales,Ziphiidae,Tasmacetus,Shepherd's beaked whale,tasmacetus shepherdi,DD,Unknown[af],1.0,0.0,1.0,tasmacetus_shepherdi,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Shepherd%27s_bea...
96,oceanic_dolphins,Delphinidae,Tursiops,Indo-Pacific bottlenose dolphin,tursiops aduncus,NT,Unknown,1.0,0.0,1.0,tursiops_aduncus,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Indo-Pacific_bot...
97,oceanic_dolphins,Delphinidae,Tursiops,Tamanend's bottlenose dolphin,tursiops erebennus,NE,Unknown,1.0,0.0,0.0,tursiops_erebennus,,
98,oceanic_dolphins,Delphinidae,Tursiops,Common bottlenose dolphin,tursiops truncatus,LC,"600,000[22]",1.0,1.0,1.0,tursiops_truncatus,https://upload.wikimedia.org/wikipedia/commons...,https://en.wikipedia.org/wiki/Common_bottlenos...


In [21]:
missing_articles = merged_df_with_images[merged_df_with_images["Wikipedia page URL"].isna()]
missing_articles = missing_articles[["Scientific name", "Common name"]].copy()

In [22]:
%load_ext autoreload
%autoreload 2

In [23]:
from image_url_scrapper import WikiImageScrapper

wiki_scrapper = WikiImageScrapper(missing_articles)
wiki_scrapper.get_wiki_urls_from_names()
wiki_scrapper.scrap_image_URLs()
scrapped_images_urls_df = wiki_scrapper.wiki_urls_df[["Scientific name", "Image URL"]]

Searching Wikipedia pages: 26 names [00:12,  2.04 names/s]
Scraping Wikipedia images: 26pages [00:11,  2.29pages/s]

All done!





In [24]:
merged_tree_images

Unnamed: 0,Image URL,Scientific name,Wikipedia page URL
0,https://upload.wikimedia.org/wikipedia/commons...,stenella clymene,https://en.wikipedia.org/wiki/Clymene_dolphin
1,https://upload.wikimedia.org/wikipedia/commons...,stenella longirostris,https://en.wikipedia.org/wiki/Spinner_dolphin
2,https://upload.wikimedia.org/wikipedia/commons...,lagenodelphis hosei,https://en.wikipedia.org/wiki/Fraser%27s_dolphin
3,https://upload.wikimedia.org/wikipedia/commons...,delphinus delphis,https://en.wikipedia.org/wiki/Common_dolphin
4,https://upload.wikimedia.org/wikipedia/commons...,stenella coeruleoalba,https://en.wikipedia.org/wiki/Striped_dolphin
...,...,...,...
70,https://upload.wikimedia.org/wikipedia/commons...,caperea marginata,https://en.wikipedia.org/wiki/Pygmy_right_whale
71,https://upload.wikimedia.org/wikipedia/commons...,eubalaena glacialis,https://en.wikipedia.org/wiki/North_Atlantic_r...
72,https://upload.wikimedia.org/wikipedia/commons...,eubalaena japonica,https://en.wikipedia.org/wiki/North_Pacific_ri...
73,https://upload.wikimedia.org/wikipedia/commons...,eubalaena australis,https://en.wikipedia.org/wiki/Southern_right_w...


In [25]:
images_urls = pd.concat([scrapped_images_urls_df, merged_tree_images[["Scientific name", "Image URL"]]], axis=0)
images_urls

Unnamed: 0,Scientific name,Image URL
0,balaenoptera brydei,https://upload.wikimedia.org/wikipedia/commons...
1,balaenoptera omurai,https://upload.wikimedia.org/wikipedia/commons...
2,balaenoptera ricei,https://upload.wikimedia.org/wikipedia/commons...
3,berardius minimus,https://upload.wikimedia.org/wikipedia/commons...
4,cephalorhynchus eutropia,https://upload.wikimedia.org/wikipedia/commons...
...,...,...
70,caperea marginata,https://upload.wikimedia.org/wikipedia/commons...
71,eubalaena glacialis,https://upload.wikimedia.org/wikipedia/commons...
72,eubalaena japonica,https://upload.wikimedia.org/wikipedia/commons...
73,eubalaena australis,https://upload.wikimedia.org/wikipedia/commons...


In [26]:
total_merged_df = merged_df.merge(images_urls, on= "Scientific name", how="left")
total_merged_df

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id,Image URL
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1.0,1.0,1.0,balaena_mysticetus,https://upload.wikimedia.org/wikipedia/commons...
1,rorquals,Balaenopteridae,Balaenoptera,Common minke whale,balaenoptera acutorostrata,LC,200000,1.0,1.0,1.0,balaenoptera_acutorostrata,https://upload.wikimedia.org/wikipedia/commons...
2,rorquals,Balaenopteridae,Balaenoptera,Antarctic minke whale,balaenoptera bonaerensis,NT,"515,000[18]",1.0,0.0,1.0,balaenoptera_bonaerensis,https://upload.wikimedia.org/wikipedia/commons...
3,rorquals,Balaenopteridae,Balaenoptera,Sei whale,balaenoptera borealis,EN,"80,000[17]",1.0,1.0,1.0,balaenoptera_borealis,https://upload.wikimedia.org/wikipedia/commons...
4,rorquals,Balaenopteridae,Balaenoptera,Bryde's whale,balaenoptera brydei,LC,"90,000–100,000",1.0,1.0,0.0,balaenoptera_brydei,https://upload.wikimedia.org/wikipedia/commons...
...,...,...,...,...,...,...,...,...,...,...,...,...
95,beaked_whales,Ziphiidae,Tasmacetus,Shepherd's beaked whale,tasmacetus shepherdi,DD,Unknown[af],1.0,0.0,1.0,tasmacetus_shepherdi,https://upload.wikimedia.org/wikipedia/commons...
96,oceanic_dolphins,Delphinidae,Tursiops,Indo-Pacific bottlenose dolphin,tursiops aduncus,NT,Unknown,1.0,0.0,1.0,tursiops_aduncus,https://upload.wikimedia.org/wikipedia/commons...
97,oceanic_dolphins,Delphinidae,Tursiops,Tamanend's bottlenose dolphin,tursiops erebennus,NE,Unknown,1.0,0.0,0.0,tursiops_erebennus,https://upload.wikimedia.org/wikipedia/commons...
98,oceanic_dolphins,Delphinidae,Tursiops,Common bottlenose dolphin,tursiops truncatus,LC,"600,000[22]",1.0,1.0,1.0,tursiops_truncatus,https://upload.wikimedia.org/wikipedia/commons...


In [27]:
sci_to_common = {
    'sagmatias australis': "Peale's dolphin",
    'sagmatias obscurus': "Dusky dolphin",
    'sagmatias obliquidens': "Pacific white-sided dolphin",
    'leucopleurus acutus': "Atlantic white-sided dolphin"
}

# Update the 'Common name' column where 'Scientific name' matches
for sci_name, common_name in sci_to_common.items():
    mask = total_merged_df['Scientific name'].str.lower() == sci_name.lower()
    total_merged_df.loc[mask, 'Common name'] = common_name

# Display the updated rows
display(total_merged_df[
    total_merged_df['Scientific name'].str.lower().isin(sci_to_common.keys())
])

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id,Image URL
42,,,,Atlantic white-sided dolphin,leucopleurus acutus,,,0.0,0.0,1.0,leucopleurus_acutus,https://upload.wikimedia.org/wikipedia/commons...
80,,,,Peale's dolphin,sagmatias australis,,,0.0,0.0,1.0,sagmatias_australis,https://upload.wikimedia.org/wikipedia/commons...
81,,,,Pacific white-sided dolphin,sagmatias obliquidens,,,0.0,0.0,1.0,sagmatias_obliquidens,https://upload.wikimedia.org/wikipedia/commons...
82,,,,Dusky dolphin,sagmatias obscurus,,,0.0,0.0,1.0,sagmatias_obscurus,https://upload.wikimedia.org/wikipedia/commons...


In [28]:
total_merged_df[total_merged_df['Common name'].str.contains(r'\[|\]', na=False)]

Unnamed: 0,Family_common,Family_Latin,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id,Image URL
5,rorquals,Balaenopteridae,Balaenoptera,Eden's whale[a],balaenoptera edeni,LC,Unknown,1.0,0.0,1.0,balaenoptera_edeni,https://upload.wikimedia.org/wikipedia/commons...
9,rorquals,Balaenopteridae,Balaenoptera,Rice's whale[16],balaenoptera ricei,CR,30–100,1.0,0.0,0.0,balaenoptera_ricei,https://upload.wikimedia.org/wikipedia/commons...
31,river_dolphins,Iniidae,Inia,Araguaian river dolphin[r],inia araguaiaensis,NE,Unknown,1.0,0.0,0.0,inia_araguaiaensis,https://upload.wikimedia.org/wikipedia/commons...


In [29]:
# remove the brackets and content
total_merged_df['Common name'] = total_merged_df['Common name'].str.replace(r'\[.*?\]', '', regex=True)

In [30]:
import json
df_json = total_merged_df.to_json(orient='records')

with open("cetacean_names.json", "w") as outfile:
    outfile.write(df_json)