## Which species are observed in all three data sources?

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import requests
from bs4 import BeautifulSoup
import re

In [5]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_cetaceans'

response = requests.get(wiki_url)
soup = BeautifulSoup(response.text, 'html.parser')

html_tables = soup.find_all('table', {'class': 'wikitable'})
family_names = soup.find_all('div', {'class': 'mw-heading mw-heading3'})
family_names = [name.find("h3")["id"] for name in family_names]

In [51]:
all_rows = []
for i, wiki_table in enumerate(html_tables):

    family_name = family_names[i]
    rows = wiki_table.find_all("tr")

    for row in rows:

        if 'Genus' in row.get_text():
            try:
                genus_name = row.find("i").find("b").get_text(strip=True)
            except:
                raise ValueError('problem with genus row: ', row)

        if 'Genus' not in row.get_text() and 'Scientific' not in row.get_text():
            info_row = []
            for col in row.find_all("td"):
                if col.find('i') is not None:
                    info = col.find('i').get_text(strip=True)
                elif col.find('b') is not None:
                    info = col.find('b').get_text(strip=True)
                else:
                    info = col.get_text(strip=True)
                info_row.append(info)
            row_info = [family_name, genus_name] + info_row
            all_rows.append(row_info)

first_table = html_tables[0].find_all("tr")
wiki_df = pd.DataFrame(all_rows, columns=['Family', 'Genus'] + [th.get_text(strip=True) for th in first_table[1].find_all("th")])

names = wiki_df.Family

family_latin = []
family_common = []

for name in names:
    result = re.search('Family_(.*):_', name)
    family_latin.append(result.group(1))
    result = re.search(':_(.*)', name)
    family_common.append(result.group(1))

wiki_df.drop(columns='Family', inplace=True)
wiki_df.insert(loc=0, column='Family_Greek', value=family_latin)
wiki_df.insert(loc=0, column='Family_common', value=family_common)

wiki_df = wiki_df.dropna()

display(wiki_df.head(5))
wiki_df['Scientific name'].unique()

Unnamed: 0,Family_common,Family_Greek,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,Range,Size,Picture
0,right_whales,Balaenidae,Balaena,Bowhead whale,Balaena mysticetus,LC,10000,,60 t (66 short tons),
1,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,Eubalaena glacialis,CR,350,,40–80 t (44–88 short tons),
2,right_whales,Balaenidae,Eubalaena,North Pacific right whale,Eubalaena japonica,EN,"404–2,108[12]",,60–80 t (66–88 short tons),
3,right_whales,Balaenidae,Eubalaena,Southern right whale,Eubalaena australis,LC,"13,600[13]",,40–80 t (44–88 short tons),
5,rorquals,Balaenopteridae,Balaenoptera,Blue whale,Balaenoptera musculus,EN,"5,000–15,000",,50–150 t (55–165 short tons),


array(['Balaena mysticetus', 'Eubalaena glacialis', 'Eubalaena japonica',
       'Eubalaena australis', 'Balaenoptera musculus',
       'Balaenoptera brydei', 'Balaenoptera edeni',
       'Balaenoptera acutorostrata', 'Balaenoptera ricei',
       'Balaenoptera physalus', 'Balaenoptera omurai',
       'Balaenoptera borealis', 'Balaenoptera bonaerensis',
       'Megaptera novaeangliae', 'Eschrichtius robustus',
       'Caperea marginata', 'Cephalorhynchus eutropia',
       'Cephalorhynchus commersonii', 'Cephalorhynchus heavisidii',
       'Cephalorhynchus hectori', 'Delphinus delphis', 'Feresa attenuata',
       'Globicephala melas', 'Globicephala macrorhynchus',
       'Grampus griseus', 'Lagenodelphis hosei',
       'Lagenorhynchus albirostris', 'Lagenorhynchus acutus',
       'Lagenorhynchus obscurus', 'Lagenorhynchus cruciger',
       'Lagenorhynchus obliquidens', 'Lagenorhynchus australis',
       'Lissodelphis borealis', 'Lissodelphis peronii',
       'Orcaella heinsohni', 'Orcael

In [12]:
map_df = pd.read_csv('data/grouped_species.csv')
display(map_df.head(3))
map_df['scientific_name'].unique()

Unnamed: 0,species_name,common_name,scientific_name,event_month,latitude,longitude,individual_count,endangered,vulnerable
0,Bowhead,Bowhead,Balaena Mysticetus,2012-08,79.5,50.0,1.0,False,False
1,Bowhead,Bowhead,Balaena Mysticetus,2017-06,80.311508,54.810083,7.5,False,False
2,Bowhead,Bowhead,Balaena Mysticetus,2017-06,79.998846,47.449725,2.0,False,False


array(['Balaena Mysticetus', 'Balaenoptera Acutorostrata',
       'Balaenoptera Borealis', 'Balaenoptera Brydei',
       'Balaenoptera Musculus', 'Balaenoptera Omurai',
       'Balaenoptera Physalus', 'Berardius Arnuxii', 'Berardius Bairdii',
       'Cephalorhynchus Commersonii', 'Cephalorhynchus Eutropia',
       'Cephalorhynchus Heavisidii', 'Cephalorhynchus Hectori',
       'Delphinapterus Leucas', 'Delphinus Delphis',
       'Eschrichtius Robustus', 'Eubalaena Glacialis',
       'Eubalaena Japonica', 'Globicephala Melas', 'Grampus Griseus',
       'Hyperoodon Ampullatus', 'Hyperoodon Planifrons',
       'Inia Geoffrensis', 'Kogia Breviceps', 'Kogia Sima',
       'Lagenodelphis Hosei', 'Lagenorhynchus Acutus',
       'Lagenorhynchus Australis', 'Lagenorhynchus Cruciger',
       'Lagenorhynchus Obscurus', 'Lissodelphis Borealis',
       'Lissodelphis Peronii', 'Megaptera Novaeangliae',
       'Mesoplodon Densirostris', 'Mesoplodon Europaeus',
       'Mesoplodon Layardii', 'Monodon Mo

In [72]:
wiki_names = list(map(lambda x: x.lower(), list(wiki_df['Scientific name'])))

map_names = list(map(lambda x: x.lower(), list(map_df['scientific_name'])))
map_names = list(set(map_names))
print('n map names: ', len(map_names))

common = [name for name in map_names if name in wiki_names]
print('n common names map and wiki: ', len(common))
print(common)

n map names:  53
n common names map and wiki:  53
['balaenoptera omurai', 'megaptera novaeangliae', 'lagenorhynchus acutus', 'delphinus delphis', 'stenella coeruleoalba', 'sousa plumbea', 'steno bredanensis', 'eubalaena glacialis', 'mesoplodon europaeus', 'berardius bairdii', 'balaenoptera physalus', 'tursiops truncatus', 'lagenorhynchus cruciger', 'cephalorhynchus hectori', 'grampus griseus', 'hyperoodon ampullatus', 'cephalorhynchus heavisidii', 'eubalaena japonica', 'kogia sima', 'balaenoptera brydei', 'lagenorhynchus obscurus', 'pontoporia blainvillei', 'balaenoptera musculus', 'inia geoffrensis', 'mesoplodon layardii', 'ziphius cavirostris', 'lagenodelphis hosei', 'phocoena phocoena', 'stenella frontalis', 'eschrichtius robustus', 'berardius arnuxii', 'orcaella brevirostris', 'phocoena spinipinnis', 'phocoenoides dalli', 'pseudorca crassidens', 'balaenoptera borealis', 'balaenoptera acutorostrata', 'balaena mysticetus', 'cephalorhynchus commersonii', 'globicephala melas', 'lagenor

In [10]:
tree_df = pd.read_csv('data/PhilogeneticTree/species_tree_of_life.csv')
tree_df.head(3)

Unnamed: 0,Scientific name,Common name,Wikipedia page URL,Genus,Genus Wikipedia URL,Family,Family Wikipedia URL,Subfamily,Subfamily Wikipedia URL,Superfamily,Superfamily Wikipedia URL
0,Stenella clymene,Clymene dolphin,https://en.wikipedia.org/wiki/Clymene_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
1,Stenella longirostris,Spinner dolphin,https://en.wikipedia.org/wiki/Spinner_dolphin,Stenella,https://en.wikipedia.org/wiki/Stenella,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,
2,Lagenodelphis hosei,Fraser's dolphin,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Lagenodelphis,https://en.wikipedia.org/wiki/Fraser%27s_dolphin,Delphinidae,https://en.wikipedia.org/wiki/Oceanic_dolphin,Delphininae,https://en.wikipedia.org/wiki/Oceanic_dolphin#...,,


In [57]:
common_rows_tree_wiki = tree_df[tree_df['Scientific name'].isin(wiki_df['Scientific name'])]
not_common_rows_tree_wiki = tree_df[~tree_df['Scientific name'].isin(wiki_df['Scientific name'])]
print('tree wiki intersection: ', len(common_rows_tree_wiki['Scientific name'].unique()))
print(len(not_common_rows_tree_wiki['Scientific name'].unique()),
      ' tree names that are not in wiki: ',
      list(not_common_rows_tree_wiki['Scientific name']))
print()

tree wiki intersection:  71
4  tree names that are not in wiki:  ['Sagmatias australis', 'Sagmatias obscurus', 'Sagmatias obliquidens', 'Leucopleurus acutus']



In [53]:
common_rows_tree_wiki = wiki_df[wiki_df['Scientific name'].isin(tree_df['Scientific name'])]
not_common_rows_tree_wiki = wiki_df[~wiki_df['Scientific name'].isin(tree_df['Scientific name'])]
print('intersection: ', len(common_rows_tree_wiki['Scientific name'].unique()))
print('union minus intersection', len(not_common_rows_tree_wiki['Scientific name'].unique()))
common_rows_tree_wiki['Scientific name'].unique()

intersection:  71
union minus intersection 25


array(['Balaena mysticetus', 'Eubalaena glacialis', 'Eubalaena japonica',
       'Eubalaena australis', 'Balaenoptera musculus',
       'Balaenoptera edeni', 'Balaenoptera acutorostrata',
       'Balaenoptera physalus', 'Balaenoptera borealis',
       'Balaenoptera bonaerensis', 'Megaptera novaeangliae',
       'Eschrichtius robustus', 'Caperea marginata',
       'Cephalorhynchus commersonii', 'Cephalorhynchus heavisidii',
       'Delphinus delphis', 'Feresa attenuata', 'Globicephala melas',
       'Globicephala macrorhynchus', 'Grampus griseus',
       'Lagenodelphis hosei', 'Lagenorhynchus albirostris',
       'Lissodelphis borealis', 'Lissodelphis peronii',
       'Orcaella heinsohni', 'Orcaella brevirostris', 'Orcinus orca',
       'Peponocephala electra', 'Pseudorca crassidens', 'Sousa chinensis',
       'Sotalia guianensis', 'Stenella frontalis', 'Stenella clymene',
       'Stenella attenuata', 'Stenella longirostris',
       'Stenella coeruleoalba', 'Steno bredanensis', 'Tursiop

- There are more names in wikipedia than in maps and tree
- All scientific names used for the map can be matched to scientific names from wikipedia
- There are 4 names from the tree that can not be matched to the wiki names

## unique identification for all three sources

In [89]:
wiki_df_merge = wiki_df[['Family_common', 'Family_Greek', 'Genus', 'Common name',
       'Scientific name', 'IUCN Red List status', 'Global population estimate']].copy()
wiki_df_merge['wiki'] = 1
wiki_df_merge['Scientific name'] = wiki_names

map_df_merge = pd.DataFrame({
    'Scientific name': map_names,  # column with your strings
    'map': 1        # column with all values set to True
})

tree_names = list(map(lambda x: x.lower(), list(tree_df['Scientific name'])))
tree_df_merge = pd.DataFrame({
    'Scientific name': tree_names,  # column with your strings
    'tree': 1        # column with all values set to True
})


display(wiki_df_merge.head())
display(map_df_merge.head())
display(tree_df_merge.head())

Unnamed: 0,Family_common,Family_Greek,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1
1,right_whales,Balaenidae,Eubalaena,North Atlantic right whale,eubalaena glacialis,CR,350,1
2,right_whales,Balaenidae,Eubalaena,North Pacific right whale,eubalaena japonica,EN,"404–2,108[12]",1
3,right_whales,Balaenidae,Eubalaena,Southern right whale,eubalaena australis,LC,"13,600[13]",1
5,rorquals,Balaenopteridae,Balaenoptera,Blue whale,balaenoptera musculus,EN,"5,000–15,000",1


Unnamed: 0,Scientific name,map
0,balaenoptera omurai,1
1,megaptera novaeangliae,1
2,lagenorhynchus acutus,1
3,delphinus delphis,1
4,stenella coeruleoalba,1


Unnamed: 0,Scientific name,tree
0,stenella clymene,1
1,stenella longirostris,1
2,lagenodelphis hosei,1
3,delphinus delphis,1
4,stenella coeruleoalba,1


In [94]:
merged_df = wiki_df_merge.merge(map_df_merge, how='left', on='Scientific name')
merged_df = merged_df.merge(tree_df_merge, how='outer', on='Scientific name')
merged_df[['wiki', 'map', 'tree']] = merged_df[['wiki', 'map', 'tree']].fillna(0)
merged_df['unique_id'] = list(map(lambda s: s.replace(" ", "_"), list(merged_df['Scientific name'])))
merged_df

Unnamed: 0,Family_common,Family_Greek,Genus,Common name,Scientific name,IUCN Red List status,Global population estimate,wiki,map,tree,unique_id
0,right_whales,Balaenidae,Balaena,Bowhead whale,balaena mysticetus,LC,10000,1.0,1.0,1.0,balaena_mysticetus
1,rorquals,Balaenopteridae,Balaenoptera,Common minke whale,balaenoptera acutorostrata,LC,200000,1.0,1.0,1.0,balaenoptera_acutorostrata
2,rorquals,Balaenopteridae,Balaenoptera,Antarctic minke whale,balaenoptera bonaerensis,NT,"515,000[18]",1.0,0.0,1.0,balaenoptera_bonaerensis
3,rorquals,Balaenopteridae,Balaenoptera,Sei whale,balaenoptera borealis,EN,"80,000[17]",1.0,1.0,1.0,balaenoptera_borealis
4,rorquals,Balaenopteridae,Balaenoptera,Bryde's whale,balaenoptera brydei,LC,"90,000–100,000",1.0,1.0,0.0,balaenoptera_brydei
...,...,...,...,...,...,...,...,...,...,...,...
95,beaked_whales,Ziphiidae,Tasmacetus,Shepherd's beaked whale,tasmacetus shepherdi,DD,Unknown[af],1.0,0.0,1.0,tasmacetus_shepherdi
96,oceanic_dolphins,Delphinidae,Tursiops,Indo-Pacific bottlenose dolphin,tursiops aduncus,NT,Unknown,1.0,0.0,1.0,tursiops_aduncus
97,oceanic_dolphins,Delphinidae,Tursiops,Tamanend's bottlenose dolphin,tursiops erebennus,NE,Unknown,1.0,0.0,0.0,tursiops_erebennus
98,oceanic_dolphins,Delphinidae,Tursiops,Common bottlenose dolphin,tursiops truncatus,LC,"600,000[22]",1.0,1.0,1.0,tursiops_truncatus


In [97]:
import json
df_json = merged_df.to_json(orient='records')
df_json

with open("cetacean_names.json", "w") as outfile:
    outfile.write(df_json)