# MARIS to WoRMS
Reconciling MARIS species names with their counterparts in WoRMS (World Register of Marine Species).

In [None]:
import pandas as pd
from pathlib import Path
from fastcore.xtras import save_pickle, load_pickle
from marisco.utils import match_worms
from tqdm import tqdm

## Load MARIS Excel lookup table

In [None]:
# Load the MARIS species lookup table from its Excel export (first sheet).
df_species = pd.read_excel(io='../files/lut/dbo_species.xlsx', sheet_name=0)

In [None]:
# Preview the first rows of the MARIS species lookup table.
df_species.head()

Unnamed: 0,species_id,species,code,biogroup,biogroup_id,organism,codorg,Taxonname,Taxonrank,Commonname,TaxonDB,TaxonDBID,TaxonDBURL,TaxonDB2,TaxonDBID2,TaxonDBURL2,SQL
0,0,Not available,0,,0,NOT AVAILABLE,0,(Not available),Not available,,Not available,Not available,Not available,,,,
1,1,Aristeus antennatus,AA,,2,DECAPODA,CRUDEC,Aristeus antennatus,species,,Wikidata,Q500678,https://www.wikidata.org/wiki/Q500678,WoRMS,107083.0,http://www.marinespecies.org/aphia.php?p=taxde...,if ((select species_id from species where taxo...
2,2,Apostichopus,xx,,3,xx,xx,Apostichopus,genus,,Wikidata,Q4780592,https://www.wikidata.org/wiki/Q4780592,,,,if ((select species_id from species where taxo...
3,3,Saccharina japonica var. religiosa,xx,,11,xx,xx,Saccharina japonica var. religiosa,variety,,Wikidata,Q68486806,https://www.wikidata.org/wiki/Q68486806,,,,if ((select species_id from species where taxo...
4,4,Siganus fuscescens,xx,,4,xx,xx,Siganus fuscescens,species,,Wikidata,Q867673,https://www.wikidata.org/wiki/Q867673,,,,if ((select species_id from species where taxo...


## Query WoRMS db

In [None]:
# Map every MARIS species name to the result returned by `match_worms`.
# This cell is slow (one remote lookup per species).
maris2worms = {}

for species in tqdm(df_species['species'].values):
    # 'Not available' is the lookup table's placeholder row, not a taxon:
    # skip it up front instead of querying it and deleting the entry afterwards
    # (which also raised KeyError whenever the placeholder was absent).
    if species == 'Not available':
        continue
    maris2worms[species] = match_worms(species)

100%|██████████| 1611/1611 [1:44:11<00:00,  3.88s/it]


In [None]:
# Inspect the raw lookup result stored for a single species.
maris2worms['Siganus fuscescens']

[[{'AphiaID': 273912,
   'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=273912',
   'scientificname': 'Siganus fuscescens',
   'authority': '(Houttuyn, 1782)',
   'status': 'accepted',
   'unacceptreason': None,
   'taxonRankID': 220,
   'rank': 'Species',
   'valid_AphiaID': 273912,
   'valid_name': 'Siganus fuscescens',
   'valid_authority': '(Houttuyn, 1782)',
   'parentNameUsageID': 126071,
   'kingdom': 'Animalia',
   'phylum': 'Chordata',
   'class': 'Teleostei',
   'order': 'Acanthuriformes',
   'family': 'Siganidae',
   'genus': 'Siganus',
   'citation': 'Froese, R. and D. Pauly. Editors. (2023). FishBase. Siganus fuscescens (Houttuyn, 1782). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=273912 on 2023-06-16',
   'lsid': 'urn:lsid:marinespecies.org:taxname:273912',
   'isMarine': 1,
   'isBrackish': 1,
   'isFreshwater': 0,
   'isTerrestrial': 0,
   'isExtinct': None,
   'match_type': 'exact',
   

### Save or load

In [None]:
# Persist the lookups when they were (re)computed in this session; otherwise
# reload the previous dump so the slow query cell can be skipped.
fname_dump = Path('../files/dump') / 'maris2worms.pkl'
# Look the name up through globals() so that a fresh kernel, where the query
# cell was not run and `maris2worms` is undefined, falls through to the load
# branch instead of raising NameError.
if globals().get('maris2worms'):
    save_pickle(fname_dump, maris2worms)
else:
    # NOTE(review): presumably this unpickles the file; only load dumps
    # produced by this notebook, never untrusted ones.
    maris2worms = load_pickle(fname_dump)

exist


## Summary stats

In [None]:
# Per-species summary: number of candidates plus the attributes of the first one.
results = {'maris_species': [], 'n_matches': [],
           'aphiaid': [], 'status': [], 'worms_species': []}

for name, lookup in maris2worms.items():
    if lookup == -1:
        # -1 is the "no match" marker: record zero matches and empty attributes
        summary = (0, -1, None, None)
    else:
        candidates = lookup[0]
        first = candidates[0]
        summary = (len(candidates), first['AphiaID'],
                   first['status'], first['scientificname'])

    results['maris_species'].append(name)
    for col, value in zip(('n_matches', 'aphiaid', 'status', 'worms_species'), summary):
        results[col].append(value)

In [None]:
# Tabulate the per-species summary and preview it.
df = pd.DataFrame(data=results)
df.head()

Unnamed: 0,maris_species,n_matches,aphiaid,status,worms_species
0,Aristeus antennatus,1,107083,accepted,Aristeus antennatus
1,Apostichopus,1,241373,accepted,Apostichopus
2,Saccharina japonica var. religiosa,1,847544,accepted,Saccharina japonica var. religiosa
3,Siganus fuscescens,1,273912,accepted,Siganus fuscescens
4,Alpheus dentipes,1,107475,accepted,Alpheus dentipes


In [None]:
# Species for which no match was recorded.
no_match = df['n_matches'].eq(0)
df_missing = df.loc[no_match]
print(f'# missing: {len(df_missing)}')
df_missing.head(20)

# missing: 34


Unnamed: 0,maris_species,n_matches,aphiaid,status,worms_species
46,Carassius auratus auratus,0,-1,,
158,Neuston,0,-1,,
254,Pusa sibirica,0,-1,,
276,Fish larvae,0,-1,,
586,Parastichopus nigripunctatus,0,-1,,
662,Scomberomorini,0,-1,,
663,Channa argus,0,-1,,
694,Zooplankton,0,-1,,
695,Phytoplankton,0,-1,,
733,Marine algae,0,-1,,


In [None]:
# Species whose lookup returned more than one candidate match.
df_multi = df[df['n_matches'] > 1]
# Label fixed: this count is the multiple-match count, not the missing count
# (the previous text was copy-pasted from the "missing" cell).
print(f'# multiple matches: {len(df_multi)}')
df_multi.head(20)

# missing: 34


Unnamed: 0,maris_species,n_matches,aphiaid,status,worms_species
39,Balaenoptera musculus,2,137090,accepted,Balaenoptera musculus
72,Chrysaora,2,135261,accepted,Chrysaora
92,Delphinus capensis,3,383815,unaccepted,Delphinus capensis
135,Holothuria,2,123456,accepted,Holothuria
167,Mesoplodon densirostris,3,137122,accepted,Mesoplodon densirostris
241,Polysiphonia,3,143853,accepted,Polysiphonia
245,Gonostomatidae,2,125601,accepted,Gonostomatidae
408,Gadus,2,125732,accepted,Gadus
558,Synagrops japonicus,2,367291,accepted,Synagrops japonicus
700,Ceramium virgatum,2,178915,accepted,Ceramium virgatum


In [None]:
# Species whose first match carries the 'accepted' status.
is_accepted = df['status'].eq('accepted')
df_accepted = df.loc[is_accepted]
print(f'# accepted: {len(df_accepted)}')
df_accepted.head(20)

# accepted: 1409


Unnamed: 0,maris_species,n_matches,aphiaid,status,worms_species
0,Aristeus antennatus,1,107083,accepted,Aristeus antennatus
1,Apostichopus,1,241373,accepted,Apostichopus
2,Saccharina japonica var. religiosa,1,847544,accepted,Saccharina japonica var. religiosa
3,Siganus fuscescens,1,273912,accepted,Siganus fuscescens
4,Alpheus dentipes,1,107475,accepted,Alpheus dentipes
5,Hexagrammos agrammus,1,279410,accepted,Hexagrammos agrammus
6,Ditrema temminckii,1,280570,accepted,Ditrema temminckii
7,Parapristipoma trilineatum,1,273480,accepted,Parapristipoma trilineatum
8,Scombrops boops,1,220088,accepted,Scombrops boops
9,Pseudopleuronectes schrenki,1,275359,accepted,Pseudopleuronectes schrenki


## Excel lookup table (LUT) expansion

In [None]:
import numpy as np

In [None]:
# Expand the lookups into column lists: one row per (species, candidate match),
# plus a single NaN-filled row for each species without a match.

# Derive the attribute columns from the union of keys over every match, in
# first-seen order, rather than from one hard-coded species. The previous
# template failed if that species was unmatched or absent, and any record with
# an extra or missing key produced a KeyError or ragged columns.
cols_worms = {}
for lookup in maris2worms.values():
    if lookup == -1:
        continue
    for match in lookup[0]:
        for key in match:
            cols_worms.setdefault(key, [])

# `cols_all` shares its list objects with `cols_worms`, as before.
cols_all = {'species': []}
cols_all.update(cols_worms)

for k, lookup in maris2worms.items():
    # -1 is the "no match" marker: emit one row with every attribute missing
    records = [{}] if lookup == -1 else lookup[0]
    for record in records:
        cols_all['species'].append(k)
        for col in cols_worms:
            # Attributes absent from this record become NaN so all columns
            # keep the same length
            cols_all[col].append(record.get(col, np.nan))

In [None]:
# Assemble the expanded table (one row per candidate match) and preview it.
df_expanded = pd.DataFrame(data=cols_all)
df_expanded.head(n=5)

Unnamed: 0,species,AphiaID,url,scientificname,authority,status,unacceptreason,taxonRankID,rank,valid_AphiaID,...,genus,citation,lsid,isMarine,isBrackish,isFreshwater,isTerrestrial,isExtinct,match_type,modified
0,Aristeus antennatus,107083.0,https://www.marinespecies.org/aphia.php?p=taxd...,Aristeus antennatus,"(Risso, 1816)",accepted,,220.0,Species,107083.0,...,Aristeus,DecaNet eds. (2023). DecaNet. Aristeus antenna...,urn:lsid:marinespecies.org:taxname:107083,1.0,0.0,0.0,0.0,0.0,exact,2022-08-24T09:48:14.813Z
1,Apostichopus,241373.0,https://www.marinespecies.org/aphia.php?p=taxd...,Apostichopus,"Liao, 1980",accepted,,180.0,Genus,241373.0,...,Apostichopus,"WoRMS (2023). Apostichopus Liao, 1980. Accesse...",urn:lsid:marinespecies.org:taxname:241373,1.0,,0.0,0.0,,exact,2013-06-10T05:18:23.057Z
2,Saccharina japonica var. religiosa,847544.0,https://www.marinespecies.org/aphia.php?p=taxd...,Saccharina japonica var. religiosa,"(Miyabe) N.Yotsukura, S.Kawashima, T.Kawai, T....",accepted,,240.0,Variety,847544.0,...,Saccharina,"Guiry, M.D. & Guiry, G.M. (2023). AlgaeBase. W...",urn:lsid:marinespecies.org:taxname:847544,1.0,,,,,exact,2015-06-26T12:00:51.270Z
3,Siganus fuscescens,273912.0,https://www.marinespecies.org/aphia.php?p=taxd...,Siganus fuscescens,"(Houttuyn, 1782)",accepted,,220.0,Species,273912.0,...,Siganus,"Froese, R. and D. Pauly. Editors. (2023). Fish...",urn:lsid:marinespecies.org:taxname:273912,1.0,1.0,0.0,0.0,,exact,2008-01-15T17:27:08.177Z
4,Alpheus dentipes,107475.0,https://www.marinespecies.org/aphia.php?p=taxd...,Alpheus dentipes,"Guérin, 1832",accepted,,220.0,Species,107475.0,...,Alpheus,DecaNet eds. (2023). DecaNet. Alpheus dentipes...,urn:lsid:marinespecies.org:taxname:107475,1.0,0.0,0.0,0.0,0.0,exact,2022-05-13T08:15:22.193Z


In [None]:
# Cast the identifier and flag columns to pandas' nullable integer dtype so that
# missing values are kept without turning the columns into floats.
nullable_int = pd.Int64Dtype()
col_types = dict.fromkeys(
    ['AphiaID', 'taxonRankID', 'valid_AphiaID',
     'parentNameUsageID', 'isMarine', 'isBrackish',
     'isFreshwater', 'isTerrestrial', 'isExtinct'],
    nullable_int,
)
df_expanded = df_expanded.astype(col_types)

In [None]:
# Rows that repeat an already-listed species, i.e. its second and later candidates.
is_repeat = df_expanded.duplicated(subset='species')
df_expanded.loc[is_repeat].head()

Unnamed: 0,species,AphiaID,url,scientificname,authority,status,unacceptreason,taxonRankID,rank,valid_AphiaID,...,genus,citation,lsid,isMarine,isBrackish,isFreshwater,isTerrestrial,isExtinct,match_type,modified
40,Balaenoptera musculus,380449,https://www.marinespecies.org/aphia.php?p=taxd...,Balaenoptera musculus,"Van Beneden & Gervais, 1880",unaccepted,synonym,220,Species,137091.0,...,Balaenoptera,"Fordyce, E.; Perrin, W.F. (2023). World Cetace...",urn:lsid:marinespecies.org:taxname:380449,1,,,0.0,,exact,2009-12-31T18:14:43.750Z
74,Chrysaora,1379174,https://www.marinespecies.org/aphia.php?p=taxd...,Chrysaora,"Lamouroux, 1821",unaccepted,,180,Genus,1372852.0,...,Chrysaora,"WoRMS (2023). Chrysaora Lamouroux, 1821&nbsp;&...",urn:lsid:marinespecies.org:taxname:1379174,1,,,,1.0,exact,2019-09-19T12:01:24.303Z
95,Delphinus capensis,137093,https://www.marinespecies.org/aphia.php?p=taxd...,Delphinus capensis,"Gray, 1828",unaccepted,synonym,220,Species,137094.0,...,Delphinus,"Fordyce, E.; Perrin, W.F. (2023). World Cetace...",urn:lsid:marinespecies.org:taxname:137093,1,0.0,0.0,0.0,,exact,2016-05-05T05:59:43.010Z
96,Delphinus capensis,383816,https://www.marinespecies.org/aphia.php?p=taxd...,Delphinus capensis,"Rapp, 1837",nomen dubium,incertae sedis,220,Species,,...,Delphinus,"Fordyce, E.; Perrin, W.F. (2023). World Cetace...",urn:lsid:marinespecies.org:taxname:383816,1,,,0.0,,exact,2009-03-05T14:31:36.587Z
140,Holothuria,1315401,https://www.marinespecies.org/aphia.php?p=taxd...,Holothuria,"Linnaeus, 1758",unaccepted,"Treated as junior synonym of Physalia, Lamarcc...",180,Genus,135382.0,...,Holothuria,"Schuchert, P. (2023). World Hydrozoa Database....",urn:lsid:marinespecies.org:taxname:1315401,1,0.0,0.0,0.0,0.0,exact,2021-05-28T05:40:56.327Z


In [None]:
# Show every candidate row recorded for one species.
df_expanded.loc[df_expanded['species'].eq('Salmo trutta trutta')]

Unnamed: 0,species,AphiaID,url,scientificname,authority,status,unacceptreason,taxonRankID,rank,valid_AphiaID,...,genus,citation,lsid,isMarine,isBrackish,isFreshwater,isTerrestrial,isExtinct,match_type,modified
1624,Salmo trutta trutta,1416249,https://www.marinespecies.org/aphia.php?p=taxd...,Salmo trutta trutta,"Walbaum, 1792",unaccepted,,230,Subspecies,127188,...,Salmo,"Froese, R. and D. Pauly. Editors. (2023). Fish...",urn:lsid:marinespecies.org:taxname:1416249,1,1,1,0,,exact,2020-01-21T12:50:04.933Z
1625,Salmo trutta trutta,223866,https://www.marinespecies.org/aphia.php?p=taxd...,Salmo trutta trutta,"Linnaeus, 1758",unaccepted,,230,Subspecies,127187,...,Salmo,"Froese, R. and D. Pauly. Editors. (2023). Fish...",urn:lsid:marinespecies.org:taxname:223866,1,1,1,0,,exact,2023-01-17T15:33:01.533Z


### Join