# MARIS to WoRMS
Reconcile MARIS species names with their counterparts in WoRMS (World Register of Marine Species).

In [None]:
import pandas as pd
from pathlib import Path
from fastcore.xtras import save_pickle, load_pickle
from marisco.utils import match_worms
from tqdm import tqdm

## Load MARIS Excel lookup table

In [None]:
# Read the MARIS species lookup table from its Excel export.
df_species = pd.read_excel(io='../files/lut/dbo_species.xlsx')

In [None]:
# Preview the first rows of the lookup table.
df_species.head()

## Query WoRMS db

In [None]:
# Map every MARIS species name to whatever `match_worms` returns for it.
maris2worms = {}

for species in tqdm(df_species['species'].values):
    maris2worms[species] = match_worms(species)

# Discard the 'Not available' placeholder entry. `pop` with a default is used
# instead of `del` so the cell does not raise KeyError when the lookup table
# contains no such row.
maris2worms.pop('Not available', None)

In [None]:
# Spot-check the stored result for a single species.
maris2worms['Siganus fuscescens']

### Save or load

In [None]:
# Cache of the query results: an empty `maris2worms` is refilled from the
# pickle, a populated one is written out to it.
fname_dump = Path('../files/dump').joinpath('maris2worms.pkl')
if not maris2worms:
    maris2worms = load_pickle(fname_dump)
else:
    save_pickle(fname_dump, maris2worms)

## Summary stats

In [None]:
# Column-oriented accumulator: one list per output column, one entry per
# MARIS species.
results = {'maris_species': [], 'n_matches': [],
           'aphiaid': [], 'status': [], 'worms_species': []}

for name, response in maris2worms.items():
    if response == -1:
        # -1 marks a species without any match.
        row = (name, 0, -1, None, None)
    else:
        candidates = response[0]
        # Only the first candidate is summarised; the count records how many
        # candidates there were in total.
        first = candidates[0]
        row = (name, len(candidates), first['AphiaID'],
               first['status'], first['scientificname'])
    # `row` is ordered like the keys of `results`.
    for column, value in zip(results, row):
        results[column].append(value)

In [None]:
# Turn the per-species summary into a DataFrame and preview it.
df = pd.DataFrame(results)
df.head()

In [None]:
# Species for which no match was recorded.
df_missing = df.loc[df['n_matches'].eq(0)]
print(f'# missing: {len(df_missing)}')
df_missing.head(20)

In [None]:
# Species for which more than one candidate match was recorded.
df_multi = df[df['n_matches'] > 1]
# The label previously read '# missing' (copied from the cell above), which
# mislabelled this count.
print(f'# multiple matches: {len(df_multi)}')
df_multi.head(20)

In [None]:
# Species whose first match carries the status 'accepted'.
df_accepted = df.loc[df['status'].eq('accepted')]
print(f'# accepted: {len(df_accepted)}')
df_accepted.head(20)

## Excel LUT expansion

In [None]:
import numpy as np

In [None]:
# Number of candidate matches stored for 'Chrysaora' (length of the first
# element of its result).
len(maris2worms['Chrysaora'][0])

In [None]:
# Expand the match results into a flat table: one row per (species, candidate
# match) pair, plus one all-missing row for each unmatched species.

# The keys of one known matched record serve as the template for the WoRMS
# columns.
cols_worms = {k: [] for k in maris2worms['Siganus fuscescens'][0][0].keys()}
cols_all = {'species': [], 'isDuplicated': []}
cols_all.update(cols_worms)

for k, response in maris2worms.items():
    # -1 marks an unmatched species: emit a single row with no WoRMS values.
    matches = [{}] if response == -1 else response[0]
    # Flag species that come back with more than one candidate.
    is_duplicated = 1 if len(matches) > 1 else 0
    for match in matches:
        cols_all['species'].append(k)
        cols_all['isDuplicated'].append(is_duplicated)
        # Fill by template column rather than by the record's own keys, so
        # every column grows by exactly one value per row: a key missing from
        # a record becomes NaN, and a key absent from the template is ignored
        # instead of raising KeyError.
        for col in cols_worms:
            cols_all[col].append(match.get(col, np.nan))

# Nullable integer dtype so these columns stay integral even though the
# unmatched rows hold missing values.
col_types = {c: pd.Int64Dtype()
             for c in ['AphiaID', 'taxonRankID', 'valid_AphiaID',
                       'parentNameUsageID', 'isMarine', 'isBrackish',
                       'isFreshwater', 'isTerrestrial', 'isExtinct']}

df_expanded = pd.DataFrame(cols_all)
df_expanded = df_expanded.astype(col_types)

In [None]:
# Preview the expanded table.
df_expanded.head()

In [None]:
# Rows whose species already appeared earlier in the table, i.e. the second
# and later candidates of a multi-match species.
df_expanded.loc[df_expanded.duplicated(subset='species')].head()

In [None]:
# Inspect every row recorded for one particular species.
df_expanded.loc[df_expanded['species'].eq('Salmo trutta trutta')]

### Join

In [None]:
# Attach the original lookup-table columns to the expanded WoRMS rows.
# A right join keeps every row of `df_expanded`.
df = df_species.merge(df_expanded, on='species', how='right')
df

In [None]:
# List the columns of the joined table.
df.columns

In [None]:
# Species flagged as having several candidate matches, with their AphiaIDs.
df.loc[df['isDuplicated'] == 1, ['species', 'isDuplicated', 'AphiaID']]

In [None]:
# Write the joined table to Excel, leaving out the DataFrame index.
fname_dump = Path('../files/dump').joinpath('dbo_species_expanded.xlsx')
df.to_excel(excel_writer=fname_dump, index=False)