# 0.1.8: Species name harmonization (hydraulic traits)

## Background

When working with ecological datasets from multiple sources, species names often vary due to:
- Different taxonomic authorities and naming conventions
- Synonyms and outdated nomenclature
- Spelling variations and author citations
- Subspecies, varieties, and cultivars vs. species-level names

The species names from GBIF, sPlot, and TRY all are harmonized against different sources, and so it is important to ensure that they are each harmonized against the same source for direct matching and comparison.


## Imports and config

In [13]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
from src.conf.environment import log

pd.set_option("display.max_columns", None)

## TRY growth forms + hydraulic traits harmonization (provided by David Schellenberger Costa)

In [21]:
dsc_fp = "data/raw/hydraulic_traits_matching_results_2025-10-03.gz"
dsc_df = pd.read_csv(dsc_fp, compression="gzip")

dsc_df.head()

  dsc_df = pd.read_csv(dsc_fp, compression="gzip")


Unnamed: 0,hydName,hydNameIn,nameIn,groName,groNameIn,authorsIn,nameIn.1,authorsIn.1,familyLCVP,statusLCVP,nameInLCVP,authorsInLCVP,nameOutLCVP,authorsOutLCVP,rowLCVP,diffGenLCVP,diffSpecLCVP,diffRestLCVP,diffAuthorsLCVP,matchTypeLCVP,timeLCVP,familyWCVP,statusWCVP,nameInWCVP,authorsInWCVP,nameOutWCVP,authorsOutWCVP,rowWCVP,diffGenWCVP,diffSpecWCVP,diffRestWCVP,diffAuthorsWCVP,matchTypeWCVP,timeWCVP,familyWFO,statusWFO,nameInWFO,authorsInWFO,nameOutWFO,authorsOutWFO,rowWFO,diffGenWFO,diffSpecWFO,diffRestWFO,diffAuthorsWFO,matchTypeWFO,timeWFO,familyWP,statusWP,nameInWP,authorsInWP,nameOutWP,authorsOutWP,rowWP,diffGenWP,diffSpecWP,diffRestWP,diffAuthorsWP,matchTypeWP,timeWP,nameIn.2,authorsIn.2,GBIFKeyGBIF,searchTypeGBIF,synonymGBIF,matchNameGBIF,matchedScientificNameGBIF,matchedCanonicalNameGBIF,matchedAuthorshipGBIF,diffGenGBIF,diffSpecGBIF,diffRestGBIF,diffAuthorsGBIF,timeGBIF,scientificNameGBIF,canonicalNameGBIF,rankGBIF,authorshipGBIF,familyGBIF,orderGBIF,classGBIF,phylumGBIF,kingdomGBIF
0,abarema abbottii,Abarema abbottii,Abarema abbottii,,,,Abarema abbottii,,Fabaceae,synonym,Abarema abbottii,(Rose & Leonard) Barneby & J.W.Grimes,Jupunba abbottii,(Rose & Leonard) Britton & Rose,144.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema abbottii,(Rose & Leonard) Barneby & J.W.Grimes,Jupunba abbottii,(Rose & Leonard) Britton & Rose,149.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema abbottii,(Rose & Leonard) Barneby & J.W.Grimes,Jupunba abbottii,(Rose & Leonard) Britton & Rose,156.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema abbottii,(Rose & Leonard) Barneby & J.W.Grimes,Jupunba abbottii,(Rose & Leonard) Britton & Rose,157.0,0.0,0.0,0.0,,1.0,0.0,Abarema abbottii,,2977834.0,1.0,True,Abarema abbottii,Abarema abbottii (Rose & Leonard) Barneby & J....,Abarema abbottii,(Rose & Leonard) Barneby & J.W.Grimes,0.0,0.0,0.0,,0.33,Jupunba abbottii (Rose & Leonard) Britton & Rose,Jupunba abbottii,SPECIES,(Rose & Leonard) Britton & Rose,Fabaceae,Fabales,Magnoliopsida,Tracheophyta,Plantae
1,abarema cochliocarpos,Abarema cochliocarpos,Abarema cochliocarpos,,,,Abarema cochliocarpos,,Fabaceae,accepted,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,172.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,accepted,Abarema cochliacarpos,(Gomes) Barneby & J.W.Grimes,Abarema cochliacarpos,(Gomes) Barneby & J.W.Grimes,179.0,0.0,1.666667,0.0,,4.0,6.51,Fabaceae,accepted,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,189.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,accepted,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,187.0,0.0,0.0,0.0,,1.0,0.0,Abarema cochliocarpos,,2977867.0,1.0,False,Abarema cochliocarpos,Abarema cochliocarpos (Gomes) Barneby & J.W.Gr...,Abarema cochliocarpos,(Gomes) Barneby & J.W.Grimes,0.0,0.0,0.0,,0.37,Abarema cochliocarpos (Gomes) Barneby & J.W.Gr...,Abarema cochliocarpos,SPECIES,(Gomes) Barneby & J.W.Grimes,Fabaceae,Fabales,Magnoliopsida,Tracheophyta,Plantae
2,abarema ganymedea,Abarema ganymedea,Abarema ganymedea,,,,Abarema ganymedea,,Fabaceae,synonym,Abarema ganymedea,Barneby & J.W.Grimes,Jupunba ganymedea,(Barneby & J.W.Grimes) M.V.B.Soares & al.,191.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema ganymedea,Barneby & J.W.Grimes,Jupunba ganymedea,"(Barneby & J.W.Grimes) M.V.B.Soares, M.P.Morim...",197.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema ganymedea,Barneby & J.W.Grimes,Jupunba ganymedea,"(Barneby & J.W.Grimes) M.V.B.Soares, M.P.Morim...",208.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema ganymedea,Barneby & J.W.Grimes,Jupunba ganymedea,"(Barneby & J.W.Grimes) M.V.B.Soares, M.P.Morim...",205.0,0.0,0.0,0.0,,1.0,0.0,Abarema ganymedea,,11456706.0,1.0,True,Abarema ganymedea,Abarema ganymedea Barneby & J.W.Grimes,Abarema ganymedea,Barneby & J.W.Grimes,0.0,0.0,0.0,,0.36,Jupunba ganymedea (Barneby & J.W.Grimes) M.V.B...,Jupunba ganymedea,SPECIES,"(Barneby & J.W.Grimes) M.V.B.Soares, M.P.Morim...",Fabaceae,Fabales,Magnoliopsida,Tracheophyta,Plantae
3,abarema idiopoda,Abarema idiopoda,Abarema idiopoda,,,,Abarema idiopoda,,Fabaceae,synonym,Abarema idiopoda,(S.F.Blake) Barneby & J.W.Grimes,Jupunba idiopoda,(S.F.Blake) M.V.B.Soares & al.,199.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema idiopoda,(S.F.Blake) Barneby & J.W.Grimes,Jupunba idiopoda,"(S.F.Blake) M.V.B.Soares, M.P.Morim & Iganci",205.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema idiopoda,(S.F.Blake) Barneby & J.W.Grimes,Jupunba idiopoda,"(S.F.Blake) M.V.B.Soares, M.P.Morim & Iganci",216.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,synonym,Abarema idiopoda,(S.F.Blake) Barneby & J.W.Grimes,Jupunba idiopoda,"(S.F.Blake) M.V.B.Soares, M.P.Morim & Iganci",214.0,0.0,0.0,0.0,,1.0,0.0,Abarema idiopoda,,11430363.0,1.0,True,Abarema idiopoda,Abarema idiopoda (S.F.Blake) Barneby & J.W.Grimes,Abarema idiopoda,(S.F.Blake) Barneby & J.W.Grimes,0.0,0.0,0.0,,0.37,"Jupunba idiopoda (S.F.Blake) M.V.B.Soares, M.P...",Jupunba idiopoda,SPECIES,"(S.F.Blake) M.V.B.Soares, M.P.Morim & Iganci",Fabaceae,Fabales,Magnoliopsida,Tracheophyta,Plantae
4,abarema levelii,Abarema levelii,Abarema levelii,,,,Abarema levelii,,Fabaceae,accepted,Abarema levelii,(Cowan) Barneby & J.W.Grimes,Abarema levelii,(Cowan) Barneby & J.W.Grimes,216.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,accepted,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,221.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,accepted,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,233.0,0.0,0.0,0.0,,1.0,0.0,Fabaceae,accepted,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,230.0,0.0,0.0,0.0,,1.0,0.0,Abarema levelii,,2977916.0,1.0,False,Abarema levelii,Abarema levelii (R.S.Cowan) Barneby & J.W.Grimes,Abarema levelii,(R.S.Cowan) Barneby & J.W.Grimes,0.0,0.0,0.0,,0.37,Abarema levelii (R.S.Cowan) Barneby & J.W.Grimes,Abarema levelii,SPECIES,(R.S.Cowan) Barneby & J.W.Grimes,Fabaceae,Fabales,Magnoliopsida,Tracheophyta,Plantae


In [11]:
dsc_df.columns

Index(['hydName', 'hydNameIn', 'nameIn', 'groName', 'groNameIn', 'authorsIn',
       'nameIn.1', 'authorsIn.1', 'familyLCVP', 'statusLCVP', 'nameInLCVP',
       'authorsInLCVP', 'nameOutLCVP', 'authorsOutLCVP', 'rowLCVP',
       'diffGenLCVP', 'diffSpecLCVP', 'diffRestLCVP', 'diffAuthorsLCVP',
       'matchTypeLCVP', 'timeLCVP', 'familyWCVP', 'statusWCVP', 'nameInWCVP',
       'authorsInWCVP', 'nameOutWCVP', 'authorsOutWCVP', 'rowWCVP',
       'diffGenWCVP', 'diffSpecWCVP', 'diffRestWCVP', 'diffAuthorsWCVP',
       'matchTypeWCVP', 'timeWCVP', 'familyWFO', 'statusWFO', 'nameInWFO',
       'authorsInWFO', 'nameOutWFO', 'authorsOutWFO', 'rowWFO', 'diffGenWFO',
       'diffSpecWFO', 'diffRestWFO', 'diffAuthorsWFO', 'matchTypeWFO',
       'timeWFO', 'familyWP', 'statusWP', 'nameInWP', 'authorsInWP',
       'nameOutWP', 'authorsOutWP', 'rowWP', 'diffGenWP', 'diffSpecWP',
       'diffRestWP', 'diffAuthorsWP', 'matchTypeWP', 'timeWP', 'nameIn.2',
       'authorsIn.2', 'GBIFKeyGBIF', 'searchTy

In [None]:
cols = {
    "hydNameIn": "string[pyarrow]",  # Hydraulic trait name
    "groNameIn": "string[pyarrow]",  # Growth form name
    "nameOutWFO": "string[pyarrow]",  # WFO name
    "GBIFKeyGBIF": pd.Int32Dtype(),  # GBIF key
    "nameOutWCVP": "string[pyarrow]",  # WCVP name
}

dsc_df = pd.read_csv(dsc_fp, compression="gzip", usecols=cols.keys(), dtype=cols)

## Harmonize GBIF species

In [None]:
all_species_map = dsc_df.copy()

gbif_fp = "data/raw/all_tracheophyta_non-cult_2024-04-10/all_tracheophyta_non-cult_2024-04-10.parquet/*"

gbif_cols = {
    "specieskey": pd.Int32Dtype(),
    "taxonrank": "string[pyarrow]",
}

gbif_species = (
    dd.read_parquet(gbif_fp, columns=list(gbif_cols.keys()))
    .astype(gbif_cols)
    .query("taxonrank == 'SPECIES'")
    .drop(columns=["taxonrank"])
    .drop_duplicates(subset=["specieskey"])
    .dropna()
    .compute()
)

gbif_species.head(3)

Unnamed: 0,specieskey
56,5389017
93,2855398
163,2927245


In [40]:
gbif_species.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 322333 entries, 56 to 99639
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   specieskey  322333 non-null  Int32
dtypes: Int32(1)
memory usage: 4.0 MB


In [50]:
gbif_matched = pd.merge(
    all_species_map.dropna(subset=["groNameIn"]),
    gbif_species,
    left_on="GBIFKeyGBIF",
    right_on="specieskey",
    how="inner",
)

print(
    f"Matched {gbif_matched.shape[0]} of {all_species_map.shape[0]} hydraulic species "
    f"({(gbif_matched.shape[0] / all_species_map.shape[0]) * 100:.2f}%)"
)

Matched 192812 of 288698 hydraulic species (66.79%)


## Harmonize sPlot species

In [45]:
splot_fp = "data/interim/splot/extracted/vegetation.parquet"

splot_cols = {"Species": "string[pyarrow]"}

splot_species = (
    pd.read_parquet(splot_fp, columns=list(splot_cols.keys()))
    .astype(splot_cols)
    .drop_duplicates(subset=["Species"])
    .dropna()
)

In [54]:
print(f"sPlot species: {splot_species.shape[0]}")
splot_matched = pd.merge(
    all_species_map.dropna(subset=["groNameIn"]),
    splot_species,
    left_on="nameOutWCVP",
    right_on="Species",
    how="inner",
)

print(
    f"Matched {splot_matched.shape[0]} of {all_species_map.shape[0]} hydraulic species "
    f"({(splot_matched.shape[0] / all_species_map.shape[0]) * 100:.2f}%)"
)


sPlot species: 116086
Matched 72996 of 288698 hydraulic species (25.28%)
