In [12]:
import pandas as pd
import numpy as np
from pyobistools.validation.check_scientificname_and_ids import check_scientificname_and_ids
import nest_asyncio
# nest_asyncio.apply() patches asyncio so that an event loop can be re-entered (nested).
# NOTE(review): presumably needed because the notebook kernel already runs an event loop
# while check_scientificname_and_ids issues asynchronous requests — confirm.
nest_asyncio.apply()
# Show the full content of every cell in displayed DataFrames (no width truncation),
# so long values such as LSIDs and URLs stay readable.
# The fully qualified option key is used on purpose: abbreviated keys such as
# 'max_colwidth' are resolved by pattern matching and can become ambiguous if a
# future pandas version adds another option with a similar name.
pd.set_option('display.max_colwidth', None)
# Short alias for numpy's NaN value.
NaN = np.nan

### Info about this notebook series

This series of notebooks is meant to serve as an educational tool for learning how to use the Pyobistools biodiversity data validation package: https://github.com/cioos-siooc/pyobistools

Command to install Pyobistools (currently not hosted on PyPI):
+ pip install git+https://github.com/cioos-siooc/pyobistools@main#egg=pyobistools

Darwin Core documentation: 
+ https://dwc.tdwg.org/

Darwin Core file types required fields: 
+ https://ioos.github.io/bio_mobilization_workshop/01-introduction/index.html
+ https://ioos.github.io/bio_mobilization_workshop/04-create-schema/index.html

### Notebook to test Pyobistools' function 'check_scientificname_and_ids'

##### Function 'check_scientificname_and_ids' description
This function is used to evaluate the validity of scientific names, scientific name IDs and taxon ranks against the WoRMS and ITIS databases.

##### Function 'check_scientificname_and_ids' arguments
+ data: Dataframe of the data to evaluate
+ value: Type of analysis to run for each row in the dataset 
    + 'names': Analyzes the validity of scientific names with WoRMS and ITIS (ITIS optional)
    + 'names_ids': Same as above, plus analyzes scientific name IDs
    + 'names_taxons_ids': Same as above, plus analyzes taxon ranks
+ itis_usage (default = False): Option to also validate the data with ITIS when the WoRMS service does not return a positive answer for a given scientific name

##### Results
+ The validation section of the result tables compares the scientific name, taxon rank and scientific name ID values from the dataset with the corresponding values from the databases. If the respective columns hold the same value, 'Oui/Yes' is displayed; otherwise 'Non/No' is displayed.

Load different types of DWC files:

In [13]:
# Read the occurrence-core test dataset and preview its first five rows
# (DataFrame.head() returns five rows by default).
occurrence_core = pd.read_csv(
    'test_occurrence_core_check_scientificname_and_ids.csv'
)
occurrence_core.head()

Unnamed: 0,datasetID,occurrenceID,eventDate,countryCode,location,decimalLatitude,decimalLongitude,accuracyInMeters,kingdom,commonName,...,acceptedNameUsage,acceptedNameUsageID,taxonID,taxon_id,iNaturalistID,taxonRank,occurrenceStatus,basisOfRecord,quality_grade,url
0,ABCD-2021,ABCD-001,2021-08-11 06:02:26,CA,Randolph Island,45.268169,-66.122442,8,Plantae,reed canary grass,...,Phalaris arundinacea,urn:lsid:itis.gov:itis_tsn:41335,https://www.gbif.org/species/5289756,63337,90776330,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776330
1,ABCD-2021,ABCD-002,2021-08-11 06:03:12,CA,Randolph Island,45.268148,-66.122424,9,Plantae,prairie cordgrass,...,Sporobolus michauxianus,urn:lsid:ipni.org:names:77145291-1,https://www.gbif.org/species/9592414,772984,90776368,Species,present,HumanObservation,needs_id,https://www.inaturalist.org/observations/90776368
2,ABCD-2021,ABCD-003,2021-08-11 06:04:32,CA,Randolph Island,45.26814,-66.12222,8,Plantae,three-square bulrush,...,Schoenoplectus pungens,urn:lsid:itis.gov:itis_tsn:508146,https://www.gbif.org/species/2711190,59072,90776406,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776406
3,ABCD-2021,ABCD-004,2021-08-11 06:07:02,CA,Randolph Island,45.268185,-66.12205,9,Plantae,red pigweed,...,Oxybasis rubra,urn:lsid:ipni.org:names:77121013-1,https://www.gbif.org/species/7725469,931679,90776443,Species,present,HumanObservation,needs_id,https://www.inaturalist.org/observations/90776443
4,ABCD-2021,ABCD-005,2021-08-11 06:08:36,CA,Randolph Island,45.268295,-66.121987,8,Plantae,rough cocklebur,...,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089,https://www.gbif.org/species/3089154,57920,90776465,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776465


In [14]:
# Read the occurrence-extension test dataset and preview its first five rows
# (DataFrame.head() returns five rows by default).
occurrence_extension = pd.read_csv(
    'test_occurrence_extension_check_scientificname_and_ids.csv'
)
occurrence_extension.head()

Unnamed: 0,eventID,occurrenceID,scientificName,aphiaID,scientificNameID,taxonRank,occurrenceStatus,organismQuantityType,organismQuantity,basisofRecord
0,0003_ABCD_BE10C1,0003_ABCD_BE10C1_1,Mesodesma arctatum,156805.0,urn:lsid:marinespecies.org:taxname:156805,species,present,Individual count,2,HumanObservation
1,0003_ABCD_BE10C1,0003_ABCD_BE10C1_2,Nephtys ciliata,130356.0,urn:lsid:marinespecies.org:taxname:130356,species,present,Individual count,1,HumanObservation
2,0003_ABCD_BE10C2,0003_ABCD_BE10C2_3,Leucon nasicoides,148682.0,urn:lsid:marinespecies.org:taxname:148682,species,present,Individual count,2,HumanObservation
3,0003_ABCD_BE10C2,0003_ABCD_BE10C2_4,Mesodesma arctatum,156805.0,urn:lsid:marinespecies.org:taxname:156805,species,present,Individual count,5,HumanObservation
4,0003_ABCD_BE10C2,0003_ABCD_BE10C2_5,Nephtys ciliata,130356.0,urn:lsid:marinespecies.org:taxname:130356,species,present,Individual count,1,HumanObservation


Try the check_scientificname_and_ids function:

In [15]:
# Validate only the scientific names of the occurrence-core dataset,
# then display the resulting table as the cell output.
core_names_result = check_scientificname_and_ids(occurrence_core, 'names')
core_names_result

0 : 204: Worms Apocynum cannabinum 
3 : 204: Worms Hylaeus 
1 : 204: Worms Scutellaria lateriflora 
4 : 200: Worms Tracheophyta 
9 : 200: Worms Typha latifolia 
8 : 204: Worms Euphorbia maculata 
6 : 200: Worms Juncus 
45 : 204: Worms Halerpestes cymbalaria 
14 : 204: Worms Pastinaca sativa 
5 : 204: Worms Sium suave 
44 : 200: Worms Vicia cracca 
13 : 200: Worms Elymus virginicus 
12 : 204: Worms Sporobolus michauxianus 
32 : 204: Worms Ilex verticillata 
73 : 204: Worms Harmonia axyridis 
26 : 200: Worms Galium mollugo 
2 : 200: Worms Doellingeria umbellata 
10 : 200: Worms Jacobaea vulgaris 
31 : 204: Worms Spiraea alba 
34 : 204: Worms Phalaris arundinacea 
18 : 204: Worms Alisma triviale 
33 : 200: Worms Schoenoplectus pungens 
23 : 200: Worms Leucanthemum vulgare 
16 : 204: Worms Persicaria sagittata 
41 : 204: Worms Linaria vulgaris 
36 : 204: Worms Convallaria majalis 
53 : 204: Worms Persicaria amphibia 
39 : 204: Worms Betula papyrifera 
28 : 200: Worms Anthoxanthum odoratum 

Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,scientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
38,Ulva intestinalis,Oui/Yes,234471,accepted,,Species,234471,Ulva intestinalis,urn:lsid:marinespecies.org:taxname:234471
21,Vicia,Oui/Yes,416135,accepted,,Genus,416135,Vicia,urn:lsid:marinespecies.org:taxname:416135
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127
4,Xanthium strumarium,Oui/Yes,1092089,accepted,,Species,1092089,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089


In [16]:
# Validate only the scientific names of the occurrence-extension dataset,
# then display the resulting table as the cell output.
extension_names_result = check_scientificname_and_ids(occurrence_extension, 'names')
extension_names_result

1 : 200: Worms Littorina obtusata 
10 : 200: Worms Idotea balthica 
13 : 200: Worms Microspio 
9 : 200: Worms Oediceros borealis 
14 : 200: Worms Eteone longa 
16 : 200: Worms Hydrobia minuta 
4 : 200: Worms Nereis diversicolor 
17 : 200: Worms Mesodesma arctatum 
6 : 200: Worms Macoma balthica 
11 : 200: Worms Mya arenaria 
5 : 200: Worms Nephtys ciliata 
7 : 200: Worms Littorina saxatilis 
2 : 200: Worms Leucon nasicoides 
3 : 200: Worms Mytilus edulis 
15 : 200: Worms Psammonyx nobilis 
12 : 200: Worms Gammarus oceanicus 
8 : 200: Worms Nematoda 
18 : 200: Worms Oligochaeta 


Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,scientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
11,Hydrobia minuta,Non/No,152020.0,unaccepted,preoccupied name,Species,574096.0,Ecrobia truncata,urn:lsid:marinespecies.org:taxname:152020
2,Leucon nasicoides,Non/No,148682.0,alternate representation,,Species,110619.0,Leucon (Leucon) nasicoides,urn:lsid:marinespecies.org:taxname:148682
8,Nereis diversicolor,Non/No,340537.0,deleted,AphiaID resurrection,,152302.0,Hediste diversicolor,urn:lsid:marinespecies.org:taxname:340537
4,Psammonyx nobilis,Non/No,158140.0,unaccepted,superseded recombination,Species,1255501.0,Wecomedon nobilis,urn:lsid:marinespecies.org:taxname:158140
9,,Oui/Yes,,,,,,,
6,Eteone longa,Oui/Yes,130616.0,accepted,,Species,130616.0,Eteone longa,urn:lsid:marinespecies.org:taxname:130616
15,Gammarus oceanicus,Oui/Yes,102285.0,accepted,,Species,102285.0,Gammarus oceanicus,urn:lsid:marinespecies.org:taxname:102285
18,Idotea balthica,Oui/Yes,119039.0,accepted,,Species,119039.0,Idotea balthica,urn:lsid:marinespecies.org:taxname:119039
12,Littorina obtusata,Oui/Yes,140263.0,accepted,,Species,140263.0,Littorina obtusata,urn:lsid:marinespecies.org:taxname:140263
17,Littorina saxatilis,Oui/Yes,445895.0,unaccepted,,Species,140264.0,Littorina saxatilis,urn:lsid:marinespecies.org:taxname:445895


In [18]:
# The 'names_ids' analysis returns two tables: keep the pair, then unpack it.
names_ids_tables = check_scientificname_and_ids(occurrence_core, 'names_ids')
table1, table2 = names_ids_tables
# Display the first table (one row per scientific name).
table1

16 : 204: Worms Persicaria sagittata 
58 : 204: Worms Epalpus signifer 
63 : 200: Worms Bryophyta 
36 : 204: Worms Convallaria majalis 
72 : 200: Worms Ranunculus repens 
43 : 200: Worms Persicaria 
51 : 204: Worms Calystegia sepium 
78 : 200: Worms Glyceria 
42 : 200: Worms Juncus gerardii 
68 : 204: Worms Impatiens capensis 
48 : 200: Worms Erechtites hieraciifolius 
70 : 204: Worms Matteuccia struthiopteris 
47 : 200: Worms Centaurea nigra 
12 : 204: Worms Sporobolus michauxianus 
14 : 204: Worms Pastinaca sativa 
2 : 200: Worms Doellingeria umbellata 
122 : 204: Worms Lysimachia ciliata 
84 : 204: Worms Arisaema triphyllum 
93 : 204: Worms Acorus americanus 
18 : 204: Worms Alisma triviale 
86 : 204: Worms Thalictrum pubescens 
85 : 200: Worms Cirsium arvense 
135 : 200: Worms Zizania 
126 : 200: Worms Typha 
110 : 204: Worms Amphicarpaea bracteata 
62 : 204: Worms Stachys palustris 
83 : 204: Worms Chelone glabra 
95 : 200: Worms Carex pallescens 
90 : 200: Worms Centaurea jacea 


Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,scientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
38,Ulva intestinalis,Oui/Yes,234471,accepted,,Species,234471,Ulva intestinalis,urn:lsid:marinespecies.org:taxname:234471
21,Vicia,Oui/Yes,416135,accepted,,Genus,416135,Vicia,urn:lsid:marinespecies.org:taxname:416135
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127
4,Xanthium strumarium,Oui/Yes,1092089,accepted,,Species,1092089,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089


In [19]:
# Display the second table from the 'names_ids' run: one row per occurrenceID,
# with scientificName and scientificNameID validation columns.
table2

Unnamed: 0_level_0,Ref. ID,Validation,Validation,Dataset Values,Dataset Values,Database values,Database values
Unnamed: 0_level_1,occurrenceID,scientificName_Validation,scientificNameID_Validation,scientificName,scientificNameID,Valid_Name,LSID
0,ABCD-001,Non/No,Non/No,Phalaris arundinacea,urn:lsid:itis.gov:itis_tsn:41335,,
1,ABCD-002,Non/No,Non/No,Sporobolus michauxianus,urn:lsid:ipni.org:names:77145291-1,,
3,ABCD-004,Non/No,Non/No,Oxybasis rubra,urn:lsid:ipni.org:names:77121013-1,,
5,ABCD-006,Non/No,Non/No,Linaria vulgaris,urn:lsid:itis.gov:itis_tsn:33216,,
11,ABCD-012,Non/No,Non/No,Halerpestes cymbalaria,,,
...,...,...,...,...,...,...,...
288,ABCD-289,Oui/Yes,Oui/Yes,Schoenoplectus,urn:lsid:marinespecies.org:taxname:382429,Schoenoplectus,urn:lsid:marinespecies.org:taxname:382429
290,ABCD-291,Oui/Yes,Oui/Yes,Rosa,urn:lsid:marinespecies.org:taxname:425714,Rosa,urn:lsid:marinespecies.org:taxname:425714
294,ABCD-295,Oui/Yes,Oui/Yes,Tussilago farfara,urn:lsid:marinespecies.org:taxname:594804,Tussilago farfara,urn:lsid:marinespecies.org:taxname:594804
296,ABCD-297,Oui/Yes,Oui/Yes,Galium,urn:lsid:marinespecies.org:taxname:993892,Galium,urn:lsid:marinespecies.org:taxname:993892


In [20]:
# The 'names_taxons_ids' analysis also returns two tables: keep the pair, then unpack it.
names_taxons_ids_tables = check_scientificname_and_ids(occurrence_core, 'names_taxons_ids')
table3, table4 = names_taxons_ids_tables

4 : 200: Worms Tracheophyta 
13 : 200: Worms Elymus virginicus 
0 : 204: Worms Apocynum cannabinum 
27 : 204: Worms Viburnum opulus 
17 : 204: Worms Lycopus uniflorus 
30 : 200: Worms Bidens 
20 : 200: Worms Xanthium strumarium 
69 : 200: Worms Schoenoplectus tabernaemontani 
8 : 204: Worms Euphorbia maculata 
7 : 204: Worms Berberis thunbergii 
16 : 204: Worms Persicaria sagittata 
3 : 204: Worms Hylaeus 
5 : 204: Worms Sium suave 
6 : 200: Worms Juncus 
26 : 200: Worms Galium mollugo 
10 : 200: Worms Jacobaea vulgaris 
24 : 200: Worms Agrostis gigantea 
14 : 204: Worms Pastinaca sativa 
28 : 200: Worms Anthoxanthum odoratum 
11 : 200: Worms Tanacetum vulgare 
19 : 200: Worms Lythrum salicaria 
25 : 204: Worms Crataegus 
29 : 200: Worms Raphanus raphanistrum 
15 : 200: Worms Scirpus 
44 : 200: Worms Vicia cracca 
22 : 204: Worms Hylotelephium telephium 
23 : 200: Worms Leucanthemum vulgare 
1 : 204: Worms Scutellaria lateriflora 
80 : 204: Worms Alnus alnobetula 
70 : 204: Worms Matte

In [21]:
# Display the first table from the 'names_taxons_ids' run (one row per scientific name).
table3

Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,scientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
72,Tanacetum vulgare,Oui/Yes,993749,accepted,,Species,993749,Tanacetum vulgare,urn:lsid:marinespecies.org:taxname:993749
109,Thuja occidentalis,Oui/Yes,1634575,accepted,,Species,1634575,Thuja occidentalis,urn:lsid:marinespecies.org:taxname:1634575
106,Tracheophyta,Oui/Yes,596326,accepted,,Phylum (Division),596326,Tracheophyta,urn:lsid:marinespecies.org:taxname:596326
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127


In [22]:
# Display the second table from the 'names_taxons_ids' run: one row per occurrenceID,
# with scientificName, taxonRank and scientificNameID validation columns.
table4

Unnamed: 0_level_0,Ref. ID,Validation,Validation,Validation,Dataset Values,Dataset Values,Dataset Values,Database values,Database values,Database values
Unnamed: 0_level_1,occurrenceID,scientificName_Validation,taxonRank_Validation,scientificNameID_Validation,scientificName,taxonRank,scientificNameID,Valid_Name,Taxon_Rank,LSID
0,ABCD-001,Non/No,Non/No,Non/No,Phalaris arundinacea,Species,urn:lsid:itis.gov:itis_tsn:41335,,,
1,ABCD-002,Non/No,Non/No,Non/No,Sporobolus michauxianus,Species,urn:lsid:ipni.org:names:77145291-1,,,
2,ABCD-003,Non/No,Non/No,Non/No,Schoenoplectus pungens,Species,urn:lsid:itis.gov:itis_tsn:508146,,,
3,ABCD-004,Non/No,Non/No,Non/No,Oxybasis rubra,Species,urn:lsid:ipni.org:names:77121013-1,,,
5,ABCD-006,Non/No,Non/No,Non/No,Linaria vulgaris,Species,urn:lsid:itis.gov:itis_tsn:33216,,,
...,...,...,...,...,...,...,...,...,...,...
260,ABCD-261,Oui/Yes,Oui/Yes,Oui/Yes,Scirpus,Genus,urn:lsid:marinespecies.org:taxname:431943,Scirpus,Genus,urn:lsid:marinespecies.org:taxname:431943
262,ABCD-263,Oui/Yes,Oui/Yes,Oui/Yes,Vicia cracca,Species,urn:lsid:marinespecies.org:taxname:1492127,Vicia cracca,Species,urn:lsid:marinespecies.org:taxname:1492127
267,ABCD-268,Oui/Yes,Oui/Yes,Oui/Yes,Lythrum salicaria,Species,urn:lsid:marinespecies.org:taxname:594786,Lythrum salicaria,Species,urn:lsid:marinespecies.org:taxname:594786
269,ABCD-270,Oui/Yes,Oui/Yes,Oui/Yes,Anthoxanthum odoratum,Species,urn:lsid:marinespecies.org:taxname:993846,Anthoxanthum odoratum,Species,urn:lsid:marinespecies.org:taxname:993846


Try the check_scientificname_and_ids function - itis_usage:

In [None]:
# With itis_usage=True the function can take a few minutes to run, so the call
# below is left commented out. Uncomment it to test the 'itis_usage' parameter.
#check_scientificname_and_ids(occurrence_core, 'names', itis_usage=True).head()