In [12]:
# Notebook setup: every import in one place, followed by one-time configuration.
import nest_asyncio
import numpy as np
import pandas as pd
from ckanapi import RemoteCKAN
from pyobistools.validation.check_scientificname_and_ids import check_scientificname_and_ids

# Short alias for the missing-value marker.
NaN = np.nan

# NOTE(review): presumably required because the validation function runs
# asynchronous requests inside the notebook's already-running event loop -- confirm.
nest_asyncio.apply()

# Do not truncate long cell values (e.g. LSID strings, URLs) when displaying frames.
pd.set_option('display.max_colwidth', None)

### Info about this notebook series

This series of notebooks is meant to serve as an educational tool for learning how to use the Pyobistools biodiversity data validation package:
+ https://github.com/cioos-siooc/pyobistools

Command to install Pyobistools (currently not hosted on PyPI):
+ pip install git+https://github.com/cioos-siooc/pyobistools@main#egg=pyobistools

Darwin Core documentation: 
+ https://dwc.tdwg.org/

Darwin Core file types required fields: 
+ https://ioos.github.io/bio_mobilization_workshop/01-introduction/index.html
+ https://ioos.github.io/bio_mobilization_workshop/04-create-schema/index.html

### Notebook to test Pyobistools' function 'check_scientificname_and_ids'

##### Function 'check_scientificname_and_ids' description
This function is used to evaluate the validity of scientific names, scientific name IDs and taxon ranks against the Worms and Itis databases.

##### Function 'check_scientificname_and_ids' arguments
+ data: Dataframe of the data to evaluate
+ value: Type of analysis to run for each row in the dataset 
    + 'names': Analyzes scientific names validity with Worms and Itis (Itis optional)
    + 'names_ids': Above plus analyzes scientific name Ids
    + 'names_taxons_ids': Above plus analyzes taxon rank
+ itis_usage (default =  False): Option to validate data also with Itis if Worms service does not provide a positive answer for a given scientific name

##### Results
+ The validation section of the result tables compares the scientific names, taxon ranks and scientific name IDs from the dataset with the corresponding values from the databases. If the values in the respective columns are the same, 'Oui/Yes' is displayed. Otherwise, 'Non/No' is displayed.

Load different types of DWC files:

In [15]:
# Read the occurrence core test file, then preview its first five rows
# (five is the default row count for head()).
occurrence_core = pd.read_csv(
    'test_occurrence_core_check_scientificname_and_ids.csv'
)
occurrence_core.head()

Unnamed: 0,datasetID,occurrenceID,eventDate,countryCode,location,decimalLatitude,decimalLongitude,accuracyInMeters,kingdom,commonName,...,acceptedNameUsage,acceptedNameUsageID,taxonID,taxon_id,iNaturalistID,taxonRank,occurrenceStatus,basisOfRecord,quality_grade,url
0,ABCD-2021,ABCD-001,2021-08-11 06:02:26,CA,Randolph Island,45.268169,-66.122442,8,Plantae,reed canary grass,...,Phalaris arundinacea,urn:lsid:itis.gov:itis_tsn:41335,https://www.gbif.org/species/5289756,63337,90776330,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776330
1,ABCD-2021,ABCD-002,2021-08-11 06:03:12,CA,Randolph Island,45.268148,-66.122424,9,Plantae,prairie cordgrass,...,Sporobolus michauxianus,urn:lsid:ipni.org:names:77145291-1,https://www.gbif.org/species/9592414,772984,90776368,Species,present,HumanObservation,needs_id,https://www.inaturalist.org/observations/90776368
2,ABCD-2021,ABCD-003,2021-08-11 06:04:32,CA,Randolph Island,45.26814,-66.12222,8,Plantae,three-square bulrush,...,Schoenoplectus pungens,urn:lsid:itis.gov:itis_tsn:508146,https://www.gbif.org/species/2711190,59072,90776406,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776406
3,ABCD-2021,ABCD-004,2021-08-11 06:07:02,CA,Randolph Island,45.268185,-66.12205,9,Plantae,red pigweed,...,Oxybasis rubra,urn:lsid:ipni.org:names:77121013-1,https://www.gbif.org/species/7725469,931679,90776443,Species,present,HumanObservation,needs_id,https://www.inaturalist.org/observations/90776443
4,ABCD-2021,ABCD-005,2021-08-11 06:08:36,CA,Randolph Island,45.268295,-66.121987,8,Plantae,rough cocklebur,...,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089,https://www.gbif.org/species/3089154,57920,90776465,Species,present,HumanObservation,research,https://www.inaturalist.org/observations/90776465


In [14]:
# Read the occurrence extension test file, then preview its first five rows
# (five is the default row count for head()).
occurrence_extension = pd.read_csv(
    'test_occurrence_extension_check_scientificname_and_ids.csv'
)
occurrence_extension.head()

Unnamed: 0,eventID,occurrenceID,scientificName,aphiaID,scientificNameID,taxonRank,occurrenceStatus,organismQuantityType,organismQuantity,basisofRecord
0,0003_ABCD_BE10C1,0003_ABCD_BE10C1_1,Mesodesma arctatum,156805.0,urn:lsid:marinespecies.org:taxname:156805,species,present,Individual count,2,HumanObservation
1,0003_ABCD_BE10C1,0003_ABCD_BE10C1_2,Nephtys ciliata,130356.0,urn:lsid:marinespecies.org:taxname:130356,species,present,Individual count,1,HumanObservation
2,0003_ABCD_BE10C2,0003_ABCD_BE10C2_3,Leucon nasicoides,148682.0,urn:lsid:marinespecies.org:taxname:148682,species,present,Individual count,2,HumanObservation
3,0003_ABCD_BE10C2,0003_ABCD_BE10C2_4,Mesodesma arctatum,156805.0,urn:lsid:marinespecies.org:taxname:156805,species,present,Individual count,5,HumanObservation
4,0003_ABCD_BE10C2,0003_ABCD_BE10C2_5,Nephtys ciliata,130356.0,urn:lsid:marinespecies.org:taxname:130356,species,present,Individual count,1,HumanObservation


Try the check_scientificname_and_ids function:

In [16]:
# Run the validation in 'names' mode on the occurrence core data.
# The call is the cell's last expression, so the returned table is rendered as output.
check_scientificname_and_ids(occurrence_core, 'names')

21 : 204: Worms Lysimachia terrestris 
35 : 204: Worms Mentha canadensis 
25 : 200: Worms Tracheophyta 
14 : 204: Worms Sium suave 
40 : 204: Worms Prunella vulgaris 
20 : 204: Worms Lysimachia ciliata 
32 : 204: Worms Carex pallescens 
49 : 204: Worms Hypericum mutilum 
17 : 204: Worms Harmonia axyridis 
10 : 204: Worms Alisma triviale 
4 : 204: Worms Alnus alnobetula 
7 : 204: Worms Acer rubrum 
23 : 200: Worms Calidris minutilla 
6 : 204: Worms Echinocystis lobata 
22 : 200: Worms Polistes fuscatus 
52 : 200: Worms Juncus gerardii 
15 : 204: Worms Epalpus signifer 
96 : 204: Worms Pastinaca sativa 
19 : 200: Worms Schoenoplectus 
41 : 200: Worms Ranunculus repens 
24 : 204: Worms Linaria vulgaris 
31 : 204: Worms Lycopus uniflorus 
43 : 204: Worms Hylotelephium telephium 
2 : 204: Worms Argentina anserina 
51 : 200: Worms Zizania 
0 : 204: Worms Rhagonycha fulva 
9 : 200: Worms Cichorium intybus 
42 : 200: Worms Bidens 
12 : 200: Worms Elymus virginicus 
28 : 204: Worms Frangula aln

Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,ScientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
38,Ulva intestinalis,Oui/Yes,234471,accepted,,Species,234471,Ulva intestinalis,urn:lsid:marinespecies.org:taxname:234471
21,Vicia,Oui/Yes,416135,accepted,,Genus,416135,Vicia,urn:lsid:marinespecies.org:taxname:416135
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127
4,Xanthium strumarium,Oui/Yes,1092089,accepted,,Species,1092089,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089


In [17]:
# Run the validation in 'names' mode on the occurrence extension data.
# The call is the cell's last expression, so the returned table is rendered as output.
check_scientificname_and_ids(occurrence_extension, 'names')

1 : 200: Worms Microspio 
8 : 200: Worms Oediceros borealis 
2 : 200: Worms Mesodesma arctatum 
16 : 200: Worms Littorina saxatilis 
18 : 200: Worms Leucon nasicoides 
3 : 200: Worms Eteone longa 
7 : 200: Worms Macoma balthica 
6 : 200: Worms Nephtys ciliata 
13 : 200: Worms Mya arenaria 
12 : 200: Worms Mytilus edulis 
17 : 200: Worms Littorina obtusata 
11 : 200: Worms Hydrobia minuta 
5 : 200: Worms Psammonyx nobilis 
9 : 200: Worms Nereis diversicolor 
4 : 200: Worms Idotea balthica 
14 : 200: Worms Gammarus oceanicus 
10 : 200: Worms Oligochaeta 
15 : 200: Worms Nematoda 


Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,ScientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
11,Hydrobia minuta,Non/No,152020.0,unaccepted,preoccupied name,Species,574096.0,Ecrobia truncata,urn:lsid:marinespecies.org:taxname:152020
2,Leucon nasicoides,Non/No,148682.0,alternate representation,,Species,110619.0,Leucon (Leucon) nasicoides,urn:lsid:marinespecies.org:taxname:148682
8,Nereis diversicolor,Non/No,340537.0,deleted,AphiaID resurrection,,152302.0,Hediste diversicolor,urn:lsid:marinespecies.org:taxname:340537
4,Psammonyx nobilis,Non/No,158140.0,unaccepted,superseded recombination,Species,1255501.0,Wecomedon nobilis,urn:lsid:marinespecies.org:taxname:158140
9,,Oui/Yes,,,,,,,
6,Eteone longa,Oui/Yes,130616.0,accepted,,Species,130616.0,Eteone longa,urn:lsid:marinespecies.org:taxname:130616
15,Gammarus oceanicus,Oui/Yes,102285.0,accepted,,Species,102285.0,Gammarus oceanicus,urn:lsid:marinespecies.org:taxname:102285
18,Idotea balthica,Oui/Yes,119039.0,accepted,,Species,119039.0,Idotea balthica,urn:lsid:marinespecies.org:taxname:119039
12,Littorina obtusata,Oui/Yes,140263.0,accepted,,Species,140263.0,Littorina obtusata,urn:lsid:marinespecies.org:taxname:140263
17,Littorina saxatilis,Oui/Yes,445895.0,unaccepted,,Species,140264.0,Littorina saxatilis,urn:lsid:marinespecies.org:taxname:445895


In [18]:
# In 'names_ids' mode the result is unpacked into two tables.
table1, table2 = check_scientificname_and_ids(occurrence_core, 'names_ids')
# Display the first table; the second one is shown in its own cell.
table1

2 : 204: Worms Argentina anserina 
0 : 204: Worms Rhagonycha fulva 
5 : 204: Worms Persicaria sagittata 
20 : 204: Worms Lysimachia ciliata 
3 : 204: Worms Betula papyrifera 
4 : 204: Worms Alnus alnobetula 
17 : 204: Worms Harmonia axyridis 
10 : 204: Worms Alisma triviale 
15 : 204: Worms Epalpus signifer 
8 : 200: Worms Larus argentatus 
34 : 204: Worms Matteuccia struthiopteris 
32 : 204: Worms Carex pallescens 
31 : 204: Worms Lycopus uniflorus 
16 : 204: Worms Halerpestes cymbalaria 
28 : 204: Worms Frangula alnus 
7 : 204: Worms Acer rubrum 
6 : 204: Worms Echinocystis lobata 
27 : 204: Worms Apocynum cannabinum 
18 : 200: Worms Thuja occidentalis 
35 : 204: Worms Mentha canadensis 
26 : 204: Worms Iris versicolor 
14 : 204: Worms Sium suave 
13 : 200: Worms Nuphar 
25 : 200: Worms Tracheophyta 
37 : 204: Worms Chelone glabra 
30 : 204: Worms Viburnum opulus 
21 : 204: Worms Lysimachia terrestris 
36 : 204: Worms Spiraea alba 
59 : 204: Worms Sorbus 
24 : 204: Worms Linaria vulg

Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,ScientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
38,Ulva intestinalis,Oui/Yes,234471,accepted,,Species,234471,Ulva intestinalis,urn:lsid:marinespecies.org:taxname:234471
21,Vicia,Oui/Yes,416135,accepted,,Genus,416135,Vicia,urn:lsid:marinespecies.org:taxname:416135
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127
4,Xanthium strumarium,Oui/Yes,1092089,accepted,,Species,1092089,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089


In [19]:
# Display the second table returned by the 'names_ids' call.
table2

Unnamed: 0_level_0,Ref. ID,Validation,Validation,Dataset Values,Dataset Values,Database values,Database values
Unnamed: 0_level_1,OccurrenceID,ScientificName_Validation,scientificNameID_Validation,ScientificName,ScientificNameID,Valid_Name,LSID
0,ABCD-001,Non/No,Non/No,Phalaris arundinacea,urn:lsid:itis.gov:itis_tsn:41335,,
1,ABCD-002,Non/No,Non/No,Sporobolus michauxianus,urn:lsid:ipni.org:names:77145291-1,,
2,ABCD-003,Non/No,Non/No,Schoenoplectus pungens,urn:lsid:itis.gov:itis_tsn:508146,,
3,ABCD-004,Non/No,Non/No,Oxybasis rubra,urn:lsid:ipni.org:names:77121013-1,,
5,ABCD-006,Non/No,Non/No,Linaria vulgaris,urn:lsid:itis.gov:itis_tsn:33216,,
...,...,...,...,...,...,...,...
288,ABCD-289,Oui/Yes,Oui/Yes,Schoenoplectus,urn:lsid:marinespecies.org:taxname:382429,Schoenoplectus,urn:lsid:marinespecies.org:taxname:382429
290,ABCD-291,Oui/Yes,Oui/Yes,Rosa,urn:lsid:marinespecies.org:taxname:425714,Rosa,urn:lsid:marinespecies.org:taxname:425714
294,ABCD-295,Oui/Yes,Oui/Yes,Tussilago farfara,urn:lsid:marinespecies.org:taxname:594804,Tussilago farfara,urn:lsid:marinespecies.org:taxname:594804
296,ABCD-297,Oui/Yes,Oui/Yes,Galium,urn:lsid:marinespecies.org:taxname:993892,Galium,urn:lsid:marinespecies.org:taxname:993892


In [20]:
# In 'names_taxons_ids' mode the result is unpacked into two tables,
# each displayed in its own cell.
table3, table4 = check_scientificname_and_ids(occurrence_core, 'names_taxons_ids')

1 : 200: Worms Erechtites hieraciifolius 
4 : 204: Worms Alnus alnobetula 
0 : 204: Worms Rhagonycha fulva 
6 : 204: Worms Echinocystis lobata 
13 : 200: Worms Nuphar 
21 : 204: Worms Lysimachia terrestris 
5 : 204: Worms Persicaria sagittata 
27 : 204: Worms Apocynum cannabinum 
32 : 204: Worms Carex pallescens 
20 : 204: Worms Lysimachia ciliata 
12 : 200: Worms Elymus virginicus 
3 : 204: Worms Betula papyrifera 
8 : 200: Worms Larus argentatus 
25 : 200: Worms Tracheophyta 
29 : 200: Worms Plantago 
10 : 204: Worms Alisma triviale 
16 : 204: Worms Halerpestes cymbalaria 
22 : 200: Worms Polistes fuscatus 
26 : 204: Worms Iris versicolor 
15 : 204: Worms Epalpus signifer 
17 : 204: Worms Harmonia axyridis 
49 : 204: Worms Hypericum mutilum 
31 : 204: Worms Lycopus uniflorus 
2 : 204: Worms Argentina anserina 
11 : 200: Worms Tussilago farfara 
18 : 200: Worms Thuja occidentalis 
24 : 204: Worms Linaria vulgaris 
14 : 204: Worms Sium suave 
37 : 204: Worms Chelone glabra 
34 : 204: W

In [21]:
# Display the first table returned by the 'names_taxons_ids' call.
table3

Unnamed: 0_level_0,Dataset Values,Validation,Database values,Database values,Database values,Database values,Database values,Database values,Database values
Unnamed: 0_level_1,ScientificName,Exact_Match,TaxonID,Status,Unacceptreason,Taxon_Rank,Valid_TaxonID,Valid_Name,LSID
127,Acer rubrum,Non/No,,,,,,,
80,Acorus americanus,Non/No,,,,,,,
115,Alisma triviale,Non/No,,,,,,,
49,Alnus alnobetula,Non/No,,,,,,,
33,Alnus incana,Non/No,,,,,,,
...,...,...,...,...,...,...,...,...,...
38,Ulva intestinalis,Oui/Yes,234471,accepted,,Species,234471,Ulva intestinalis,urn:lsid:marinespecies.org:taxname:234471
21,Vicia,Oui/Yes,416135,accepted,,Genus,416135,Vicia,urn:lsid:marinespecies.org:taxname:416135
73,Vicia cracca,Oui/Yes,1492127,accepted,,Species,1492127,Vicia cracca,urn:lsid:marinespecies.org:taxname:1492127
4,Xanthium strumarium,Oui/Yes,1092089,accepted,,Species,1092089,Xanthium strumarium,urn:lsid:marinespecies.org:taxname:1092089


In [22]:
# Display the second table returned by the 'names_taxons_ids' call.
table4

Unnamed: 0_level_0,Ref. ID,Validation,Validation,Validation,Dataset Values,Dataset Values,Dataset Values,Database values,Database values,Database values
Unnamed: 0_level_1,OccurrenceID,ScientificName_Validation,TaxonRank_Validation,scientificNameID_Validation,ScientificName,TaxonRank,ScientificNameID,Valid_Name,Taxon_Rank,LSID
0,ABCD-001,Non/No,Non/No,Non/No,Phalaris arundinacea,Species,urn:lsid:itis.gov:itis_tsn:41335,,,
1,ABCD-002,Non/No,Non/No,Non/No,Sporobolus michauxianus,Species,urn:lsid:ipni.org:names:77145291-1,,,
2,ABCD-003,Non/No,Non/No,Non/No,Schoenoplectus pungens,Species,urn:lsid:itis.gov:itis_tsn:508146,,,
3,ABCD-004,Non/No,Non/No,Non/No,Oxybasis rubra,Species,urn:lsid:ipni.org:names:77121013-1,,,
5,ABCD-006,Non/No,Non/No,Non/No,Linaria vulgaris,Species,urn:lsid:itis.gov:itis_tsn:33216,,,
...,...,...,...,...,...,...,...,...,...,...
288,ABCD-289,Oui/Yes,Oui/Yes,Oui/Yes,Schoenoplectus,Genus,urn:lsid:marinespecies.org:taxname:382429,Schoenoplectus,Genus,urn:lsid:marinespecies.org:taxname:382429
290,ABCD-291,Oui/Yes,Oui/Yes,Oui/Yes,Rosa,Genus,urn:lsid:marinespecies.org:taxname:425714,Rosa,Genus,urn:lsid:marinespecies.org:taxname:425714
294,ABCD-295,Oui/Yes,Oui/Yes,Oui/Yes,Tussilago farfara,Species,urn:lsid:marinespecies.org:taxname:594804,Tussilago farfara,Species,urn:lsid:marinespecies.org:taxname:594804
296,ABCD-297,Oui/Yes,Oui/Yes,Oui/Yes,Galium,Genus,urn:lsid:marinespecies.org:taxname:993892,Galium,Genus,urn:lsid:marinespecies.org:taxname:993892


Try the check_scientificname_and_ids function - itis_usage:

In [23]:
# The function can take a few minutes to process with itis_usage=True, so the
# call is disabled by default. Set RUN_ITIS_CHECK to True to test the
# 'itis_usage' parameter.
RUN_ITIS_CHECK = False

if RUN_ITIS_CHECK:
    # display() is used because an expression nested inside an `if` block is
    # not rendered automatically as cell output.
    display(check_scientificname_and_ids(occurrence_core, 'names', itis_usage=True).head())