This notebook uses the existing Wyoming Landscape Conservation Initiative (WLIC) literature database and the eXtract Dark Data database (xDD, also known as GeoDeepDive: https://geodeepdive.org/) to build a list of unique specie names that have been published on in WLCI efforts.

Output Files

1) sources/WLCI_Literature_xDD_Matches.json
    *This file stores identifiers of articles from the WLCI literature database and xDD database where articles have a common DOI (same article)
    
2) sources/WLCI_Species_List_from_Literature.json
    *This files stores a list of scientific names that were mentioned in WLCI literature that are currently present in xDD based on the common DOI match documented in file one 

In [3]:
#Import needed packages
import requests
import pandas as pd
import json
import bispy
from IPython.display import display

bis_utils = bispy.bis.Utils()

In [4]:
#The WLCI literature database was exported as CSL JSON and posted to this ScienceBase Item: https://www.sciencebase.gov/catalog/item/4f4e476fe4b07f02db47e19f 
#as file wlci_lit_20190702.json (This should be replaced with the offical literature database release as that is finalized).  The file can be accessed via 

#File download url
lit_json_file = "https://www.sciencebase.gov/catalog/file/get/4f4e476fe4b07f02db47e19f?f=__disk__ad%2F2e%2Fcc%2Fad2ecc69ef3957ef9cd23d4cba1c7c68b983acad"

#Request and return JSON of literature data
lit_json = requests.get(lit_json_file).json()

In [8]:
#Print number of records in literature database.  
print (f'Number of articles in WLCI literature database: {len(lit_json)}')

Number of articles in WLCI literature database: 303


In [11]:
#Display the first record from the literature database
lit_json[0:20]

[{'id': 'http://zotero.org/groups/2341914/items/R3NSCC9R',
  'type': 'article-journal',
  'title': 'Greater Sage-Grouse Population Trends Across Wyoming: WY Sage-grouse Population Viability Analysis',
  'container-title': 'The Journal of Wildlife Management',
  'page': '397-412',
  'volume': '82',
  'issue': '2',
  'source': 'DOI.org (Crossref)',
  'abstract': "The scale at which analyses are performed can have an effect on model results and often one scale does not accurately describe the ecological phenomena of interest (e.g., population trends) for wide‐ranging species: yet, most ecological studies are performed at a single, arbitrary scale. To best determine local and regional trends for greater sage‐grouse (Centrocercus urophasianus) in Wyoming, USA, we modeled density‐independent and ‐dependent population growth across multiple spatial scales relevant to management and conservation (Core Areas [habitat encompassing approximately 83% of the sage‐grouse population on ∼24% of surfac

In [10]:
#Set function for accessing xDD database
def xdd_api(route, params):
    """Create list of docs mentioning a term of interest
    Parameters : see https://geodeepdive.org/api for more detail
    ----------
    routes : str of available api routes for xDD 
    params : str of key value pairs of paramaters:values separated by &
    """
    base_url = 'https://geodeepdive.org/api'
    search = (base_url + '/' + route + '?' + str(params))
    #print (search)
    try:
        r=requests.get(search)
        if r.status_code == 200 and 'success' in r.json():
            json_r = r.json()
            data = json_r['success']['data']
            return data
        elif r.status_code == 200:
            data = []
            return data
        else:
            raise Exception('xDD API returning: {}'.format(r.status_code))
    except Exception as e:
        raise Exception(e)

In [12]:
#Create list of relationship information between WLCI database and xDD database
wlci_xdd = []
#For each reference in the WLCI literature database attempt to match article to a GeoDeepDive ID using the DOI.
#Note that not all articles with have a DOI documented for both databases.  In these cases we could attempt a title match in future iterations.
for ref in lit_json:
    wlci_id = ref['id']
    if 'DOI' in ref:
        doi = ref['DOI']
    #In Zotero some types of articles did not provide an 'id' field.  If this was the case we stored available DOIs in the note field
    elif 'note' in ref and 'OCLC' not in ref['note']:
        doi = ref['note']
        
    #Set parameters for xDD search, return only 1 result (max=1)    
    route = 'articles'
    param = 'max=1&doi='+str(doi)
    
    #Search xDD, return json data
    xdd_data = xdd_api(route, param)
    #If there is a match for the article from WLCI to xDD then record the identifiers and their relationship across databases
    if xdd_data:
        xdd_id = xdd_data[0]['_gddid']
        wlci_xdd.append(
            {
                'wlci_id': wlci_id, 
                'relation': 'doi_match', 
                'xdd_id':xdd_id, 
                'param':param, 
                'title': ref['title']
            }
        )

In [19]:
#Store relationship data between WLCI literature database and xDD database
with open('sources/WLCI_Literature_xDD_Matches.json', 'w') as f:
    f.write(json.dumps(wlci_xdd, indent=4))
    f.close()

In [15]:
#Show first 5 matches as example
wlci_xdd[0:6]

[{'wlci_id': 'http://zotero.org/groups/2341914/items/R3NSCC9R',
  'relation': 'doi_match',
  'xdd_id': '5c2c30a41faed655488b299b',
  'param': 'max=1&doi=10.1002/jwmg.21386',
  'title': 'Greater Sage-Grouse Population Trends Across Wyoming: WY Sage-grouse Population Viability Analysis'},
 {'wlci_id': 'http://zotero.org/groups/2341914/items/Z7TMPV6Q',
  'relation': 'doi_match',
  'xdd_id': '579f4458cf58f123c56623f8',
  'param': 'max=1&doi=10.1016/j.ecolind.2015.03.002',
  'title': 'Forecasting Sagebrush Ecosystem Components and Greater Sage-grouse Habitat for 2050: Learning from Past Climate Patterns and Landsat Imagery to Predict the Future'},
 {'wlci_id': 'http://zotero.org/groups/2341914/items/XYW5M83E',
  'relation': 'doi_match',
  'xdd_id': '5d4384980b45c76cafa2c0eb',
  'param': 'max=1&doi=10.1002/jwmg.1050',
  'title': 'Effects of Lek Count Protocols on Greater Sage-grouse Population Trend Estimates: Lek Count Timing and Trend Estimates'},
 {'wlci_id': 'http://zotero.org/groups/234

In [16]:
#Create list of species that are mentioned in the WLCI efforts 
specie_mentions = list()

#Search each article for a list of ITIS terms (this example uses scientific name only) and create a unique list for all WLCI articles in xDD
#Also record hits which are the number of times that term was referenced in the article
for xdd_record in wlci_xdd:
    route = 'terms'
    param = f"docid={xdd_record['xdd_id']}&dictionary=ITIS"
    xdd_data = xdd_api(route, param)
    for x in xdd_data:
        #x['term'] is a list of ITIS terms mentioned in the article
        if len(x['term'].split()) > 1:
            specie = x['term']
            hits = str(x['n_hits'])
            specie_mentions.append(
                {'Scientific Name':specie, 
                 'xdd_id':xdd_id, 
                 'wlci_id':wlci_id, 
                 'n_hits':hits
                }
            ) 

In [17]:
#Show first 5 mentions
specie_mentions[0:6]

[{'Scientific Name': 'Branta canadensis',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '1'},
 {'Scientific Name': 'Castor canadensis',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '1'},
 {'Scientific Name': 'Haliaeetus leucocephalus',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '6'},
 {'Scientific Name': 'Macaca sylvanus',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '1'},
 {'Scientific Name': 'Artemisia tridentata',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '1'},
 {'Scientific Name': 'Centrocercus urophasianus',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '5'}]

In [20]:
# Cache the array of retrieved documents and return/display a random sample for verification
display(bis_utils.doc_cache("sources/WLCI_Species_List_from_Literature.json", specie_mentions))

{'Doc Cache File': 'sources/WLCI_Species_List_from_Literature.json',
 'Number of Documents in Cache': 346,
 'Document Number 341': {'Scientific Name': 'Centrocercus urophasianus',
  'xdd_id': '5c4e3f571faed655489408c3',
  'wlci_id': 'http://zotero.org/groups/2341914/items/KP3R7Q33',
  'n_hits': '10'}}