In [1]:
import json

# Accumulator for the per-species summary dicts; the assembly loop later in the notebook appends to it.
taxonomy_lookup = []

In [2]:
def get_wlci_xdd_cache(path="../sources/WLCI_Species_List_from_Literature.json"):
    '''
    description: Import WLCI xdd species cached data

    parameters:
        path: location of the JSON file to read. Defaults to the WLCI species
            list shipped under ../sources, so existing no-argument calls are unchanged.

    returns: the parsed JSON content of the file.

    raises: OSError if the file cannot be opened, json.JSONDecodeError if it is not valid JSON.
    '''
    # Explicit UTF-8 so the result does not depend on the platform's locale encoding
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_wlci_xdd_info(name, wlci_sp_mentions=None):
    '''
    description: Extract summary information for a provided species from the WLCI_Species_List_from_Literature.json file.
    This information summarizes mentions of scientific names from WLCI literature included in the xDD database.

    parameters:
        name: scientific name compared against each mention's 'Scientific Name'.
        wlci_sp_mentions: optional, already-loaded list of mention records. When omitted
            the records are loaded with get_wlci_xdd_cache(), as before. Passing the list
            avoids re-reading and re-parsing the file on every call.

    returns: dict with keys 'scientific_name', 'wlci_xdd_per', 'wlci_xdd_hits'
        and 'wlci_xdd_doi_list'.
    '''
    if wlci_sp_mentions is None:
        wlci_sp_mentions = get_wlci_xdd_cache()

    unique_wlci_xdd_articles = {spp["xdd_id"] for spp in wlci_sp_mentions}
    name_mentions = [mention for mention in wlci_sp_mentions if mention['Scientific Name'] == name]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the list is
    # identical from run to run (a set's iteration order is not).
    doi_list = list(dict.fromkeys(mention['doi'] for mention in name_mentions))

    # Percent of wlci_xdd articles that mention the species "name".
    # NOTE(review): the numerator counts unique DOIs while the denominator counts
    # unique xdd_ids -- confirm both identify the same set of articles.
    if unique_wlci_xdd_articles:
        wlci_xdd_per = len(doi_list)/len(unique_wlci_xdd_articles)*100
    else:
        # An empty cache has no articles; report 0 instead of dividing by zero
        wlci_xdd_per = 0.0

    wlci_xdd_info = dict()
    wlci_xdd_info['scientific_name'] = name
    wlci_xdd_info['wlci_xdd_per'] = wlci_xdd_per
    wlci_xdd_info['wlci_xdd_hits'] = sum(int(mention['n_hits']) for mention in name_mentions)
    wlci_xdd_info['wlci_xdd_doi_list'] = doi_list
    return wlci_xdd_info

def get_xdd_snippets(name, xdd_snippet_cache):
    '''
    description: Report the xDD snippet API reference and the number of snippet
    documents cached for a species of interest.

    Returns an empty dict when no cache record for the species carries data.
    '''
    # Walk the whole cache and keep every record for this search term that has data
    hits = []
    for entry in xdd_snippet_cache:
        if entry['parameters']['Search Term'] == name:
            if entry['data']:
                hits.append(entry)

    if len(hits) == 0:
        return dict()

    # Only the first hit is used: one record per species name is expected, and the
    # single known duplicate differed only in its timestamp.
    first_hit = hits[0]
    summary = dict()
    summary['xdd_snippet_api'] = first_hit['processing_metadata']['api']
    summary['xdd_snippet_doc_ct'] = len(first_hit['data'])
    return summary
    


In [3]:
# Summarize ITIS information for a species from an already-loaded ITIS cache
def itis_summary(specie_name, itis_cache):
    '''
    description: Extract summary information for a provided species from the itis.json file.
    This information summarizes relevant ITIS information.

    parameters:
        specie_name: scientific name compared against each cache record's
            ['parameters']['Scientific Name'].
        itis_cache: iterable of ITIS cache records (the parsed itis.json content).

    returns: dict of 'itis_*' fields. Empty when no cache record with data matches
        the species, which mirrors how the SGCN and xDD helpers report a miss.
    '''
    itis_data = dict()

    # First matching record that actually carries data (same rule as the SGCN/xDD helpers)
    records = [i for i in itis_cache if i['parameters']['Scientific Name']==specie_name if i['data']]
    if len(records) == 0:
        return itis_data
    record = records[0]

    # Collected across ALL entries; resetting this per entry would keep only the
    # last non-valid TSN.
    itis_related_tsn = []

    for itis_entry in record['data']:
        if itis_entry['usage'] in ('valid', 'accepted'):
            itis_data['itis_tsn'] = itis_entry['tsn']
            itis_data['itis_tsn_usage'] = itis_entry['usage']
            # One field per taxonomic rank, e.g. itis_Kingdom, itis_Genus
            for level in itis_entry['biological_taxonomy']:
                rank = level['rank']
                itis_data[f'itis_{rank}'] = level['name']
            if 'commonnames' in itis_entry:
                # dict.fromkeys de-duplicates while keeping a reproducible order
                itis_data['itis_common_names'] = list(dict.fromkeys(
                    nm['name'] for nm in itis_entry['commonnames'] if nm['language']=='English'))
        else:
            itis_related_tsn.append(itis_entry['tsn'])

        # These two are read from every entry, valid or not; when several entries
        # carry the key, the last one wins.
        if 'geographicDivision' in itis_entry:
            itis_data['itis_geographic_division'] = list(dict.fromkeys(
                i['geographic_value'] for i in itis_entry['geographicDivision']))
        if 'jurisdiction' in itis_entry:
            itis_data['itis_native_to'] = list(dict.fromkeys(
                i['jurisdiction_value'] for i in itis_entry['jurisdiction'] if i['origin']=='Native'))

    if len(itis_related_tsn) > 0:
        itis_data['itis_related_tsn'] = itis_related_tsn

    return itis_data

In [4]:
# Summarize SGCN information for a species from an already-loaded SGCN cache
def sgcn_summary(specie_name, sgcn_cache):
    '''
    description: Extract summary information for a provided species from the sgcn.json file.

    parameters:
        specie_name: scientific name compared against each cache record's
            ['parameters']['Scientific Name'].
        sgcn_cache: iterable of SGCN cache records (the parsed sgcn.json content).

    returns: dict with 'sgcn_statelist_2005', 'sgcn_statelist_2015' and 'sgcn_reference',
        or an empty dict when no record with data matches the species.
    '''
    sgcn_data = dict()

    # Filter on the function's own argument, not on a global that happens to be in scope
    record = [i for i in sgcn_cache if i['parameters']['Scientific Name']==specie_name if i['data']]
    if len(record) > 0:
        record = record[0] #tested for only 1 record per specie name, one duplicate existed but all data other than timestamp was same so ignored
        sgcn_data['sgcn_statelist_2005'] = record['data']['statelist_2005']
        sgcn_data['sgcn_statelist_2015'] = record['data']['statelist_2015']
        sgcn_data['sgcn_reference'] = record['processing_metadata']['api']
    return sgcn_data
    

In [5]:
# Load the three cached lookups (xDD snippets, SGCN, ITIS) from ../cache
with open('../cache/xdd.json', 'r') as f:
    xdd_snippet_cache = json.load(f)

with open('../cache/sgcn.json', 'r') as f:
    sgcn_cache = json.load(f)

with open('../cache/itis.json', 'r') as f:
    itis_cache = json.load(f)


In [6]:
wlci_sp_mentions = get_wlci_xdd_cache()
# De-duplicate while keeping first-seen order so species are processed in the
# same order on every run (a set's iteration order is not stable for strings).
sci_name_list = list(dict.fromkeys(spp["Scientific Name"] for spp in wlci_sp_mentions))

# Start from an empty list so re-running this cell does not append a second
# copy of every species.
taxonomy_lookup = []

for name in sci_name_list:
    # One flat document per species, merged from the four sources below
    specie_document = dict()

    wlci_xdd_info = get_wlci_xdd_info(name)
    specie_document.update(wlci_xdd_info)

    itis_data = itis_summary(name, itis_cache)
    specie_document.update(itis_data)

    sgcn_data = sgcn_summary(name, sgcn_cache)
    specie_document.update(sgcn_data)

    xdd_snippets = get_xdd_snippets(name, xdd_snippet_cache)
    specie_document.update(xdd_snippets)

    taxonomy_lookup.append(specie_document)

In [7]:
# Display the assembled list of per-species summary documents
taxonomy_lookup

[{'scientific_name': 'Reithrodontomys megalotis',
  'wlci_xdd_per': 3.4482758620689653,
  'wlci_xdd_hits': 3,
  'wlci_xdd_doi_list': ['10.1111/1365-2664.12513'],
  'itis_tsn': '180343',
  'itis_tsn_usage': 'valid',
  'itis_Kingdom': 'Animalia',
  'itis_Subkingdom': 'Bilateria',
  'itis_Infrakingdom': 'Deuterostomia',
  'itis_Phylum': 'Chordata',
  'itis_Subphylum': 'Vertebrata',
  'itis_Infraphylum': 'Gnathostomata',
  'itis_Superclass': 'Tetrapoda',
  'itis_Class': 'Mammalia',
  'itis_Subclass': 'Theria',
  'itis_Infraclass': 'Eutheria',
  'itis_Order': 'Rodentia',
  'itis_Suborder': 'Myomorpha',
  'itis_Superfamily': 'Muroidea',
  'itis_Family': 'Cricetidae',
  'itis_Subfamily': 'Neotominae',
  'itis_Genus': 'Reithrodontomys',
  'itis_Subgenus': 'Reithrodontomys (Reithrodontomys)',
  'itis_Species': 'Reithrodontomys megalotis',
  'itis_common_names': ['Western Harvest Mouse'],
  'itis_geographic_division': ['North America', 'Middle America'],
  'itis_native_to': ['Continental US', 'M