In [1]:
import json
import time
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests

In [2]:
def log_df(df, nrows=5):
    """Print the shape of `df` and return its first `nrows` rows for display."""
    shape = df.shape
    print(shape)
    preview = df.head(nrows)
    return preview

In [3]:
def print_json(obj):
    """Pretty-print `obj` as JSON text indented by four spaces."""
    rendered = json.dumps(obj, indent=4)
    print(rendered)

In [4]:
# Collect every taxa_list.csv found anywhere below ../data, as plain strings.
raw_data_paths = Path('../data').rglob('taxa_list.csv')
data_paths = list(map(str, raw_data_paths))
print(data_paths)

['../data/clarkstown-high-school-north/taxa_list.csv', '../data/cedar-creek-reserve/taxa_list.csv', '../data/los-angeles-bioblitz/taxa_list.csv', '../data/ciencia-ciudadana-peru-bats/taxa_list.csv', '../data/ciencia-ciudadana-peru-bees/taxa_list.csv']


# interactions

## create interactions taxa csv

In [351]:
# Keep only the identifying columns of the combined taxa list and store them
# as the seed taxa list for the interaction lookups.
cols = ['scientific_name', 'taxon_id', 'common_name']
taxa_df = pd.read_csv(
    'outputs/combine_taxa_list.csv',
    usecols=cols,
    dtype=str,
)
taxa_df.to_csv('outputs/interactions_taxa_list.csv', index=False)

## create interactions

In [375]:
# Load the interactions taxa list with every column read as text.
taxa_path = 'outputs/interactions_taxa_list.csv'
taxa_df = pd.read_csv(
    taxa_path,
    dtype=str,
)
log_df(taxa_df)

(9406, 3)


Unnamed: 0,scientific_name,common_name,taxon_id
0,Deidamia inscriptum,Lettered Sphinx,143452
1,Acer,maples,47727
2,Plantago lanceolata,ribwort plantain,53178
3,Poa pratensis,Kentucky bluegrass,60307
4,Magnoliopsida,dicots,47124


In [376]:
# Load the interactions collected so far, with every column read as text.
interaction_path = 'outputs/interactions.csv'
interaction_df = pd.read_csv(
    interaction_path,
    dtype=str,
)
log_df(interaction_df)

(47856, 7)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction
0,143452,Lettered Sphinx,Deidamia inscriptum,Parthenocissus,,50280,eats
1,143452,Lettered Sphinx,Deidamia inscriptum,Vitis,grapevines,60773,eats
2,143452,Lettered Sphinx,Deidamia inscriptum,Ampelopsis brevipedunculata,porcelain berry,457553,eats
3,143452,Lettered Sphinx,Deidamia inscriptum,Vitis vinifera,wine grape,79519,eats
4,143452,Lettered Sphinx,Deidamia inscriptum,Parthenocissus quinquefolia,Virginia creeper,50278,eats


In [377]:
def fetchGlobi(taxon_name, interaction, timeout=30):
    """Ask the GloBI API which taxa `taxon_name` has `interaction` with.

    Parameters
    ----------
    taxon_name : str
        Name of the subject taxon. It is percent-encoded before being placed
        in the URL path so characters such as '/', '?' or '#' cannot change
        which endpoint is requested.
    interaction : str
        Interaction segment of the URL (the notebook passes values such as
        'eats' or 'pollinatedBy').
    timeout : float, optional
        Seconds to wait for the server before giving up on this request.

    Returns
    -------
    The third field of the first row of the response's 'data' array
    (formatInteractions iterates it as a list of taxon names), or None when
    the name is not a non-empty string, the request fails, the status is not
    200, or the response carries no rows.
    """
    # Missing names (e.g. NaN read from the CSV) cannot be looked up; skip
    # them before sleeping or touching the network.
    if not isinstance(taxon_name, str) or not taxon_name.strip():
        return None

    # Pause before every request to stay gentle on the public API.
    time.sleep(0.25)
    apiBase = 'https://api.globalbioticinteractions.org/taxon'
    url = f"{apiBase}/{quote(taxon_name, safe='')}/{interaction}"

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None
        json_data = response.json()
    except (requests.RequestException, ValueError) as error:
        # One failed or malformed response should not abort a crawl over
        # thousands of taxa; report it and treat it as "no data".
        print(f'GloBI request failed for {taxon_name!r} ({interaction}): {error}')
        return None

    rows = json_data.get('data') or []
    if len(rows) > 0:
        return rows[0][2]
    return None

            
def fetchiNat(taxon_name, timeout=30):
    """Look up `taxon_name` in the iNaturalist taxa search and return the top hit.

    Parameters
    ----------
    taxon_name : str
        Name to search for. Names starting with 'BOLD:', 'ORN.', 'http',
        'various' or 'secretions' are skipped without a request, as are
        empty or non-string values.
    timeout : float, optional
        Seconds to wait for the server before giving up on this request.

    Returns
    -------
    dict or None
        {'scientific_name', 'taxon_id', 'common_name'} built from the first
        search result ('common_name' is '' when the result has no
        'preferred_common_name'), or None when the name is skipped, the
        request fails, the status is not 200, or there are no results.
    """
    skip_prefixes = ('BOLD:', 'ORN.', 'http', 'various', 'secretions')
    if not isinstance(taxon_name, str) or not taxon_name.strip():
        # An empty query would return unrelated taxa that must not be
        # recorded as a match.
        return None
    if taxon_name.startswith(skip_prefixes):
        return None

    # Pause before every request to stay gentle on the public API.
    time.sleep(0.25)
    url = 'https://api.inaturalist.org/v1/taxa'

    try:
        # Passing the name via `params` lets requests escape characters such
        # as '&' and '#' that would otherwise corrupt the query string.
        response = requests.get(url, params={'q': taxon_name}, timeout=timeout)
        if response.status_code != 200:
            return None
        json_data = response.json()
    except (requests.RequestException, ValueError) as error:
        # Report and skip instead of aborting the whole crawl.
        print(f'iNaturalist request failed for {taxon_name!r}: {error}')
        return None

    results = json_data.get('results') or []
    if len(results) == 0:
        return None

    result = results[0]
    return {
        'scientific_name': result['name'],
        'taxon_id': result['id'],
        'common_name': result.get('preferred_common_name', ''),
    }

            
            
def formatInteractions(df, row, interaction, interactionLimit = 3):
    """Build interaction records for one taxon and one interaction type.

    The target names come from fetchGlobi. Names already present in `df` are
    turned into records directly. The remaining names are looked up with
    fetchiNat, but only while fewer than `interactionLimit` records have been
    collected for this call.

    Parameters
    ----------
    df : pandas.DataFrame
        Taxa table with 'scientific_name', 'common_name' and 'taxon_id'
        columns. It is modified in place: every name sent to fetchiNat is
        added as a new row, with empty common name and id when iNaturalist
        had no match.
    row : mapping
        Subject taxon; read via 'scientific_name', 'common_name', 'taxon_id'.
    interaction : str
        Interaction type passed to fetchGlobi and stored in each record.
    interactionLimit : int, optional
        Record count at which iNaturalist lookups stop. Matches found in
        `df` are always kept, even beyond this number.

    Returns
    -------
    list of dict, or None
        One record per target taxon; None when fetchGlobi returned nothing.
    """
    def _build_interaction(target_name, target_common_name, target_taxon_id):
        # Single place that defines the shape of an interaction record.
        return {
            'subject_taxon_id': row['taxon_id'],
            'subject_common_name': row['common_name'],
            'subject_scientific_name': row['scientific_name'],

            'target_scientific_name': target_name,
            'target_common_name': target_common_name,
            'target_taxon_id': target_taxon_id,
            'interaction': interaction,
        }

    # connect to Globi api
    names = fetchGlobi(row['scientific_name'], interaction)
    if not names:
        return

    interactions = []
    not_found_names = []
    seen_names = set()

    # look for Globi taxon name in the existing taxa dataframe
    for name in names:
        # Skip unusable entries and repeats so a name listed twice does not
        # yield duplicate records, duplicate lookups or duplicate taxa rows.
        if not isinstance(name, str) or name in seen_names:
            continue
        seen_names.add(name)

        search_results = df[df['scientific_name'] == name]
        if len(search_results) > 0:
            # NOTE(review): rows added earlier for names iNaturalist did not
            # know have an empty taxon_id and also match here - confirm that
            # records with an empty target id are wanted.
            interactions.append(_build_interaction(
                name,
                search_results['common_name'].values[0],
                search_results['taxon_id'].values[0],
            ))
        else:
            not_found_names.append(name)

    # if Globi taxon name isn't in taxa dataframe, look up taxon name in iNat api
    for name in not_found_names:
        if len(interactions) >= interactionLimit:
            # Nothing further would be looked up; stop instead of spinning.
            break

        search_results = fetchiNat(name)
        if search_results:
            # The notebook loads its taxa and interaction tables with
            # dtype=str, so store the iNaturalist id as text too.
            taxon_id = str(search_results['taxon_id'])
            common_name = search_results['common_name']
            interactions.append(_build_interaction(name, common_name, taxon_id))
        else:
            # Remember the miss so later calls find the name in `df`
            # instead of querying iNaturalist again.
            taxon_id = ''
            common_name = ''

        # add iNat taxa to taxa dataframe; values are placed by column name
        # so the result does not depend on the frame's column order.
        taxon = {
            'scientific_name': name,
            'common_name': common_name,
            'taxon_id': taxon_id,
        }
        # len(df) + 1 is an unused label as long as the frame started with
        # the default 0..n-1 index.
        df.loc[len(df) + 1] = [taxon.get(column, '') for column in df.columns]

    return interactions

        



In [383]:
# Query every interaction type for every taxon and pool the resulting records.
# formatInteractions also adds newly discovered taxa to taxa_df as it goes.
interaction_types = ['eats', 'eatenBy', 'pollinates', 'pollinatedBy', 'preysOn', 'preyedUponBy']
interaction_data = []
for index, row in taxa_df.iterrows():
    # Lightweight progress indicator: one number per 50 taxa.
    if index % 50 == 0:
        print(index, end=' ')

    for interaction in interaction_types:
        results = formatInteractions(taxa_df, row, interaction)
        if not results:
            continue
        interaction_data.extend(results)
            


In [381]:
# Add the freshly fetched records to the interactions loaded from disk.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat is used instead; ignore_index gives the result one continuous index.
# NOTE(review): interaction_df is rebuilt from itself, so running this cell
# twice appends the new records twice.
new_interactions = pd.DataFrame(interaction_data)
interaction_df = pd.concat([interaction_df, new_interactions], ignore_index=True)
log_df(interaction_df)

(53859, 7)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction
0,143452,Lettered Sphinx,Deidamia inscriptum,Parthenocissus,,50280,eats
1,143452,Lettered Sphinx,Deidamia inscriptum,Vitis,grapevines,60773,eats
2,143452,Lettered Sphinx,Deidamia inscriptum,Ampelopsis brevipedunculata,porcelain berry,457553,eats
3,143452,Lettered Sphinx,Deidamia inscriptum,Vitis vinifera,wine grape,79519,eats
4,143452,Lettered Sphinx,Deidamia inscriptum,Parthenocissus quinquefolia,Virginia creeper,50278,eats


## create interactions csv

In [382]:
# Write the combined interactions back to the CSV they were loaded from, without the index column.
interaction_df.to_csv(path_or_buf=interaction_path, index=False)


## create interactions json

In [393]:
# Columns exported to JSON: the subject's id, the target's names and id, and the interaction type.
cols = [
    'subject_taxon_id',
    'target_scientific_name',
    'target_common_name',
    'target_taxon_id',
    'interaction',
]
interaction_json_df = interaction_df.loc[:, cols]
log_df(interaction_json_df)

In [394]:
# Export the trimmed interactions as a JSON array of row objects.
path = '../inat_data_explorer/src/lib/data/interactions.json'
interaction_json_df.to_json(path_or_buf=path, orient='records')

## create taxa csv

In [379]:
# Show the shape and first rows of the taxa table.
log_df(taxa_df, nrows=5)

(9932, 3)


Unnamed: 0,scientific_name,common_name,taxon_id
0,Deidamia inscriptum,Lettered Sphinx,143452
1,Acer,maples,47727
2,Plantago lanceolata,ribwort plantain,53178
3,Poa pratensis,Kentucky bluegrass,60307
4,Magnoliopsida,dicots,47124


In [380]:
# Write the taxa table back to the CSV it was loaded from, without the index column.
taxa_df.to_csv(path_or_buf=taxa_path, index=False)
