In [1]:
import pandas as pd
from pathlib import Path
import requests
import json
import time
import numpy as np

from globi import formatInteractions, fetchGlobi, fetchiNat, add_inat_taxa_data


In [2]:
def log_df(df, nrows=5):
    """Print the shape of ``df`` and return its first ``nrows`` rows.

    Args:
        df: Object exposing ``.shape`` and ``.head()`` (a pandas DataFrame here).
        nrows: Number of leading rows to return; defaults to 5.

    Returns:
        The result of ``df.head(nrows)``.
    """
    dimensions = df.shape
    print(dimensions)
    preview = df.head(nrows)
    return preview

In [3]:
def print_json(obj):
    """Print ``obj`` serialized as JSON with four-space indentation.

    Args:
        obj: Any value accepted by ``json.dumps``.
    """
    rendered = json.dumps(obj, indent=4)
    print(rendered)

In [4]:
# Locate every taxa_list.csv below ../data. Path.rglob yields matches in no
# guaranteed order, so sort the resulting strings to keep the listing stable
# between runs and machines.
raw_data_paths = Path('../data').rglob('taxa_list.csv')
data_paths = sorted(str(path) for path in raw_data_paths)
print(data_paths)

# CSV files read and written by the cells below (relative paths, so they are
# resolved against the notebook's working directory).
interaction_path = 'outputs/interactions.csv'
de_taxa_path = 'outputs/de_taxa_list.csv'
globi_taxa_path = 'outputs/interactions_taxa_list.csv'

[]


# interactions

## create interactions taxa csv

In [28]:
# Read the taxa list with every value as text, keeping only the name and id columns.
cols = ['scientific_name', 'common_name', 'taxon_id']
taxa_df = pd.read_csv(
    de_taxa_path,
    usecols=cols,
    dtype=str,
)
log_df(taxa_df)


(7315, 3)


Unnamed: 0,taxon_id,common_name,scientific_name
0,1,Animals,Animalia
1,47534,Cnidarians,Cnidaria
2,48921,Hydrozoans,Hydrozoa
3,152823,Siphonophores,Siphonophorae
4,117304,Man O' Wars,Physaliidae


In [20]:
# Collect the distinct subject taxa found in the interactions file and give
# their columns the same names the taxa list uses.
cols = ['subject_taxon_id', 'subject_common_name', 'subject_scientific_name']
interactions_df = (
    pd.read_csv(interaction_path, usecols=cols, dtype=str)
    .drop_duplicates()
    .rename(columns={
        'subject_taxon_id': 'taxon_id',
        'subject_common_name': 'common_name',
        'subject_scientific_name': 'scientific_name',
    })
)

# Constant marker columns: every row here came from the interactions file as a subject.
interactions_df['has_globi'] = True
interactions_df['type'] = 'subject'

log_df(interactions_df)

(7510, 5)


Unnamed: 0,taxon_id,common_name,scientific_name,has_globi,type
0,143452,Lettered Sphinx,Deidamia inscriptum,True,subject
10,47727,maples,Acer,True,subject
18,53178,ribwort plantain,Plantago lanceolata,True,subject
26,60307,Kentucky bluegrass,Poa pratensis,True,subject
47,47124,dicots,Magnoliopsida,True,subject


In [29]:
# Left-join so every taxa_df row is kept; has_globi/type stay NaN for taxa with
# no matching subject row. The keys are the three columns both frames share,
# which is what merge would pick by default when no keys are given.
updated_globi_taxa = pd.merge(
    taxa_df,
    interactions_df,
    how="left",
    on=['taxon_id', 'common_name', 'scientific_name'],
)
log_df(updated_globi_taxa)

(7315, 3)


Unnamed: 0,taxon_id,common_name,scientific_name
0,1,Animals,Animalia
1,47534,Cnidarians,Cnidaria
2,48921,Hydrozoans,Hydrozoa
3,152823,Siphonophores,Siphonophorae
4,117304,Man O' Wars,Physaliidae


In [31]:
# Save the merged taxa list to globi_taxa_path; index=False leaves the row index out of the file.
updated_globi_taxa.to_csv(globi_taxa_path, index=False)

## create interactions

In [27]:
# Load the taxa list again (same columns, all as text); taxa_df is passed to
# formatInteractions in the fetch loop further down.
cols = ['scientific_name', 'common_name', 'taxon_id']
taxa_df = pd.read_csv(
    de_taxa_path,
    usecols=cols,
    dtype=str,
)
log_df(taxa_df)

(7315, 3)


Unnamed: 0,taxon_id,common_name,scientific_name
0,1,Animals,Animalia
1,47534,Cnidarians,Cnidaria
2,48921,Hydrozoans,Hydrozoa
3,152823,Siphonophores,Siphonophorae
4,117304,Man O' Wars,Physaliidae


In [38]:
# Keep only the taxa whose has_globi cell is empty, i.e. the ones that did not
# pick up a marker from the interactions merge.
globi_taxa_df = (
    pd.read_csv(globi_taxa_path, dtype=str)
    .loc[lambda frame: frame['has_globi'].isna()]
)
log_df(globi_taxa_df)

(7315, 4)


Unnamed: 0,taxon_id,common_name,scientific_name,has_globi
0,1,Animals,Animalia,
1,47534,Cnidarians,Cnidaria,
2,48921,Hydrozoans,Hydrozoa,
3,152823,Siphonophores,Siphonophorae,
4,117304,Man O' Wars,Physaliidae,


In [39]:
# Same location as the interaction_path assigned in the paths cell near the top.
interaction_path = 'outputs/interactions.csv'
# Read the existing interactions with every column as text; newly fetched rows
# are added to this frame in a later cell.
interaction_df = pd.read_csv(interaction_path, dtype=str)
log_df(interaction_df)

(0, 7)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction


In [40]:
# Interaction types requested from formatInteractions for every taxon.
interaction_types = ['eats', 'eatenBy', 'pollinates', 'pollinatedBy', 'preysOn', 'preyedUponBy']

interaction_data = []
for count, (row_label, row) in enumerate(globi_taxa_df.iterrows()):
    # Progress marker every 50 taxa, all on one output line.
    if count % 50 == 0:
        print(count, end=' ')

    for interaction in interaction_types:
        # NOTE(review): 100 is forwarded as-is; presumably a per-request result
        # limit — confirm against formatInteractions in globi.py.
        results = formatInteractions(taxa_df, row, interaction, 100)
        if results:
            interaction_data.extend(results)
            


0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 1750 1800 1850 1900 1950 2000 2050 2100 2150 2200 2250 2300 2350 2400 2450 2500 2550 2600 2650 2700 2750 2800 2850 2900 2950 3000 3050 3100 3150 3200 3250 3300 3350 3400 3450 3500 3550 3600 3650 3700 3750 3800 3850 3900 3950 4000 4050 4100 4150 4200 4250 4300 4350 4400 4450 4500 4550 4600 4650 4700 4750 4800 4850 4900 4950 5000 5050 5100 5150 5200 5250 5300 5350 5400 5450 5500 5550 5600 5650 5700 5750 5800 5850 5900 5950 6000 6050 6100 6150 6200 6250 6300 6350 6400 6450 6500 6550 6600 6650 6700 6750 6800 6850 6900 6950 7000 7050 7100 7150 7200 7250 7300 

In [41]:
# Turn the records collected by the fetch loop into a frame and add them to the
# rows already loaded from the interactions CSV.
new_interactions = pd.DataFrame(interaction_data)
# DataFrame.append was removed in pandas 2.0, so combine with pd.concat.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating row labels from both inputs.
interaction_df = pd.concat([interaction_df, new_interactions], ignore_index=True)
log_df(interaction_df)

(165994, 7)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction
0,1,Animals,Animalia,Insecta,Insects,47158,eats
1,1,Animals,Animalia,Bacteria,bacteria,67333,eats
2,1,Animals,Animalia,Bacillariophyceae,Diatoms,123880,eats
3,1,Animals,Animalia,Animalia,Animals,1,eats
4,1,Animals,Animalia,Mysidae,,85912,eats


create interactions csv

In [42]:
# Overwrite the interactions CSV with the combined frame; index=False leaves the row index out.
interaction_df.to_csv(interaction_path, index=False)


### add inat common name and taxon id to Globi scientific names

In [36]:
# Reload the interactions CSV from disk with every column read as text.
interaction_df = pd.read_csv(interaction_path, dtype=str)
log_df(interaction_df)

(165994, 8)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction,search_inat_taxa
0,1,Animals,Animalia,Insecta,Insects,47158,eats,
1,1,Animals,Animalia,Bacteria,bacteria,67333,eats,
2,1,Animals,Animalia,Bacillariophyceae,Diatoms,123880,eats,
3,1,Animals,Animalia,Animalia,Animals,1,eats,
4,1,Animals,Animalia,Mysidae,,85912,eats,


In [37]:
# Select the interactions that still need an iNaturalist lookup: the target has
# a scientific name but no taxon id, and the name has not been searched before.
missing_taxon_id = interaction_df['target_taxon_id'].isna()
has_target_name = interaction_df['target_scientific_name'].notna()

# The 'search_inat_taxa' column is only created by the lookup loop, so a CSV
# saved before that loop ever ran does not contain it; treat that as
# "nothing searched yet" instead of failing with a KeyError.
if 'search_inat_taxa' in interaction_df.columns:
    # The lookup loop stores a boolean True in memory, while a value re-read
    # from CSV with dtype=str is the text 'True'; comparing the string form
    # recognises both.
    already_searched = interaction_df['search_inat_taxa'].astype(str) == 'True'
else:
    already_searched = pd.Series(False, index=interaction_df.index)

blank_df = interaction_df[missing_taxon_id & ~already_searched & has_target_name].copy()
log_df(blank_df)

(17761, 8)


Unnamed: 0,subject_taxon_id,subject_common_name,subject_scientific_name,target_scientific_name,target_common_name,target_taxon_id,interaction,search_inat_taxa
88776,21359,True Toads,Bufonidae,Euspilotus,,,eatenBy,
88780,21359,True Toads,Bufonidae,Thamnophis sirtalis fitchi,,,eatenBy,
88791,21359,True Toads,Bufonidae,Causus rhombeatus,,,preyedUponBy,
88793,21359,True Toads,Bufonidae,Causus maculatus,,,preyedUponBy,
88797,21359,True Toads,Bufonidae,Causus resimus,,,preyedUponBy,


In [38]:
# Distinct target names to look up, in order of first appearance; the cell shows how many there are.
names = pd.unique(blank_df['target_scientific_name'])
len(names)

12063

In [None]:
for position, name in enumerate(names):
    # Progress marker every 100 names, all on one output line.
    if position % 100 == 0:
        print(position, end=' ')

    results = fetchiNat(name)

    # Flag every row with this target name as searched, whether or not
    # fetchiNat returned anything.
    rows_for_name = interaction_df['target_scientific_name'] == name
    interaction_df.loc[rows_for_name, 'search_inat_taxa'] = True

    if results:
        add_inat_taxa_data(interaction_df, results)
        
    

0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 

In [None]:
# Save the frame, including the search_inat_taxa flags set by the lookup loop, back to the interactions CSV.
interaction_df.to_csv(interaction_path, index=False)


create interactions csv for the app

In [10]:

# Write a second copy of the interactions frame as CSV under ../app/src/lib/data, without the row index.
path = '../app/src/lib/data/interactions.csv'
interaction_df.to_csv(path, index = False)