# Midterm — iNaturalist
### Daniel Phillips, Chris Howard, Phillip Johnson, Jacob Smith, Michael Reid

In [1]:
import requests
import json
import pandas as pd
from pandas.io.json import json_normalize
from IPython.display import display

def get_taxa_id(species_name):
    '''
    This function returns the taxon_id when given the species name.
    
    Parameters:
    species_name: a string object representing a species name, e.g. "Danaus plexippus"
    
    Returns: ids, a list object containing integer id's for the species
    '''
    
    base_url = "http://api.inaturalist.org/v1/taxa/autocomplete?q="

    
    request = requests.get(base_url + "%20".join(species_name.split()))
    data = request.json()

    ids = []
    for i in data['results']:
        ids.append(i['id'])

    return ids


def get_observation(id_no, month, year):
    '''
    This function returns observation data when given taxon_id, month, and year.
    
    Parameters: 
    id_no: an integer representing species taxon_id
    month: an integer (1-12) representing the month of interest
    year: an integer representing year of interest
    
    Returns: observational data for taxon_id for specified month and year.
    '''
    #Url builder, for the request
    base_url = "http://api.inaturalist.org/v1/observations?"
    end_url = "&order=desc&order_by=created_at"
    url = base_url + 'taxon_id=' + str(id_no) + '&month=' + str(month) + '&year=' + str(year) + end_url

    request = requests.get(url)
    data = request.json()    

    return data
    
        
def get_count_one_month(id_no_lst, month, year):
    '''
    This function counts the number of observations of a taxon_id, for each month of a given year.
    
    Parameters:
    id_no_lst: a Python list object containing IDs 
    month: an integer object (1-12) representing the month you want the count for
    year: an integer object for the year of interest
    
    Returns: count, an integer of how many observations are given for some id, for some month of a given year.
    '''
    count = 0
    for i in id_no_lst:
        count += int(get_observation(i, month, year)['total_results']) #total_results key associates w/ ea. set of obs data
    return count
    
    

species = [
    'Danaus plexippus',
    'Hyles lineata',
    'Zerene cesonia',
    'Papilio multicaudata',
    'Agraulis vanillae',
    'Papilio cresphontes',
    'Strymon melinus',
    'Vanessa cardui',
    'Hylephila phyleus',
    'Danaus gilippus'
]

months = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December'
]

def main():
    print('running')
    species_to_id = {}
    frames = []
    
    
    #Get a dictionary of the taxa -> lst(ids)
    for i in species:
        species_to_id[i] = get_taxa_id(i)
    
    #Map integers 1-12 to 'January' through 'December'
    month_map = dict(zip(range(1,13), months))
    
    species_dict_out = {}
    year = 2016
    
    print(species_to_id)
    
    #Create a dictionary for each species
    for spec in species_to_id:
        species_dict_out[spec] = {}
        
        #Map each species' months to their corresponding count of species observations for that month (and year)
        for mon in month_map:
            species_dict_out[spec][month_map[mon]] = get_count_one_month(species_to_id[spec], mon, year)
        
        print(species_dict_out[spec])
        
        #Creates list of observed IDs for each species, e.g. {'Danaus plexippus': [48662, 235550]}
        frames.append(species_dict_out[spec])
            
        
    #Makes the JSON Species->ID_List structures 
    result = json_normalize(frames)
    display(result)
    
    
%time main()
    

running
{'Agraulis vanillae': [49150, 312743, 461059, 416323, 208039, 234216], 'Strymon melinus': [50931, 346149, 238732, 238733, 542113, 238734, 238735], 'Papilio cresphontes': [50072], 'Papilio multicaudata': [68263, 237647, 148765, 460141, 545972], 'Hyles lineata': [49348, 293806], 'Danaus plexippus': [48662, 235550], 'Danaus gilippus': [51743, 51748, 50063, 51744, 51746, 51747, 51749], 'Vanessa cardui': [48548], 'Zerene cesonia': [129362, 239037, 532836], 'Hylephila phyleus': [50340, 312192]}
{'March': 44, 'October': 707, 'June': 104, 'May': 92, 'July': 207, 'February': 33, 'August': 245, 'September': 352, 'April': 86, 'December': 79, 'November': 395, 'January': 31}
{'March': 16, 'October': 242, 'June': 100, 'May': 99, 'July': 123, 'February': 29, 'August': 163, 'September': 220, 'April': 128, 'December': 2, 'November': 29, 'January': 4}
{'March': 10, 'October': 76, 'June': 40, 'May': 19, 'July': 54, 'February': 5, 'August': 104, 'September': 73, 'April': 9, 'December': 3, 'Novembe

Unnamed: 0,April,August,December,February,January,July,June,March,May,November,October,September
0,86,245,79,33,31,207,104,44,92,395,707,352
1,128,163,2,29,4,123,100,16,99,29,242,220
2,9,104,3,5,1,54,40,10,19,12,76,73
3,15,41,4,2,0,33,40,7,16,7,24,37
4,59,70,1,7,1,67,26,73,43,5,16,50
5,306,745,288,180,105,418,226,283,219,618,1270,988
6,23,131,86,12,19,93,58,16,37,251,373,212
7,61,88,38,24,12,123,98,32,47,164,105,76
8,7,15,21,1,3,9,5,1,5,110,91,57
9,72,224,17,12,6,149,116,43,65,131,280,292


Wall time: 4min 11s


In [2]:
taxon_dict={}
def get_organized(file):
	file=open(file, 'r')
	for line in file:
		line=line.split()
		taxon_dict[line[0]]=' '.join(line[1:])
		print(taxon_dict[line[0]])
	return taxon_dict
		
hello = get_organized('taxon_list.txt')
print(taxon_dict)

scientificName
Polites origenes
Epargyreus clarus
Poanes zabulon
Euphyes dion
Pyrgus scriptura
Thorybes pylades
Erynnis juvenalis
Polites mystic
Ancyloxypha numitor
Poanes hobomok
Thymelicus lineola
Erynnis baptisiae
Polites themistocles
Eantis tamenund
Thorybes drusius
Antigonus emorsa
Chioides albofasciatus
Urbanus proteus
Poanes viator
Urbanus dorantes
Pyrgus centaureae
Pyrgus albescens
Wallengrenia egeremet
Euphyes vestris
Panoquina ocola
Hylephila phyleus
Pyrgus communis
Achlyodes pallida
Thorybes confusis
Lerema accius
Pyrgus oileus
Hesperia columbia
Erynnis horatius
Amblyscirtes celia
Hesperia colorado
Thorybes bathyllus
Autochton pseudocellus
Heliopetes macaira
Heliopetes laviana
Poanes melane
Polites peckius
Gesta invisus
Amblyscirtes fimbriata
Erynnis zarucco
Lerodea eufala
Phocides pigmalion
Pholisora catullus
Ochlodes agricola
Atalopedes campestris
Wallengrenia otho
Polites sabuleti
Erynnis tristis
Erynnis funeralis
Erynnis icelus
Amblyscirtes hegon
Carterocephalus palaemon

## Running the R Scripts through Bash

The below function will find (and append to a list) every file in the data directory. This will be useful when iterating over each taxonomy ID and running them through the SDM.

In [15]:
import sys, os
files = os.listdir("C:\\Users\\Chris\\Desktop\\ebutterfly-sdm\\scripts\\data\\inaturalist")

print(files)
for_bash = " ".join([str(file) for file in files])
print(for_bash)

['20000jan-iNaturalist.txt', '50931-iNaturalist.txt']
20000jan-iNaturalist.txt 50931-iNaturalist.txt


In [65]:
%%bash
start=$(date +%s.%N)

ids=("47226")
months=("01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12" "all")
for id in "${ids[@]}"; do
    for month in "${months[@]}"; do
        Rscript --vanilla run-sdm.R data/inaturalist/$id-$month-iNaturalist.txt $id-$month output/
    done
done
end=$(date +%s.%N)    
runtime=$(python -c "print(${end} - ${start})")

echo "Runtime was $runtime"



null device 
          1 
class       : RasterLayer 
dimensions  : 168, 144, 24192  (nrow, ncol, ncell)
resolution  : 0.04166667, 0.04166667  (x, y)
extent      : -123, -117, 32, 39  (xmin, xmax, ymin, ymax)
coord. ref. : +proj=longlat +datum=WGS84 
data source : /mnt/c/Users/Chris/Desktop/ebutterfly-sdm/scripts/output/47226-02-prediction.grd 
names       : layer 
values      : 0, 0.5454545  (min, max)

class       : RasterLayer 
dimensions  : 168, 144, 24192  (nrow, ncol, ncell)
resolution  : 0.04166667, 0.04166667  (x, y)
extent      : -123, -117, 32, 39  (xmin, xmax, ymin, ymax)
coord. ref. : +proj=longlat +datum=WGS84 
data source : /mnt/c/Users/Chris/Desktop/ebutterfly-sdm/scripts/output/47226-02-prediction-threshold.grd 
names       : layer 
values      : 0, 1  (min, max)

Finished with file writing.
null device 
          1 
class       : RasterLayer 
dimensions  : 192, 168, 32256  (nrow, ncol, ncell)
resolution  : 0.04166667, 0.04166667  (x, y)
extent      : -123, -116, 32, 40 

In max(obs.data$lat) : no non-missing arguments to max; returning -Inf
In min(obs.data$lat) : no non-missing arguments to min; returning Inf
In max(obs.data$lon) : no non-missing arguments to max; returning -Inf
In min(obs.data$lon) : no non-missing arguments to min; returning Inf
Error in validityMethod(object) : invalid extent: xmin >= xmax
Calls: crop ... .local -> <Anonymous> -> anyStrings -> validityMethod
Execution halted
In cor(x, y) : the standard deviation is zero
Error in singlefold(obs, k) : insufficient records:3, with k=5
Calls: kfold -> singlefold
Execution halted
In max(obs.data$lat) : no non-missing arguments to max; returning -Inf
In min(obs.data$lat) : no non-missing arguments to min; returning Inf
In max(obs.data$lon) : no non-missing arguments to max; returning -Inf
In min(obs.data$lon) : no non-missing arguments to min; returning Inf
Error in validityMethod(object) : invalid extent: xmin >= xmax
Calls: crop ... .local -> <Anonymous> -> anyStrings -> validityMethod


### a note about this bash script

The above script will work if the notebook is sitting in the same directory as the R scripts (like run-sdm.R). It iterates through each taxon ID in the ids array and runs the model on its corresponding text file (the CSV returned from Jeff's get-observations script). When we have our CSVs written, we can use this on them. We just have to think about how we want to structure the directory.