In [1]:
!pip install opentree



In [2]:
from opentree import OT
import requests
import pprint
import pandas as pd

Get the list of taxa from a textfile.  

In [3]:
fi = open("/Users/emily/Downloads/ObservedPlantSpecies.txt").readlines()

In [4]:
fi[0].split(';')

['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species\n']

In [5]:
fi[0].split(';')[5]

'genus'

In [6]:
all_genera = set()
for taxon in fi:
    try:
        genus = taxon.strip().split(';')[6]
        all_genera.add(genus)
    except:
        pass
        

In [7]:
len(all_genera)

232

In [8]:
OT.get_ottid_from_name('Stappia')


351680

In [9]:
matches = dict()
ottid_to_genus = dict()
for genus in all_genera:
    if len(genus) > 2:
        try:
            ott_id = OT.get_ottid_from_name(genus)
            ottid_to_genus['ott{}'.format(ott_id)]=genus
            matches[genus] = ott_id
        except:
            print("Failed to get an ottid for {}".format(genus))

Failed to get an ottid for Symphotrichum lanceolatum
Failed to get an ottid for species


In [10]:
output = OT.synth_induced_tree(ott_ids=list(matches.values()),  label_format='name_and_id')

In [11]:
#Create a dictionary that matches the taxa that are returned with the input taxa requested.
relabel = dict()
broken = output.response_dict['broken']
for taxon in broken:
    remap = broken[taxon]
    if remap.startswith('mrca'):
        if remap not in relabel:
            relabel[remap] = []
        relabel[remap].append("{} {}".format(ottid_to_genus[taxon], taxon))
    if remap.startswith('ott'):
        if remap not in relabel:
            relabel[remap] = []
        relabel[remap].append("{} {}".format(ottid_to_genus[taxon], taxon))

In [12]:
import copy
backuptree = copy.deepcopy(output.tree)

In [13]:
#Gos through each taxon in teh tree that is returned, and matches it to the rquested genus or genera, and adds those to the label
for taxon in backuptree.taxon_namespace:
    if taxon.label.startswith('mrca'):
        taxon.label = 'MRCA of taxa in '+' '.join(relabel[taxon.label])
    else:
        ott = taxon.label.split()[-1]
        if ott in relabel:
            added_taxa = ' MRCA of taxa in '+' '.join(relabel[ott])
            taxon.label = taxon.label + added_taxa

In [14]:
# Some characters in names mess up newick tree readers
def remove_problem_characters(instr, prob_char = "():#", replace_w = '?'):
    problem_characters = set(prob_char)
    for char in problem_characters:
        instr = instr.replace(char,replace_w)
    return instr

In [15]:
#Some of these taxon names are still causing issues...
for taxon in backuptree.taxon_namespace:
    taxon.label = remove_problem_characters(taxon.label)

for node in backuptree:
    if node.label:
        node.label = remove_problem_characters(node.label)

# Write the tree out to a file in Newick format
treefile = 'bigbio_clean.tre'
backuptree.write(path = treefile, schema = 'newick')

# Tree annotations
We can display additional data on the tree on ITOL by adding an annotation file.
This will map each node label to data about it.

We can get data about a lot of species using the Encyclopedia of Life API.

In [16]:
# Most labels are just the taxon and OTT ID, but some end up with weird other stuff
# Keep track of the actual species name that each label is for
name_to_label = {}
for taxon in backuptree.taxon_namespace:
    name_to_label[ottid_to_genus[taxon.label.split()[-1]]] = taxon.label

In [17]:
for taxon in backuptree.taxon_namespace:
    print(taxon.label)

Oxalis stricta ott237293
Euphorbia corollata ott827132
Euphorbia geyeri ott770977
Euphorbia glyptosperma ott145990
Euphorbia maculata ott683660
Populus tremuloides ott8858
Salix discolor ott164187
Viola pedata ott740286
Viola sagittata ott265431
Viola pallens ott697882
Viola pedatifida ott3915440
Urtica dioica ott267705
Ulmus americana ott706094
Ceanothus americanus ott559254
Potentilla argentea ott355267
Potentilla recta ott768212
Potentilla norvegica ott902633
Drymocallis arguta ott328274
Fragaria virginiana ott1004791
Fragaria vesca ott852873
Potentilla simplex ott396171
Rosa arkansana ott252379
Rubus occidentalis ott670811
Rubus idaeus ott156929
Rubus flagellaris ott896744
Spiraea alba ott774987
Prunus pumila ott180918
Prunus pensylvanica ott180915
Corylus americana ott731679
Alnus incana ott791133
Quercus macrocarpa ott37377
Quercus ellipsoidalis ott775280
Quercus rubra ott791115
Astragalus canadensis ott314154
Trifolium repens ott116218
Trifolium hybridum ott1066895
Trifolium pra

## API requests
Here are some helper functions that we will use to make different types of requests to the Encyclopedia of Life API. Queries must be made using Cypher Query Language.

In [60]:

# Put your EOL API token here
token = "eyJhbGciOiJIUzI1NiJ9.eyJ1c2VyIjoiZXNvdGhAdWNzYy5lZHUiLCJlbmNyeXB0ZWRfcGFzc3dvcmQiOiIkMmEkMTEkOEcxN0kwN3BaakxTcFlGV3RMMnZzZUx3YU1PQWVDRnphbVJWSmppUDRhTnRlaFQ4NkhZaUMifQ.zAOKzbTDVJjoD5uARaW0hN0OLDfzQ2FuKyy33kyWFOQ"

def api_request(query):
    """
    Make a GET request to the EOL API.
    
    Args:
        query (string): a Cypher Query Language query to the EOL database
    Returns:
        dictionary of the JSON response from the API
    """

    url = 'https://eol.org/service/cypher'
    headers = {
        "accept": "application/json",
        "authorization": "JWT " + token
    }
    params = {
        'query': query,
        'format': 'cypher'
    }  

    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        print('Error: ', response.status_code)

#     pp = pprint.PrettyPrinter()
#     pp.pprint(response.json())

    return response.json()

def get_available_traits(species):
    query = 'MATCH (p:Page)-->(t:Trait)-[:predicate]->(q:Term) \
            WHERE p.canonical = "{}" \
            RETURN DISTINCT q.name AS trait \
            LIMIT 1000'.format(species)

    return api_request(query)

def get_species_relationships(species):
    """Get all traits that are a relationship with another species"""
    query = 'MATCH (p:Page)-->(t:Trait)-->(q:Term) \
            WHERE p.canonical = "{}" \
            WITH p, q, t.object_page_id AS pageID \
            MATCH (s:Page) \
            WHERE s.page_id = pageID \
            RETURN DISTINCT p.canonical AS species, q.name AS action, s.canonical AS target \
            LIMIT 100'.format(species)

    return api_request(query)

def get_species_association_trait(species, trait):
    query = 'MATCH (p:Page)-->(t:Trait)-->(q:Term) \
            WHERE p.canonical = "{}" AND q.name = "{}" \
            WITH p, q, t.object_page_id AS pageID \
            MATCH (s:Page) \
            WHERE s.page_id = pageID \
            RETURN DISTINCT p.canonical, q.name, s.canonical \
            LIMIT 100'.format(species, trait)

    return api_request(query)

def get_species_attributes(species):
    query = 'MATCH (p:Page)-->(t:Trait)-[:predicate]->(r:Term), \
            (t)-[:object_term]->(q:Term) \
            WHERE p.canonical = "{}" \
            RETURN DISTINCT p.canonical, r.name, q.name \
            LIMIT 100'.format(species)

    return api_request(query)

def get_species_list_attribute_trait(species_list, trait):
    query = 'MATCH (p:Page)-->(t:Trait)-[:predicate]->(r:Term), \
            (t)-[:object_term]->(q:Term) \
            WHERE p.canonical IN {} AND r.name = "{}" \
            RETURN DISTINCT p.canonical AS species, q.name AS value \
            LIMIT 100'.format(species_list, trait)

    return api_request(query)


def get_species_numeric_trait(species, trait):
    query = 'MATCH (t:Trait)<-[:trait]-(p:Page), \
           (t)-[:predicate]->(pred:Term)-[:parent_term|:synonym_of*0..]->(parent:Term) \
           WHERE p.canonical = "{}" AND parent.name = "{}" \
           OPTIONAL MATCH (t)-[:units_term]->(units:Term) \
           RETURN p.canonical, pred.name, t.measurement, units.name, t.source \
           LIMIT 100'.format(species, trait)
    return api_request(query)



species_list = list(name_to_label.keys())
# species_list = ['Acer rubrum']
trait = 'habitat'

data = []
binary_data = []
unique_values = set()


In [61]:
trait_counts = {}
for species in species_list:
    traits = get_available_traits(species)['data']
    for trait, in traits:
        if trait in trait_counts:
            trait_counts[trait] += 1
        else:
            trait_counts[trait] = 1

print(trait_counts)

plant height
stem diameter
Leaf mass per area
leaf area
seed mass
ratio
wood density
Leaf nitrogen per dry mass
leaf sheddability
geographic distribution includes
plant growth form
protein potential
graze animal palatability
browse animal palatability
fuelwood suitability
vegetative spread rate
grain type
seedling survival
seeds per pound
seed spread rate
plant propagation method
seed period end
fruit/seed persistence
fruit/seed abundance
seed period begin
bloom period
commercial availability
low temperature tolerance
shade tolerance
salt tolerance
soil depth
precipitation tolerance
planting density
soil pH
life cycle habit
moisture use
hedge tolerance
frost free days
fire tolerance
primary macronutrient requirements
drought tolerance
germination requirements
calcareous soil tolerance
anaerobic soil tolerance
soil requirements
human/livestock toxicity
shape
resprout ability after clipping
life span
grass growth type
shedability
allelopathic effect
growth rate
primary growth form
hortic

If there are a lot of species, they won't all fit in one API request.
Split the list into chunks of no more than 100 species at a time, and combine the results.

In [19]:
results = []
while species_list:
    print(len(species_list))
    index = min(100, len(species_list))
    results += get_species_list_attribute_trait(
        species_list[:index], 
        trait)['data']
    species_list = species_list[index:]

1
{'columns': ['species', 'value'],
 'data': [['Acer rubrum', 'peatland'],
          ['Acer rubrum', 'woodland canopy'],
          ['Acer rubrum', 'swamp'],
          ['Acer rubrum', 'upland soil'],
          ['Acer rubrum', 'deciduous forest'],
          ['Acer rubrum', 'forest'],
          ['Acer rubrum', 'limestone'],
          ['Acer rubrum', 'mountain'],
          ['Acer rubrum', 'lake'],
          ['Acer rubrum', 'histosol'],
          ['Acer rubrum', 'coniferous forest'],
          ['Acer rubrum', 'cliff'],
          ['Acer rubrum', 'dune'],
          ['Acer rubrum', 'plateau'],
          ['Acer rubrum', 'woodland'],
          ['Acer rubrum', 'flood plain'],
          ['Acer rubrum', 'farm'],
          ['Acer rubrum', 'coastal'],
          ['Acer rubrum', 'hammock'],
          ['Acer rubrum', 'sediment'],
          ['Acer rubrum', 'river'],
          ['Acer rubrum', 'river valley'],
          ['Acer rubrum', 'lake shore'],
          ['Acer rubrum', 'dry soil'],
          ['Acer 

Make a list of all possible trait values (the set of all values returned by the API).
This will be the header of the CSV output.

In [20]:
# Make a list of the unique trait values, sorted alphabetically
unique_values = sorted(list(set([row[1] for row in results])))

Now we need to reformat the data. We want to map each species to all the trait values it has.

Here we create a dictionary that maps `{'species name': {set of its trait values}}`.

In [21]:
data_dict = {}
for row in results:
    species = row[0]
    value = row[1]
    if species in data_dict:
        data_dict[species].add(value)
    else:
        data_dict[species] = set({value})

{'Acer rubrum': {'cliff', 'peatland', 'delta', 'coniferous forest', 'woodland canopy', 'mountain', 'canal', 'limestone', 'mixed forest', 'flood plain', 'river valley', 'plateau', 'prairie', 'hammock', 'loam', 'reservoir', 'lake shore', 'swamp', 'deciduous forest', 'dune', 'farm', 'histosol', 'savanna', 'sediment', 'terrestrial', 'forest', 'marsh', 'dry soil', 'temperate', 'river', 'upland soil', 'woodland', 'coastal', 'lake'}}


Finally, to output the data in a useful CSV format, we turn it into a binary table.

The rows are species names and the columns are trait values.

Each cell is filled with a 1 if the species has that trait value, 0 if not.

The output of this cell will be the data in CSV format. You can copy and paste this into a file.

In [22]:
binary_data = []
for species, values in data_dict.items():
    binary_row = [species]
    for value in unique_values:
        if value in values:
            binary_row.append('1')
        else:
            binary_row.append('0')
    binary_data.append(binary_row)
    
# Print out the data in CSV format
print(','.join(unique_values))
for row in binary_data:
    print(','.join(row))

canal, cliff, coastal, coniferous forest, deciduous forest, delta, dry soil, dune, farm, flood plain, forest, hammock, histosol, lake, lake shore, limestone, loam, marsh, mixed forest, mountain, peatland, plateau, prairie, reservoir, river, river valley, savanna, sediment, swamp, temperate, terrestrial, upland soil, woodland, woodland canopy
Acer rubrum, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1


In [None]:
# Replace spaces with underscores to get the tree node ID
for row in binary_data:
    species = row[0]
    species_label = name_to_label[species]
    species_label = species_label.replace(' ', '_')
    print(species_label + ',' + ','.join(row[1:]))