In [1]:
from elasticsearch import Elasticsearch
import pandas as pd
import numpy as np

In [2]:
# Client used by every lookup below (they all query the "taxonnames" index).
# Node sniffing is switched off; requests that time out are retried
# (max_retries=10, timeout=10).
es = Elasticsearch(
    hosts=[
        "c18node2.acis.ufl.edu",
        "c18node6.acis.ufl.edu",
        "c18node10.acis.ufl.edu",
        "c18node12.acis.ufl.edu",
        "c18node14.acis.ufl.edu",
    ],
    sniff_on_start=False,
    sniff_on_connection_fail=False,
    retry_on_timeout=True,
    max_retries=10,
    timeout=10,
)

In [3]:
# Rank field names handled by this notebook, ordered from the most general
# rank to the most specific one.  The position in this list matters: a group
# key of length n is resolved at rank higher_taxon_ranks[n - 1].
higher_taxon_ranks = [
    "dwc:kingdom",
    "dwc:phylum",
    "dwc:class",
    "dwc:order",
    "dwc:family",
]

In [4]:
def rank_and_index(t):
    """Return ``(rank, key)`` for a row tuple that exposes an ``Index`` field.

    ``t.Index`` is normalised to a tuple (a scalar becomes a 1-tuple).  The
    rank is the entry of ``higher_taxon_ranks`` whose position matches the
    length of that tuple, so a 1-element key maps to the first rank, a
    2-element key to the second, and so on.
    """
    key = t.Index if isinstance(t.Index, tuple) else (t.Index,)
    return (higher_taxon_ranks[len(key) - 1], key)

In [5]:
def es_lookup(rank, idx):
    """Resolve the last name in ``idx`` against the "taxonnames" index.

    Parameters
    ----------
    rank : str
        Rank field being resolved, e.g. ``"dwc:family"``.  The part after the
        colon is matched against ``dwc:taxonRank``.
    idx : tuple of str
        Names ordered like ``higher_taxon_ranks``, ending at ``rank``.
        ``idx[-1]`` is the name to look up (fuzzy match); every earlier,
        non-empty entry is added as an exact ``term`` filter on its own rank.

    Returns
    -------
    tuple
        ``(name, taxon_id)`` where ``name`` is the lower-cased
        ``gbif:canonicalName`` (or ``dwc:scientificName`` when no canonical
        name is present) and ``taxon_id`` is the record's ``dwc:taxonID``.
        ``("", np.nan)`` is returned when ``idx[-1]`` is empty or nothing
        matches.

    Notes
    -----
    When the best hit carries ``dwc:acceptedNameUsageID`` the accepted record
    is fetched and used instead.  If that record cannot be found, the
    originally matched record is used rather than failing on an empty result.
    """
    unresolved = ("", np.nan)
    if idx[-1] == "":
        return unresolved

    must = [
        {
            "match": {
                rank: {
                    "query": idx[-1],
                    "fuzziness": "AUTO"
                }
            }
        },
        {
            "term": {
                "dwc:taxonRank": rank.split(":")[-1]
            }
        }
    ]
    # Known higher ranks narrow the search; blanks carry no information.
    for i, r in enumerate(idx[:-1]):
        if r != "":
            must.append({
                "term": {
                    higher_taxon_ranks[i]: r
                }
            })

    hits = es.search(index="taxonnames", doc_type="taxonnames", body={
        "query": {
            "bool": {
                "must": must
            }
        }
    })["hits"]["hits"]
    if len(hits) == 0:
        return unresolved

    source = hits[0]["_source"]
    if "dwc:acceptedNameUsageID" in source:
        accepted = es.search(index="taxonnames", doc_type="taxonnames", body={
            "query": {
                "bool": {
                    "must": {
                        "term": {
                            "dwc:taxonID": source["dwc:acceptedNameUsageID"]
                        }
                    }
                }
            }
        })["hits"]["hits"]
        # The accepted record may be absent from the index.  Indexing the
        # empty result would raise IndexError, so keep the matched record.
        if len(accepted) > 0:
            source = accepted[0]["_source"]

    if "gbif:canonicalName" in source:
        name = source["gbif:canonicalName"].lower()
    else:
        name = source["dwc:scientificName"].lower()

    return (name, source["dwc:taxonID"])

In [6]:
def higher_ranks(txid):
    """Collect the names and ids of a taxon and all of its ancestors.

    Starting at the record whose ``dwc:taxonID`` equals ``txid``, the parent
    chain is followed through ``dwc:parentNameUsageID`` until a record has no
    parent or a lookup returns nothing.

    Parameters
    ----------
    txid
        ``dwc:taxonID`` of the taxon to start from.

    Returns
    -------
    dict
        For every visited record with a non-empty name, two entries:
        ``"dwc:<taxonRank>"`` -> lower-cased name (``gbif:canonicalName`` or,
        failing that, ``dwc:scientificName``) and ``"dwc:<taxonRank>_id"`` ->
        that record's ``dwc:taxonID``.  Entries are inserted from the topmost
        ancestor downwards, and a record closer to ``txid`` overrides an
        ancestor that has the same rank.  Empty when ``txid`` is not found.

    Notes
    -----
    The walk is iterative and remembers the ids it has already queried, so a
    record that names itself (or one of its descendants) as parent ends the
    walk instead of recursing until the interpreter's recursion limit.
    """
    lineage = []      # (rank, name, taxon id), ordered from txid upwards
    seen = set()      # ids already queried, guards against parent cycles
    current = txid
    while current not in seen:
        seen.add(current)
        hits = es.search(index="taxonnames", doc_type="taxonnames", body={
            "query": {
                "bool": {
                    "must": {
                        "term": {
                            "dwc:taxonID": current
                        }
                    }
                }
            }
        })["hits"]["hits"]
        if len(hits) == 0:
            break

        source = hits[0]["_source"]
        rank = "dwc:" + source["dwc:taxonRank"]
        if "gbif:canonicalName" in source:
            name = source["gbif:canonicalName"].lower()
        else:
            name = source["dwc:scientificName"].lower()

        if name != "":
            lineage.append((rank, name, source["dwc:taxonID"]))

        if "dwc:parentNameUsageID" not in source:
            break
        current = source["dwc:parentNameUsageID"]

    # Apply ancestors first so that records nearer to txid win on rank clashes.
    result = {}
    for rank, name, taxon_id in reversed(lineage):
        result.update({rank: name, rank + "_id": taxon_id})
    return result

In [7]:
def correct_tree_frame(tree_frame):
    """Return a copy of ``tree_frame`` with its rank names resolved via Elasticsearch.

    The frame is processed one depth at a time: first the distinct values of
    the first rank in ``higher_taxon_ranks``, then the distinct pairs of the
    first two ranks, and so on.  At each depth every distinct key combination
    is resolved with ``es_lookup`` and all rows carrying that combination are
    updated:

    * if a taxon was found and at least one rank in the key is blank, every
      entry returned by ``higher_ranks`` for that taxon (names and ``*_id``
      values) is written to those rows;
    * otherwise the deepest rank of the key is overwritten with the resolved
      name (blank when nothing was found) and ``<rank>_id`` with the taxon id
      (NaN when nothing was found).

    Parameters
    ----------
    tree_frame : pandas.DataFrame
        Frame containing one column per entry of ``higher_taxon_ranks``.

    Returns
    -------
    pandas.DataFrame
        A deep copy with corrected rank columns and added id columns; the
        input frame is not modified.
    """
    corrected_tree_frame = tree_frame.copy(deep=True)
    for cc in range(len(higher_taxon_ranks)):
        fields = list(higher_taxon_ranks[0:cc + 1])
        print(fields)

        # Snapshot of the distinct key combinations at this depth, taken
        # before any row is rewritten.  size() only counts rows, so the other
        # columns (strings, mixed-type ids) are never aggregated.
        groups = corrected_tree_frame.groupby(fields).size().to_frame("n")
        for t in groups.itertuples():
            rank, idx = rank_and_index(t)  # idx is always a tuple
            tx, txid = es_lookup(rank, idx)

            # Boolean mask of the rows that carry exactly this combination.
            tv = True
            for i, f in enumerate(fields):
                tv &= corrected_tree_frame[f] == idx[i]

            if pd.notnull(txid) and "" in idx:
                # Found, but part of the path is blank: fill the whole lineage.
                hr = higher_ranks(txid)
                for f, v in hr.items():
                    corrected_tree_frame.loc[tv, f] = v
            else:
                corrected_tree_frame.loc[tv, fields[-1]] = tx
                corrected_tree_frame.loc[tv, rank + "_id"] = txid

    return corrected_tree_frame

In [8]:
# Load the input table, then turn missing values into empty strings: the
# lookup code treats "" as "no name given at this rank".
df = pd.read_csv("higherTaxon.csv", encoding="utf-8")
df = df.fillna("")

In [9]:
# Largest `count` first, then display the frame.
df = df.sort_values(by="count", ascending=False)
df

Unnamed: 0,dwc:kingdom,dwc:phylum,dwc:class,dwc:order,dwc:family,count
54565,,,,,,546003
16662,,,,,asteraceae,70856
18116,,,,,orchidaceae,58703
70278,animalia,,,,,46217
49195,,,,,fabaceae,44816
80637,plantae,,,,asteraceae,44810
9745,,,,,poaceae,42260
84125,,,gastropoda,,,39092
14616,,,,,rubiaceae,33861
60875,animalia,arthropoda,insecta,hymenoptera,formicidae,33857


In [10]:
# Resolve the names, then build `ndf`: the original columns of `df` plus the
# corrected names (prefixed "new_") and the id columns (name unchanged).
# `count` is not copied because `ndf` already has it from `df`.
cdf = correct_tree_frame(df)
ndf = df.copy(deep=True)
for col in cdf.columns:
    if col == "count":
        continue
    target = col if col.endswith("_id") else "new_" + col
    ndf[target] = cdf[col]

['dwc:kingdom']
['dwc:kingdom', 'dwc:phylum']
['dwc:kingdom', 'dwc:phylum', 'dwc:class']
['dwc:kingdom', 'dwc:phylum', 'dwc:class', 'dwc:order']
['dwc:kingdom', 'dwc:phylum', 'dwc:class', 'dwc:order', 'dwc:family']


In [12]:
# Re-aggregate the counts over the corrected rank columns (every "new_"
# column except "new_dwc:genus") and write the result to tree_taxon.csv.
# Only `count` is summed: the remaining columns are names and ids, and a bare
# .sum() depends on pandas dropping or tolerating them.
group_keys = [f for f in ndf.columns if f.startswith("new_") and f != "new_dwc:genus"]
re_agg = ndf.groupby(group_keys)[["count"]].sum()
re_agg.to_csv("tree_taxon.csv", header=True, encoding="utf-8")

In [13]:
# Order the rows by all corrected ("new_") columns, descending, then display.
ndf = ndf.sort_values(
    by=[col for col in ndf.columns if col.startswith("new_")],
    ascending=False,
)
ndf

Unnamed: 0,dwc:kingdom,dwc:phylum,dwc:class,dwc:order,dwc:family,count,new_dwc:kingdom,new_dwc:phylum,new_dwc:class,new_dwc:order,new_dwc:family,dwc:kingdom_id,dwc:phylum_id,dwc:class_id,dwc:order_id,dwc:family_id
19046,virus,,,mononegavirales,rhabdoviridae,1,viruses,,,mononegavirales,rhabdoviridae,8,,,842,7758
45229,viruses,,,caudovirales,myoviridae,1,viruses,,,caudovirales,myoviridae,8,,,843,3779
90673,,,,,fungus,18,viruses,,,,fungal,8,,,,4904189
91238,,,,,uncertain fungus family,3,viruses,,,,fungal,8,,,,4904189
69043,,,,,fungae-basidiomycota,1,viruses,,,,fungal,8,,,,4904189
46615,,,,,fungae,1,viruses,,,,fungal,8,,,,4904189
6649,protista,sarcomastigophora,,trichomonadida,monocercomonadidae,1,protozoa,sarcomastigophora,zoomastigophora,trichomonadida,monocercomonadidae,7,56,118,875,2162
83970,protista,sarcomastigophora,,trichomonadida,trichomonadidae,3,protozoa,sarcomastigophora,zoomastigophora,trichomonadida,,7,56,118,875,
16913,protozoa,sarcomastigophora,zoomastigophorea,diplomonadida,hexamitidae,12,protozoa,sarcomastigophora,zoomastigophora,diplomonadida,hexamitidae,7,56,118,757,5858
83770,protista,sarcomastigophora,,diplomonadida,hexamitidae,4,protozoa,sarcomastigophora,zoomastigophora,diplomonadida,hexamitidae,7,56,118,757,5858


In [14]:
# Write the row-level frame (original columns, "new_" columns and id columns)
# to tree_higherTaxon.csv, including the header row and the index.
ndf.to_csv("tree_higherTaxon.csv", header=True, encoding="utf-8")

In [15]:
# Spot check: rows whose corrected family is "agyrekocyathidae".
ndf.loc[ndf["new_dwc:family"] == "agyrekocyathidae"]

Unnamed: 0,dwc:kingdom,dwc:phylum,dwc:class,dwc:order,dwc:family,count,new_dwc:kingdom,new_dwc:phylum,new_dwc:class,new_dwc:order,new_dwc:family,dwc:kingdom_id,dwc:phylum_id,dwc:class_id,dwc:order_id,dwc:family_id
19801,,,archaeocyatha,capsulocyathida,agyrekocyathidae,2,animalia,,,,agyrekocyathidae,1,,,,4814365


In [16]:
# Re-run the correction on only the input rows with that family name, to see
# the result for this case in isolation.
correct_tree_frame(df.loc[df["dwc:family"] == "agyrekocyathidae"])

['dwc:kingdom']
['dwc:kingdom', 'dwc:phylum']
['dwc:kingdom', 'dwc:phylum', 'dwc:class']
['dwc:kingdom', 'dwc:phylum', 'dwc:class', 'dwc:order']
['dwc:kingdom', 'dwc:phylum', 'dwc:class', 'dwc:order', 'dwc:family']


Unnamed: 0,dwc:kingdom,dwc:phylum,dwc:class,dwc:order,dwc:family,count,dwc:kingdom_id,dwc:phylum_id,dwc:class_id,dwc:order_id,dwc:family_id
19801,animalia,,,,agyrekocyathidae,2,1,,,,4814365


In [17]:
# For every name column (original and "new_"), print: the column name, the
# number of distinct values, the number of rows where it is blank, and the
# total `count` of those rows.
for f in ndf.columns:
    if "dwc:" in f and not f.endswith("_id"):
        blank = ndf[f] == ""
        # Sum only `count`; summing the whole filtered frame would aggregate
        # every other column just to read one value.
        # NOTE(review): the "- 1" is kept from the original; presumably it
        # discounts one all-blank row -- confirm.
        print(f, len(ndf[f].unique()), int(blank.sum()) - 1, ndf.loc[blank, "count"].sum())

(u'dwc:kingdom', 193, 33715, 2539369)
(u'dwc:phylum', 1310, 33156, 3771171)
(u'dwc:class', 1422, 43501, 4227414)
(u'dwc:order', 4492, 30823, 3609477)
(u'dwc:family', 24145, 4308, 932242)
(u'new_dwc:kingdom', 10, 1664, 737336)
(u'new_dwc:phylum', 84, 5298, 965205)
(u'new_dwc:class', 238, 10258, 1134604)
(u'new_dwc:order', 1108, 21719, 1416718)
(u'new_dwc:family', 12545, 20775, 1526829)


In [18]:
# Non-null values per column of the re-aggregated frame.
re_agg.count()

count    13458
dtype: int64