In [2]:
import pandas as pd
import numpy as np
from functools import reduce 
from collections import defaultdict

In [2]:
ordered_cols = ['kingdom','phylum','class','family','order','genus','species']
descending_ranks = ['species', 'genus', 'order', 'family', 'class', 'phylum', 'kingdom']

## I. Utils

### Custom Tree class

In [3]:
class TreeNode:
    def __init__(self, name, rank):
        self.name = name
        self.rank = rank
        self.children = []
        self.parent = None
        
    def add_child(self, child):
        child.parent = self
        self.children.append(child)
          
    def get_level(self):
        level = 0
        p = self.parent
        while p:
            level += 1
            p = p.parent
        return level
        
    def pprint(self):
        spaces = ' ' * ( self.get_level() * 4)
        prefix = spaces + '|--' if self.parent else ""
        print(prefix + self.name)
        if self.children:
            for child in self.children:
                child.pprint()
    
    def to_dict(self):
        '''transform treeNode to a dictionary object in desired format:
            {'a':{
                'rank':'kingdom'
                'b':{
                    'rank':'phylum'
                    'c':{
                        ...
                    }
                }
            }}
           
        '''
        if self.children :
            return {self.name: self.children[0].to_dict(), 'rank' : self.rank}
        else:
            return {'leafs':self.name, 'rank': self.rank}

### Nested dictionary aggregator

In [4]:
def merge_branches(a, b, path=None):
    "merges b into a"
    if path is None: path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge_branches(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass # same leaf value
            else:
                a[key] = merge_leafs(a[key], b[key])           
        else:
            a[key] = b[key]
    return a

def merge_leafs(leaf_a, leaf_b):
    '''merges leafs according to leafs format'''
    if isinstance(leaf_a, list) and isinstance(leaf_b, list):
        leaf = leaf_a + leaf_b
    elif isinstance(leaf_a, list) and isinstance(leaf_b, str):
        leaf = leaf_a
        leaf.append(leaf_b)
    elif isinstance(leaf_a, str) and isinstance(leaf_b, list):
        leaf = leaf_b
        leaf.append(leaf_a)
    elif isinstance(leaf_a, str) and isinstance(leaf_b, str):
        leaf = [leaf_a, leaf_b]
    else:
        raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
    return list(set(leaf))


### Flat taxonomic array sequence to Tree Object

In [5]:
def to_tree(l:list, descending_ranks:list=descending_ranks)->TreeNode:
    '''transform a list into a TreeNode Object (branch)
    ---parameters---
    input: list containing taxonomic classification sequence 
    ordered like so ['kingdom', 'phylum', 'class', 'family', 'order', 'genus', 'species']
    '''
    taxonomicTree = TreeNode(l[0], descending_ranks[0])
    for i in range(1, len(l)):
        parent =  l[i]
        parentTree = TreeNode(parent, descending_ranks[i])
        parentTree.add_child(taxonomicTree)
        taxonomicTree = parentTree
    return taxonomicTree

### Wrapper : taxomic array matrix to Nested dictionnary

In [6]:
def data_to_tree(data, descending_ranks)->TreeNode:
    '''transforms array matrix to TreeNode Object 
    '''
    branches = []
    for row in data:
        branches.append(to_tree(row, descending_ranks).to_dict())
    tree = reduce(merge_branches, branches)
    return tree

### Parser

In [7]:
def to_document(tree:dict)->dict:
    '''parses into required format for js usage'''
    if ('leafs' in tree.keys()):
        if isinstance(tree['leafs'], list):
            return [{'name': leaf} for leaf in tree['leafs']]
        else:
            return [{'name': tree['leafs']}]
    else:
        l = []
        for key in [k for k in tree.keys() if k != 'rank']:
            if isinstance(tree[key], dict):
                l.append({
                    'name':key,
                    'attributes':{'rank': tree['rank']},
                    'children': to_document(tree[key]),
                    '_collapsed':True,

                })
        return l
            
        

### II. Usage

In [3]:
df = pd.read_csv(
    r'..\data\gbif_extract.csv',
    usecols=['kingdom','phylum','class','order','family','species','scientificName', 'canonicalName','genus','rank']
)

In [9]:
#filtering rows related to species & filtering in taxonimic descending order 
df_species = df.loc[(df['rank'] == 'SPECIES')][descending_ranks].copy(deep=True)
# todo : manage species with nan  values
species = df_species.dropna(how='any').values

In [10]:
species[:2]

array([['Caldisphaera lagunensis', 'Caldisphaera', 'Acidilobales',
        'Caldisphaeraceae', 'Thermoprotei', 'Crenarchaeota', 'Archaea'],
       ['Pyrodictium occultum', 'Pyrodictium', 'Desulfurococcales',
        'Pyrodictiaceae', 'Thermoprotei', 'Crenarchaeota', 'Archaea']],
      dtype=object)

In [11]:
tree_A = to_tree(species[0])
tree_B = to_tree(species[1])
tree_C = to_tree(species[2])

In [12]:
tree_A.pprint()

Archaea
    |--Crenarchaeota
        |--Thermoprotei
            |--Caldisphaeraceae
                |--Acidilobales
                    |--Caldisphaera
                        |--Caldisphaera lagunensis


In [13]:
tree_B.to_dict()

{'Archaea': {'Crenarchaeota': {'Thermoprotei': {'Pyrodictiaceae': {'Desulfurococcales': {'Pyrodictium': {'leafs': 'Pyrodictium occultum',
       'rank': 'species'},
      'rank': 'genus'},
     'rank': 'order'},
    'rank': 'family'},
   'rank': 'class'},
  'rank': 'phylum'},
 'rank': 'kingdom'}

In [14]:
merge_branches(tree_A.to_dict(), tree_B.to_dict())

{'Archaea': {'Crenarchaeota': {'Thermoprotei': {'Caldisphaeraceae': {'Acidilobales': {'Caldisphaera': {'leafs': 'Caldisphaera lagunensis',
       'rank': 'species'},
      'rank': 'genus'},
     'rank': 'order'},
    'rank': 'family',
    'Pyrodictiaceae': {'Desulfurococcales': {'Pyrodictium': {'leafs': 'Pyrodictium occultum',
       'rank': 'species'},
      'rank': 'genus'},
     'rank': 'order'}},
   'rank': 'class'},
  'rank': 'phylum'},
 'rank': 'kingdom'}

In [15]:
# full tree : kingdom -> species
tree_of_life = data_to_tree(species, descending_ranks)

In [16]:
tree_of_life

{'Archaea': {'Crenarchaeota': {'Thermoprotei': {'Caldisphaeraceae': {'Acidilobales': {'Caldisphaera': {'leafs': 'Caldisphaera lagunensis',
       'rank': 'species'},
      'rank': 'genus'},
     'rank': 'order'},
    'rank': 'family',
    'Pyrodictiaceae': {'Desulfurococcales': {'Pyrodictium': {'leafs': ['Pyrodictium occultum',
        'Pyrodictium brockii',
        'Pyrodictium abyssi'],
       'rank': 'species'},
      'rank': 'genus',
      'Hyperthermus': {'leafs': 'Hyperthermus butylicus', 'rank': 'species'},
      'Pyrolobus': {'leafs': 'Pyrolobus fumarii', 'rank': 'species'}},
     'rank': 'order'},
    'Acidilobaceae': {'Acidilobales': {'Acidilobus': {'leafs': 'Acidilobus aceticus',
       'rank': 'species'},
      'rank': 'genus'},
     'rank': 'order'},
    'Desulfurococcaceae': {'Desulfurococcales': {'Ignisphaera': {'leafs': 'Ignisphaera aggregans',
       'rank': 'species'},
      'rank': 'genus',
      'Thermosphaera': {'leafs': 'Thermosphaera aggregans', 'rank': 'species'

In [17]:
# partial data : kingdom -> family
data_kingdoms_to_families = df_species[df_species.columns[3:]].drop_duplicates().dropna().values

In [18]:
tree_kingdom_to_family = data_to_tree(data_kingdoms_to_families, ['family','class','phylum','kingdom'])

In [19]:
# partial tree : order -> species
data_orders_to_species = df_species[df_species.columns[:3]].drop_duplicates().dropna().values

In [20]:
tree_order_to_species = data_to_tree(data_orders_to_species, ['species','order','family'])

In [21]:
family_documents = to_document(tree_kingdom_to_family)

In [22]:
species_documents = to_document(tree_order_to_species)

In [23]:
species_documents

[{'name': 'Acidilobales',
  'attributes': {'rank': 'family'},
  'children': [{'name': 'Caldisphaera',
    'attributes': {'rank': 'order'},
    'children': [{'name': 'Caldisphaera lagunensis'}],
    '_collapsed': True},
   {'name': 'Acidilobus',
    'attributes': {'rank': 'order'},
    'children': [{'name': 'Acidilobus aceticus'}],
    '_collapsed': True}],
  '_collapsed': True},
 {'name': 'Desulfurococcales',
  'attributes': {'rank': 'family'},
  'children': [{'name': 'Pyrodictium',
    'attributes': {'rank': 'order'},
    'children': [{'name': 'Pyrodictium occultum'},
     {'name': 'Pyrodictium brockii'},
     {'name': 'Pyrodictium abyssi'}],
    '_collapsed': True},
   {'name': 'Hyperthermus',
    'attributes': {'rank': 'order'},
    'children': [{'name': 'Hyperthermus butylicus'}],
    '_collapsed': True},
   {'name': 'Pyrolobus',
    'attributes': {'rank': 'order'},
    'children': [{'name': 'Pyrolobus fumarii'}],
    '_collapsed': True},
   {'name': 'Ignisphaera',
    'attributes'

In [24]:
from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017/')

In [25]:
db_family = client['ceebios']['family']
db_species = client['ceebios']['species']

In [26]:
db_family.insert_many(family_documents)

<pymongo.results.InsertManyResult at 0x2b4d5e6d100>

In [27]:
db_species.insert_many(species_documents)

<pymongo.results.InsertManyResult at 0x2b4d4be6880>

In [28]:
a = db_family.find_one()

In [29]:
a

{'_id': ObjectId('5f846534b96e99541a83b8b7'),
 'name': 'Archaea',
 'attributes': {'rank': 'kingdom'},
 'children': [{'name': 'Crenarchaeota',
   'attributes': {'rank': 'phylum'},
   'children': [{'name': 'Thermoprotei',
     'attributes': {'rank': 'class'},
     'children': [{'name': 'Thermoproteaceae'},
      {'name': 'Sulfolobaceae'},
      {'name': 'Desulfurococcaceae'},
      {'name': 'Pyrodictiaceae'},
      {'name': 'Acidilobaceae'},
      {'name': 'Caldisphaeraceae'},
      {'name': 'Thermofilaceae'}],
     '_collapsed': True}],
   '_collapsed': True},
  {'name': 'Euryarchaeota',
   'attributes': {'rank': 'phylum'},
   'children': [{'name': 'Thermoplasmata',
     'attributes': {'rank': 'class'},
     'children': [{'name': 'Picrophilaceae'},
      {'name': 'Ferroplasmaceae'},
      {'name': 'Thermoplasmataceae'}],
     '_collapsed': True},
    {'name': 'Methanobacteria',
     'attributes': {'rank': 'class'},
     'children': [{'name': 'Methanobacteriaceae'},
      {'name': 'Metha