In [1]:
import pandas as pd
import os
import json
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import numpy as np

# Root of the dataset directory; a relative path, so it is resolved against
# the working directory the notebook kernel runs in.
DATADIR = "../../dataset"
# Processed Wikidata files (tree.json, tree_flat.pkl, data.csv, ...) that the
# cells below read from and write to.
WIKIDIR = DATADIR + "/wikidata/processed"
# Directory holding pbdb.json and the "pbdb_for_db" split output.
PBDBDIR = DATADIR + "/pbdb"

In [14]:
def get_flat_tree_from_tree(tree):
    '''
    Flatten a nested tree into one record per node (inner nodes and leaves).

    tree: a dictionary with the keys "name", "rank" and "id" and, optionally,
    "children" holding a list of dictionaries of the same shape.

    return: a list of dictionaries in pre-order (a node first, then each of
    its children's subtrees in their original order). Every record carries
    "name", "rank", "id", plus "pathFromRootById" and "pathFromRootByName":
    the comma-joined ids / names from the root down to and including the
    node. Each path starts with a leading comma (e.g. ",Q1,Q2").

    The walk uses an explicit stack rather than recursion, so a very deep
    branch cannot hit Python's recursion limit.
    '''
    branches = []
    # Each stack entry: (node, id path of its parent, name path of its parent).
    pending = [(tree, "", "")]
    while pending:
        node, parent_id_path, parent_name_path = pending.pop()
        id_path = parent_id_path + "," + node["id"]
        name_path = parent_name_path + "," + node["name"]
        branches.append({
            "name": node["name"],
            "rank": node["rank"],
            "id": node["id"],
            "pathFromRootById": id_path,
            "pathFromRootByName": name_path,
        })
        if "children" in node:
            # Push in reverse so the first child is popped (visited) first,
            # which reproduces the order of a recursive pre-order walk.
            for child in reversed(list(node["children"])):
                pending.append((child, id_path, name_path))
    return branches

def json_splitter(document, splits, DIR, folder_name):
    '''
    Split a long JSON-serialisable sequence into at most "splits" consecutive
    parts and write each part to its own file.

    document: a sliceable sequence (e.g. a list of records).
    splits: maximum number of files to write; must be >= 1.
    DIR: parent directory of the output folder.
    folder_name: name of the output folder, also used as the file prefix.

    Files are written as DIR/folder_name/folder_name_<k>.json with k starting
    at 1. Parts 1 .. splits-1 hold len(document) // (splits - 1) items each
    (at least one); the last part holds whatever remains. Fewer than "splits"
    files are written when the document runs out first, and none at all for
    an empty document. The output folder is created if it does not exist.

    raises: ValueError if splits is smaller than 1.
    '''
    if splits < 1:
        raise ValueError("splits must be at least 1")

    total = len(document)
    if splits == 1:
        # A single part is simply the whole document.
        chunk_size = max(1, total)
    else:
        # Never let the step drop to 0 when there are fewer items than parts.
        chunk_size = max(1, total // (splits - 1))

    target_dir = os.path.join(DIR, folder_name)
    os.makedirs(target_dir, exist_ok=True)

    for idx, start in enumerate(range(0, total, chunk_size), start=1):
        is_last = idx >= splits
        # The final allowed file takes everything that is left.
        temp = document[start:] if is_last else document[start:start + chunk_size]
        with open(os.path.join(target_dir, folder_name + "_" + str(idx) + ".json"), "w") as f:
            json.dump(temp, f)
        if is_last:
            # Stop here: continuing would re-dump overlapping tails.
            break
    

In [15]:
# Sanity check: flatten one small dangling tree and display the records.
test_tree_path = os.path.join(WIKIDIR, "dangling_trees", "Q27973466.json")
with open(test_tree_path, "rb") as f:
    # The file is opened in binary mode; json.loads accepts the raw bytes.
    test_tree = json.loads(f.read())
get_flat_tree_from_tree(test_tree)

[{'name': 'Devia',
  'rank': '',
  'id': 'Q27973466',
  'pathFromRootById': ',Q27973466',
  'pathFromRootByName': ',Devia'},
 {'name': 'Devia prospera',
  'rank': 'Q7432',
  'id': 'Q11965580',
  'pathFromRootById': ',Q27973466,Q11965580',
  'pathFromRootByName': ',Devia,Devia prospera'},
 {'name': 'Devia congruens',
  'rank': 'Q7432',
  'id': 'Q14882376',
  'pathFromRootById': ',Q27973466,Q14882376',
  'pathFromRootByName': ',Devia,Devia congruens'}]

In [7]:
# Load the pickled flat tree from the processed Wikidata folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that this project produced itself.
tree_flat_path = os.path.join(WIKIDIR, "tree_flat.pkl")
with open(tree_flat_path, "rb") as f:
    tree_flat = pkl.load(f)

In [3]:
# Load the nested tree stored in tree.json (the tree "from biota").
tree_json_path = os.path.join(WIKIDIR, "tree.json")
with open(tree_json_path, "rb") as f:
    # The file is opened in binary mode; json.loads accepts the raw bytes.
    tree_from_biota = json.loads(f.read())

In [16]:
# Flatten the nested tree into a list with one record per node
# (name, rank, id and the id / name paths from the root) for the database.
tree_for_db = get_flat_tree_from_tree(tree_from_biota)

In [18]:
# Split the flat tree into JSON part files, requesting 50 parts; they are
# written to WIKIDIR/flat_tree_for_db/flat_tree_for_db_<k>.json.
json_splitter(tree_for_db,50,WIKIDIR,"flat_tree_for_db")

In [17]:
# Also keep the complete flat tree as one JSON file next to the split parts.
flat_tree_json_path = os.path.join(WIKIDIR, "flat_tree_for_db.json")
with open(flat_tree_json_path, "w") as f:
    json.dump(tree_for_db, f)

In [56]:
# Load the linked PBDB fossil records from pbdb.json.
pbdb_json_path = os.path.join(PBDBDIR, "pbdb.json")
with open(pbdb_json_path, "rb") as f:
    # The file is opened in binary mode; json.loads accepts the raw bytes.
    pbdb = json.loads(f.read())

In [67]:
# Split the linked PBDB fossils into JSON part files, requesting 10 parts;
# they are written to PBDBDIR/pbdb_for_db/pbdb_for_db_<k>.json.
json_splitter(pbdb,10,PBDBDIR,"pbdb_for_db")

In [12]:
# Load the processed Wikidata table, using its "id" column as the index.
data = pd.read_csv(os.path.join(WIKIDIR, "data.csv"), index_col="id")
# Fill missing taxon names from species_name_value (aligned on the index).
# The result is assigned back explicitly: an in-place fillna on the selected
# column is chained assignment, which does not update `data` under pandas
# Copy-on-Write and raises a warning in pandas 2.x.
data["taxon_name"] = data["taxon_name"].fillna(data["species_name_value"])