In [2]:
import pandas as pd
import os
import json
import requests
import pickle as pkl
import matplotlib.pyplot as plt
import numpy as np

# Root of the dataset folder, relative to this notebook.
DATADIR = "../../dataset"
# Sub-folders for the processed wikidata files and for the pbdb files.
WIKIDIR = "/".join([DATADIR, "wikidata", "processed"])
PBDBDIR = "/".join([DATADIR, "pbdb"])

In [70]:
def get_flat_tree_from_tree(tree):
    '''
    Flatten a nested tree into one record per node, in pre-order
    (a node is always listed before its descendants).

    tree: a dict with "id", "name" and "rank" string entries and an
    optional "children" list holding nodes of the same shape.

    return: a list of dictionaries, one per node/leaf, holding the node's
    name, rank and id, the ids of its direct children, and the
    comma-joined paths from the root down to the node by id, by name and
    by rank. Every path starts with a leading comma.
    '''
    flat_records = []

    def walk(node, id_path, name_path, rank_path):
        '''
        Record node, then recurse into each of its children with the
        three root paths extended by node's own id, name and rank.
        '''
        id_path = id_path + "," + node["id"]
        name_path = name_path + "," + node["name"]
        rank_path = rank_path + "," + node["rank"]

        # a node without a "children" entry is treated as a leaf
        child_nodes = node["children"] if "children" in node else []

        flat_records.append({
            "name": node["name"],
            "rank": node["rank"],
            "id": node["id"],
            "pathFromRootById": id_path,
            "pathFromRootByName": name_path,
            "pathFromRootByRank": rank_path,
            "children": [child_node["id"] for child_node in child_nodes],
        })

        for child_node in child_nodes:
            walk(child_node, id_path, name_path, rank_path)

    walk(tree, "", "", "")
    return flat_records

def json_splitter(document, splits, DIR, folder_name):
    '''
    Split a long list of json records into exactly "splits" consecutive
    parts and save each part as its own json file.

    document: a sliceable sequence (e.g. a list of records).
    splits: number of files to write, must be >= 1.
    DIR: parent directory of the output folder.
    folder_name: name of the output folder; it is created if missing and
    also used as the file prefix, so part k (counting from 1) is written to
    DIR/folder_name/folder_name_k.json.

    The parts are as even as possible: their sizes differ by at most one,
    with the larger parts first. Every record ends up in exactly one part
    and the original order is kept. When there are fewer records than
    "splits", the trailing files hold an empty list.

    raise: ValueError if "splits" is smaller than 1.
    '''
    if splits < 1:
        raise ValueError("splits must be at least 1, got %r" % (splits,))

    out_dir = os.path.join(DIR, folder_name)
    os.makedirs(out_dir, exist_ok=True)

    # the first `remainder` parts take one extra record each
    base_size, remainder = divmod(len(document), splits)
    start = 0
    for idx in range(1, splits + 1):
        stop = start + base_size + (1 if idx <= remainder else 0)
        with open(os.path.join(out_dir, folder_name + "_" + str(idx) + ".json"), "w") as f:
            json.dump(document[start:stop], f)
        start = stop
    

In [71]:
# try the flattening on one small tree from the dangling_trees folder
with open(os.path.join(WIKIDIR, "dangling_trees", "Q27973466.json"), mode="rb") as tree_file:
    test_tree = json.load(tree_file)
get_flat_tree_from_tree(tree=test_tree)

[{'name': 'Devia',
  'rank': '',
  'id': 'Q27973466',
  'pathFromRootById': ',Q27973466',
  'pathFromRootByName': ',Devia',
  'pathFromRootByRank': ',',
  'children': ['Q11965580', 'Q14882376']},
 {'name': 'Devia prospera',
  'rank': 'Q7432',
  'id': 'Q11965580',
  'pathFromRootById': ',Q27973466,Q11965580',
  'pathFromRootByName': ',Devia,Devia prospera',
  'pathFromRootByRank': ',,Q7432',
  'children': []},
 {'name': 'Devia congruens',
  'rank': 'Q7432',
  'id': 'Q14882376',
  'pathFromRootById': ',Q27973466,Q14882376',
  'pathFromRootByName': ',Devia,Devia congruens',
  'pathFromRootByRank': ',,Q7432',
  'children': []}]

In [72]:
# load the nested tree from biota
with open(os.path.join(WIKIDIR, "tree.json"), mode="rb") as tree_file:
    tree_from_biota = json.load(tree_file)

In [73]:
# flatten the nested biota tree into one record per node for the db
tree_for_db = get_flat_tree_from_tree(tree=tree_from_biota)

In [74]:
# drop the nested tree; the following cells only use the flat tree_for_db
del tree_from_biota

In [11]:
# load the linked pbdb fossil records
with open(os.path.join(PBDBDIR, "pbdb.json"), mode="rb") as pbdb_file:
    pbdb = json.load(pbdb_file)

In [12]:
# turn the loaded pbdb records into a DataFrame (the name pbdb is rebound)
pbdb = pd.DataFrame(data=pbdb)

In [13]:
# every distinct wikiRef that occurs in the pbdb records
all_wikiRef = pd.unique(pbdb["wikiRef"]).tolist()

In [14]:
# ids of all nodes of the flat biota tree, in tree order
all_biota_wiki = []
for node in tree_for_db:
    all_biota_wiki.append(node["id"])

In [58]:
# note that some tree_for_db entries have multiple ids and ranks for the same taxon name
# (their "id" holds several ids joined by "_"),
# however there are no wikiRefs involved in those multiple ids.
# The ids are read from the tree_for_db list directly: the tree_for_db_pd
# DataFrame is only created further down, so relying on it here fails on a
# fresh top-to-bottom run.
repeat_ids = {
    single_id
    for item in tree_for_db
    if "_" in item["id"]
    for single_id in item["id"].split("_")
}

wikiRef_in_repeat_ids = [wikiRef for wikiRef in all_wikiRef if wikiRef in repeat_ids]

print(wikiRef_in_repeat_ids)

[]

In [15]:
# get non biota wikiRefs in linked pbdb, which are wikiRefs not in the tree from biota (there are 22)
# a set makes each membership test constant-time
all_biota_wiki_set = set(all_biota_wiki)
non_biota_wikiRef = [
    wikiRef for wikiRef in all_wikiRef if wikiRef not in all_biota_wiki_set
]
print(non_biota_wikiRef)

['Q24036450', 'Q16743596', 'Q1226423', 'Q104846928', 'Q21078601', 'Q7171121', 'Q16760352', 'Q63927535', 'Q63927550', 'Q63927541', 'Q15869639', 'Q100235649', 'Q97277751', 'Q65320918', 'Q63929649', 'Q51545675', 'Q86712427', 'Q95715804', 'Q97463695', 'Q11710941', 'Q51607598', 'Q26856809']


In [34]:
# keep only the pbdb records whose wikiRef is part of the biota tree
has_non_biota_wikiRef = pbdb["wikiRef"].isin(non_biota_wikiRef)
pbdb = pbdb[~has_non_biota_wikiRef]

In [35]:
# serialise with pandas and parse back to get a plain list of record dicts
parsed_pbdb = json.loads(
    pbdb.to_json(orient="records")
)

In [36]:
# save the filtered pbdb records as one json file
with open(os.path.join(PBDBDIR, "pbdb_for_db.json"), mode="w") as out_file:
    json.dump(parsed_pbdb, out_file)

In [37]:
# split linked pbdb fossils into 10 files under PBDBDIR/pbdb_for_db
json_splitter(
    document=parsed_pbdb,
    splits=10,
    DIR=PBDBDIR,
    folder_name="pbdb_for_db",
)

In [202]:
# reload the filtered pbdb records written to pbdb_for_db.json; the path is
# built from PBDBDIR like every other pbdb file in this notebook instead of
# repeating the folder as a literal
with open(os.path.join(PBDBDIR, "pbdb_for_db.json"), "rb") as f:
    pbdb = pd.DataFrame(json.load(f))

In [205]:
# one DataFrame row per node of the flat tree
tree_for_db_pd = pd.DataFrame(data=tree_for_db)

In [206]:
# first step of cutting down tree_for_db_pd to the fossil-related part:
# collect the root paths of all taxa that are referenced by a fossil record
fossil_wikiRefs = set(pbdb["wikiRef"].unique())
is_fossil_taxon = tree_for_db_pd["id"].isin(fossil_wikiRefs)
all_paths = tree_for_db_pd[is_fossil_taxon]["pathFromRootById"].tolist()

In [207]:
# ids of every taxon on a root path of a fossil taxon; the first piece of each
# split path is the empty string before the leading comma, so it is skipped
all_ids_fossil_tree = {
    node_id
    for root_path in all_paths
    for node_id in root_path.split(",")[1:]
}

In [217]:
# keep only the taxa that lie on a root path of some fossil taxon; take an
# explicit copy so that the columns added afterwards are written to an
# independent frame rather than to a slice of the full tree (which triggers
# pandas' SettingWithCopyWarning)
tree_for_db_pd = tree_for_db_pd[tree_for_db_pd["id"].isin(all_ids_fossil_tree)].copy()

In [210]:
# per-wikiRef statistics over the pbdb records, indexed by wikiRef:
#   count - number of records (non-null "id" values)
#   minma - min of minma
#   maxma - max of maxma
records_by_wikiRef = pbdb.groupby(["wikiRef"])
wikiRef_count_and_time_pd = records_by_wikiRef["id"].count().to_frame(name="count")
wikiRef_count_and_time_pd["minma"] = records_by_wikiRef["minma"].min().tolist()
wikiRef_count_and_time_pd["maxma"] = records_by_wikiRef["maxma"].max().tolist()


In [234]:
# initialise the aggregation columns: count starts at 0, while maxma and minma
# start at the sentinel -1 ("nothing aggregated yet"). The sentinel is stored
# as a float so that the float values written during the aggregation do not
# force a dtype change of the columns. assign returns a new frame, so the cell
# can be re-run safely and never writes into a slice of another frame.
tree_for_db_pd = tree_for_db_pd.assign(count=0, maxma=-1.0, minma=-1.0)

In [228]:
# index the tree by node id so that rows can be addressed with .loc[id]
tree_for_db_pd = tree_for_db_pd.set_index(keys="id")

In [235]:
# Roll the fossil statistics of every wikiRef up the tree: the taxon itself and
# each taxon on its path from the root get the record count added and their
# minma / maxma bounds widened.
#
# The taxon's own row is merged exactly like the rows of its ancestors instead
# of being overwritten: a taxon with fossil records of its own may already hold
# totals contributed by descendants that were handled earlier in the loop, and
# a plain assignment would throw those away (e.g. leaving a genus with a larger
# minma than one of its species).
#
# Expects tree_for_db_pd to be indexed by id with count / maxma / minma freshly
# initialised to 0 / -1 / -1; running this cell twice without re-initialising
# counts every record twice.
for i, wikiRef in enumerate(wikiRef_count_and_time_pd.index, start=1):
    # progress indicator
    if i % 1000 == 0:
        print(i)

    count = wikiRef_count_and_time_pd.loc[wikiRef, "count"]
    maxma = wikiRef_count_and_time_pd.loc[wikiRef, "maxma"]
    minma = wikiRef_count_and_time_pd.loc[wikiRef, "minma"]

    # ids from the root down to (and including) the taxon itself; the first
    # piece of the split is the empty string before the leading comma
    ids_from_root = tree_for_db_pd.loc[wikiRef, "pathFromRootById"].split(",")[1:]

    for node_id in ids_from_root:
        tree_for_db_pd.loc[node_id, "count"] += count
        if float(tree_for_db_pd.loc[node_id, "maxma"]) < maxma:
            tree_for_db_pd.loc[node_id, "maxma"] = maxma
        # -1 marks a minma that has not been set yet
        current_minma = float(tree_for_db_pd.loc[node_id, "minma"])
        if current_minma == -1 or current_minma > minma:
            tree_for_db_pd.loc[node_id, "minma"] = minma


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000


In [242]:
# move the id index back into a regular first column
tree_for_db_pd = tree_for_db_pd.reset_index(level=0)

In [244]:
# serialise with pandas and parse back to get a plain list of record dicts
parsed_tree_for_db = json.loads(
    tree_for_db_pd.to_json(orient="records")
)

In [245]:
# write the fossil-related flat tree records for the db to a single json file
with open(os.path.join(WIKIDIR, "fossil_related_flat_tree_for_db.json"), mode="w") as out_file:
    json.dump(parsed_tree_for_db, out_file)

In [246]:
# show the flat tree with its count / maxma / minma columns
# (pandas renders a truncated preview of the frame)
tree_for_db_pd

Unnamed: 0,id,name,rank,pathFromRootById,pathFromRootByName,pathFromRootByRank,children,count,maxma,minma
0,Q2382443,Biota,superdomain,",Q2382443",",Biota",",superdomain","[Q1950208, Q28920818, Q808, Q20072875, Q19081,...",1286430,1000.0,0.0
1,Q19081,Prokaryota,domain,",Q2382443,Q19081",",Biota,Prokaryota",",superdomain,domain","[Q60972204, Q10876, Q13014476, Q10872]",101,635.0,28.1
2,Q10876,Bacteria,kingdom,",Q2382443,Q19081,Q10876",",Biota,Prokaryota,Bacteria",",superdomain,domain,kingdom","[Q24975421, Q26197526, Q26218602, Q3337759, Q6...",101,635.0,28.1
3,Q3337759,Negibacteria,subkingdom,",Q2382443,Q19081,Q10876,Q3337759",",Biota,Prokaryota,Bacteria,Negibacteria",",superdomain,domain,kingdom,subkingdom","[Q12963338, Q13012345, Q597831, Q1200941, Q536...",94,635.0,28.1
4,Q93315,Cyanobacteria,phylum,",Q2382443,Q19081,Q10876,Q3337759,Q93315",",Biota,Prokaryota,Bacteria,Negibacteria,Cyanob...",",superdomain,domain,kingdom,subkingdom,phylum","[Q16985953, Q21445955, Q3492534, Q1088359, Q34...",35,635.0,201.3
...,...,...,...,...,...,...,...,...,...,...
71318,Q3702219,Anabarites,genus,",Q2382443,Q19088,Q964455,Q129021,Q1205110,Q113...",",Biota,Eukaryota,Unikonta,Opisthokonta,Holozoa...",",superdomain,domain,subdomain,,,,,kingdom,fami...","[Q20716994, Q5147299, Q5155336, Q5146116]",190,635.0,513.0
71319,Q20716994,Anabarites trisulcatus,species,",Q2382443,Q19088,Q964455,Q129021,Q1205110,Q113...",",Biota,Eukaryota,Unikonta,Opisthokonta,Holozoa...",",superdomain,domain,subdomain,,,,,kingdom,fami...",[],94,541.0,485.4
71320,Q5147299,Anabarites gracilis,species,",Q2382443,Q19088,Q964455,Q129021,Q1205110,Q113...",",Biota,Eukaryota,Unikonta,Opisthokonta,Holozoa...",",superdomain,domain,subdomain,,,,,kingdom,fami...",[],2,541.0,516.0
71321,Q5155336,Anabarites sexalox,species,",Q2382443,Q19088,Q964455,Q129021,Q1205110,Q113...",",Biota,Eukaryota,Unikonta,Opisthokonta,Holozoa...",",superdomain,domain,subdomain,,,,,kingdom,fami...",[],16,520.0,513.0


In [253]:
# sanity check: list the rows whose minma is still negative, i.e. still at the
# -1 start value (the result is expected to be empty)
tree_for_db_pd.loc[tree_for_db_pd["minma"].lt(0)]

Unnamed: 0,id,name,rank,pathFromRootById,pathFromRootByName,pathFromRootByRank,children,count,maxma,minma


In [18]:
# split the flat tree records into 50 files under WIKIDIR/flat_tree_for_db
json_splitter(
    document=parsed_tree_for_db,
    splits=50,
    DIR=WIKIDIR,
    folder_name="flat_tree_for_db",
)