In [14]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import math
import collections
import json
import requests


# folder layout of the dataset, written as relative paths;
# every folder below is nested inside the previous one
DATADIR = "../../dataset"
WIKIDIR = f"{DATADIR}/wikidata"
PARSED_WIKIDIR = f"{WIKIDIR}/parsed"
PROCESSED_WIKIDIR = f"{WIKIDIR}/processed"

In [28]:
def is_nan(value):
    '''
    check if a value is nan
    None counts as nan, and so does anything float() turns into nan
    (the string "nan" included); values float() cannot convert are not nan
    return boolean
    '''
    # identity test on purpose: "== None" would call the value's own __eq__,
    # which is element-wise for numpy arrays / pandas Series (ambiguous truth
    # value) and may raise for other objects
    if value is None:
        return True
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # the value cannot be expressed as a float, so it is not nan
        return False

def get_rank_id_name_lookup(ranks, timeout=30):
    '''
    input a list of rank ids and query wikidata
    to return the corresponding rank name in English
    ranks: iterable of rank ids; an id that occurs more than once is only queried once
    timeout: seconds to wait for each wikidata response before giving up
    return: a dictionary, where keys are rank ids
    and values are corresponding English rank names
    raises: requests.HTTPError when wikidata answers with an error status,
    KeyError when the response holds no entry or no English label for the rank id
    '''
    rank_dict = {}
    for rank in ranks:
        if rank in rank_dict:
            # already resolved, no need to ask wikidata again
            continue
        # without a timeout a stalled connection would block the notebook forever
        response = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{rank}.json", timeout=timeout)
        # fail on HTTP errors here instead of while decoding an error page as JSON
        response.raise_for_status()
        rank_dict[rank] = response.json()["entities"][rank]["labels"]["en"]["value"]
    return rank_dict


def build_tree(data):
    '''
    generate tree of life using the code names 
    from the fully combined parsed data 
    data: table whose index holds the code of each entry and whose
    "parent_taxon" column holds the code of its parent (None / nan when unknown)
    return dict, where key is each parent code and 
    values are their corresponding children codes;
    every code in the index is a key as well, with [] when it has no children
    '''
    tree_dict = {}
    # pair each code with its parent by position; an integer [] lookup on the
    # column would be label based and is only positional as a deprecated fallback
    for name, parent_code in zip(data.index, data["parent_taxon"]):
        # only process entries with a certain parent node (is_nan also covers None)
        if is_nan(parent_code):
            continue
        tree_dict.setdefault(parent_code, []).append(name)

    # entries that never occur as a parent are leaves and get an empty child list
    for potential_leaf in data.index:
        tree_dict.setdefault(potential_leaf, [])
    return tree_dict


def get_missing_parents(data):
    '''
    collect the parents that are referenced in the "parent_taxon" column
    but whose id does not occur in the index of the data
    return: list with the id code of each such parent, listed once,
    in the order in which they first appear in the column
    '''
    known_ids = data.index
    referenced_parents = data["parent_taxon"].dropna().unique().tolist()
    return [parent_id for parent_id in referenced_parents if parent_id not in known_ids]

def _first_present_value(row, columns):
    '''
    return the value of the first column in columns that is not nan for this row
    return "" when every listed value is nan or a listed column does not exist
    '''
    for column in columns:
        try:
            value = row[column]
        except KeyError:
            return ""
        if not is_nan(value):
            return value
    return ""


def _join_with_underscore(accumulated, piece):
    '''
    append piece to accumulated with "_" in between;
    while accumulated is still empty the piece simply replaces it
    '''
    if accumulated:
        return accumulated + "_" + piece
    return piece


def _merge_same_name_siblings(children):
    '''
    merge nodes or leaves that are siblings but have identical names
    return: new list with the uniquely named nodes first (original order),
    followed by one merged node per repeated name; a merged node joins the
    ids and the ranks of its sources with "_" and pools their children
    '''
    siblings_by_name = {}
    for child in children:
        siblings_by_name.setdefault(child["name"], []).append(child)

    merged_children = [group[0] for group in siblings_by_name.values() if len(group) == 1]
    # NOTE(review): nodes whose name resolved to "" share a name too and are
    # therefore merged with each other - confirm that this is intended
    for repeated_name, group in siblings_by_name.items():
        if len(group) == 1:
            continue
        replacement = {"name": repeated_name, "rank": "", "id": "", "children": []}
        for child in group:
            replacement["id"] = _join_with_underscore(replacement["id"], child["id"])
            replacement["rank"] = _join_with_underscore(replacement["rank"], child["rank"])
            replacement["children"].extend(child.get("children", []))

        if replacement["children"]:
            # the pooled children come from different parents, so names that
            # were unique below each parent can clash now and need merging too
            replacement["children"] = _merge_same_name_siblings(replacement["children"])
        else:
            # leaves carry no "children" key anywhere else in the tree
            del replacement["children"]
        merged_children.append(replacement)
    return merged_children


def build_tree_from_root(root_id, tree, data):
    '''
    build the nested tree below root_id
    root_id: id code of the node to start from
    tree: dict where key is a node id and value is the list of its children ids
    data: table indexed by node id; the name is read from "taxon_name", falling
    back to "species_name_value", and the rank from "taxon_rank"
    return: dict with the keys "name", "rank" and "id", plus "children" (list of
    dicts of the same shape) when the node has children; name and rank are ""
    when the id is not in data or the value is nan; siblings with identical
    names are merged into one node
    raises: KeyError when root_id or one of its descendants is not a key of tree
    '''
    try:
        row = data.loc[root_id]
    except KeyError:
        # the id is not part of the table, nothing can be looked up for it
        row = None

    if row is None:
        name, rank = "", ""
    else:
        name = _first_present_value(row, ["taxon_name", "species_name_value"])
        rank = _first_present_value(row, ["taxon_rank"])

    children = [build_tree_from_root(child_id, tree, data) for child_id in tree[root_id]]

    rooted_tree = {"name": name, "rank": rank, "id": root_id}
    if children:
        rooted_tree["children"] = _merge_same_name_siblings(children)
    return rooted_tree

In [29]:
# read the pickled flat tree from the processed wikidata folder
# NOTE(review): pickle.load can execute arbitrary code stored in the file,
# so only open pickles that come from a trusted source
tree_flat_path = os.path.join(PROCESSED_WIKIDIR, "tree_flat.pkl")
with open(tree_flat_path, mode="rb") as tree_file:
    tree_flat = pickle.load(tree_file)

In [38]:
# use the species name wherever the taxon name is missing;
# the result is assigned back to the column because fillna(inplace=True) on a
# selected column is a chained update that, with pandas Copy-on-Write, only
# changes a temporary object and leaves `data` untouched
# NOTE(review): `data` is not created in any cell visible here - confirm that
# the cell loading it runs before this one
data["taxon_name"] = data["taxon_name"].fillna(data["species_name_value"])

In [39]:
# export the table as data.csv into the processed wikidata folder
data.to_csv(f"{PROCESSED_WIKIDIR}/data.csv")

In [40]:
# nest the flat tree starting at the chosen root id
# NOTE(review): "Q2382443" is presumably the wikidata id of Biota, judging by
# the variable name - confirm
tree_from_biota = build_tree_from_root(
    root_id="Q2382443",
    tree=tree_flat,
    data=data,
)

In [42]:
# write the nested tree as JSON into the processed wikidata folder
tree_json_path = os.path.join(PROCESSED_WIKIDIR, "tree.json")
with open(tree_json_path, mode="w") as fp:
    json.dump(tree_from_biota, fp)