In [149]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import math


def is_nan(value):
    '''
    check if a value is nan
    return boolean
    '''
    try:
        return math.isnan(float(value))
    except:
        return False
    
def get_sorted_parents(data):
    '''
    sort all parent_taxon in the data,
    a helper function for build_parent_lookup
    return list
    '''
    parents = data["parent_taxon"].unique().tolist()
    ids = data["id"].unique().tolist()
    for parent in parents:
        if is_nan(parent):
            parents.remove(parent)
    parents = sorted(parents)
    return parents

def build_tree (data, id_name_dict):
    '''
    generate tree of life using the taxon names and parent names (not "Q..." code)
    from the fully combined parsed data 
    return dict
    '''
    tree_dict = {}
    for i in range(data.shape[0]):
        parent_code = data["parent_taxon"][i]
        # find the taxon name of the entry or the species_name_value of it
        name = data["taxon_name"][i]
        if name == None or is_nan(name):
            name = "species_name_value_" + data["species_name_value"][i]
        # only process entries with a certain parent node 
        if parent_code != None and not is_nan(parent_code):
            # retrieve taxon name of the parent taxon, if None then species_name_value
            parent_name = id_name_dict[parent_code]
                
            if parent_name not in tree_dict:
                tree_dict[parent_name] = [name]
            else:
                tree_dict[parent_name].append(name)
    return tree_dict

# it will take 30 seconds on my computer
def build_parent_lookup(data):
    '''
    compared to build_parent_lookup_naive, a much quicker way to
    build a lookup dict to link parent_taxon code to a name. 
    Its keys are parent_taxon code ("Q....")
    and values are the corresponding taxon_name, if taxon_name is nan, then the 
    values are the corresponding species_name_value, if the parent_taxon code
    is not in the "id" attribute of the fully combined parsed data, then the
    corresponding values are just the original parent_taxon code
    return: dict
    '''
    id_name_dict = {}
    data = data.sort_values(by=['id'],ignore_index=True)
    parents = get_sorted_parents(data)
    assert data["id"].tolist() == data["id"].unique().tolist()
    ids = data["id"].tolist()
    current_idx, previous_idx = 0,0
    start = time.time()
    while len(parents):
        # process the first parent in parent list
        parent = parents[0]
        # if the parent_taxon has not been found after looping the entire id list, 
        # then loop back to the previous idx and put the parent_taxon code itself as name
        if parent != ids[-1] and current_idx == len(ids)-1:
            current_idx = previous_idx 
            id_name_dict[parent] = parent
            parents.pop(0)
            
        # parent id matches an id on the id list
        elif parent == ids[current_idx]:
            name = data.loc[current_idx,"taxon_name"]
            if is_nan(name):
                # add a "species_name_" tag in front of species name, when taxon name is not available
                name = "species_name: " + data.loc[current_idx,"species_name_value"]
            id_name_dict[parent] = name
            parents.pop(0)
            current_idx += 1
            previous_idx = current_idx
            
        # parent id doesn't match 
        else:
            current_idx += 1
                 
    return id_name_dict

# it will take 2.5 hours on my computer
def build_parent_lookup_naive(data):
    '''
    a naive and slow way to perform the same function as build_parent_lookup
    no longer used
    '''
    id_name_dict = {}
    id_list = data["id"].tolist()
    i = 0
    for parent in data["parent_taxon"].unique():
        i += 1
        if i % 1000 == 0:
            time_taken = round(time.time() - start, 2)
            print(time_taken, " seconds, processing " + str(i))
        if not is_nan(parent):
            # if parent id can be found in id list then retrieve its taxon name or species_name_value
            try:
                idx = id_list.index(parent)
                name = data.loc[idx,"taxon_name"]
                if is_nan(name):
                    # add a "species_name_" tag in front of species name, when taxon name is not available
                    name = "species_name: " + data.loc[idx,"species_name_value"]
                id_name_dict[parent] = name
            # if parent id cannot be found in id list then its id = its name
            except ValueError:
                id_name_dict[parent] = parent
    return id_name_dict


In [39]:
# combine all parsed JSON files into one file (maybe this work has already been done and saved as wikidata_records.csv)
DATADIR = "../dataset"
WIKI_DIR = DATADIR + "/wikidata"
PARSED_WIKI_DIR = WIKI_DIR + "/parsed"
full_parsed = pd.DataFrame()
for file in os.listdir(PARSED_WIKI_DIR):
    temp_data = pd.read_json(os.path.join(PARSED_WIKI_DIR,file))
    full_parsed = pd.concat([full_parsed,temp_data],ignore_index=True)

# break species_name into two attributes: species_name_language and species_name_value
species_name_language, species_name_value = \
[full_parsed["species_name"][i]["language"] for i in range(full_parsed.shape[0])],\
[full_parsed["species_name"][i]["value"] for i in range(full_parsed.shape[0])]

full_parsed["species_name_value"], full_parsed["species_name_language"] = species_name_value,species_name_language
full_parsed = full_parsed.drop(columns="species_name")
full_parsed.to_csv(PARSED_WIKI_DIR + "/full_parsed.csv", index= False)


In [5]:
full_parsed.head()

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,t_range,t_start,t_end,mass,length,height,ncbi,images,species_name_value,species_name_language
0,P1034,,,,,,,,,,,[],main food source,en
1,P105,,,,,,,,,,,[],taxon rank,en
2,P1070,,,,,,,,,,,[],PlantList-ID,en
3,P1076,,,,,,,,,,,[],ICTV virus ID,en
4,P1135,,,,,,,,,,,[],nomenclatural status,en


In [151]:
# create a dict whose keys are parent_taxon code and value is the corresponding name (parent taxon code, or taxon name, or species_name_value)
id_name_dict = build_parent_lookup(full_parsed)
parent_id_to_name_dict_file = open(os.path.join(WIKI_DIR,"parent_id_to_name_dict.pkl"), "wb")
pickle.dump(id_name_dict, parent_id_to_name_dict_file)
parent_id_to_name_dict_file.close()

'''
There are some parent_taxon that is not in the id list 
There are also three parent_taxon who don't have a taxon_name, therefore, its species_name_value is used instead:
They are:

(Q21350434) species_name: Carabus subdiv. Procrustimorphi
(Q21350423) species_name: Carabus div. Multistriati
(Q21350410) species_name: Carabus div. Carabogenici
'''

"\nThere are some parent_taxon that is not in the id list \nThere are also three parent_taxon who don't have a taxon_name, therefore, its species_name_value is used instead:\nThey are:\n\n(Q21350434) species_name: Carabus subdiv. Procrustimorphi\n(Q21350423) species_name: Carabus div. Multistriati\n(Q21350410) species_name: Carabus div. Carabogenici\n"

In [127]:
# build the tree (dict) using parent name lookup dict and taxon names
tree = build_tree(full_parsed, fixed_dict)

tree_file = open((os.path.join(WIKI_DIR,"tree.pkl"), "wb")
pickle.dump(tree, tree_file)
tree_file.close()

In [134]:
# some entries of the tree have more than one parent
for key in tree.keys():
    if "Bacteria" in tree[key]:
        print("Bacteria's boss is " + key)
    if "Archaea" in tree[key]:
        print("Archaea's boss is " + key)
    if "Eukaryota" in tree[key]:
        print("Eukaryota's boss is " + key)

Bacteria's boss is Diapheromeridae
Bacteria's boss is Prokaryota
Archaea's boss is Prokaryota
Eukaryota's boss is Biota
Archaea's boss is Archaeidae


In [144]:
# there are cycles in the tree
def find_parent(tree,item):
    for key in tree.keys():
        if item in tree[key]:
            print(item + " has parent " + key)
            find_boss(tree,key)
    print(item + "has no parent")
find_parent(tree,"Eukaryota")

Eukaryota has parent Biota
Biota has parent Cupressaceae
Cupressaceae has parent Pinales
Pinales has parent Pinopsida
Pinopsida has parent Pinophyta
Pinophyta has parent Cycadophytanae
Cycadophytanae has parent Pteridobiotina
Pteridobiotina has parent Tracheophyta
Tracheophyta has parent Embryophyta
Embryophyta has parent Viridiplantae
Viridiplantae has parent Plantae
Plantae has parent Eukaryota
Eukaryota has parent Biota
Biota has parent Cupressaceae
Cupressaceae has parent Pinales
Pinales has parent Pinopsida
Pinopsida has parent Pinophyta
Pinophyta has parent Cycadophytanae
Cycadophytanae has parent Pteridobiotina
Pteridobiotina has parent Tracheophyta
Tracheophyta has parent Embryophyta
Embryophyta has parent Viridiplantae
Viridiplantae has parent Plantae
Plantae has parent Eukaryota
Eukaryota has parent Biota
Biota has parent Cupressaceae
Cupressaceae has parent Pinales
Pinales has parent Pinopsida
Pinopsida has parent Pinophyta
Pinophyta has parent Cycadophytanae
Cycadophytanae 

KeyboardInterrupt: 

In [143]:
# these are the children of Biota
tree["Biota"]

['Acytota',
 'Prokaryota',
 'Eukaryota',
 'Neomura',
 'Aphanobionta',
 'Protoctista',
 'Mychota',
 'Contophora',
 'Palaeopascichnus',
 'Somatohelix',
 'Virus',
 'Monera']

In [None]:
# load pre-computed data for next time use
full_parsed = pd.read_csv("../dataset/wikidata/full_parsed.csv")
with (open(os.path.join(WIKI_DIR,"parent_id_to_name_dict.pkl"), "rb")) as f:
    id_name_dict = pickle.load(f)
with (open(os.path.join(WIKI_DIR,"tree.pkl"), "rb")) as f:
    tree = pickle.load(f)    