In [8]:
# Reload the artefacts that other cells of this notebook write to disk:
# the parsed table, the flat tree and the two id lookups.
PROCESSED_WIKIDIR = "../dataset/wikidata/processed"
data = pd.read_csv(os.path.join(PROCESSED_WIKIDIR, "full_parsed.csv"))

# All three pickle files are opened before anything is loaded and are
# closed together when the block exits.
with open(os.path.join(PROCESSED_WIKIDIR, "tree_flat.pkl"), "rb") as tree_f, \
        open(os.path.join(PROCESSED_WIKIDIR, "id_name_lookup.pkl"), "rb") as id_name_f, \
        open(os.path.join(PROCESSED_WIKIDIR, "id_rank_lookup.pkl"), "rb") as id_rank_f:
    tree, id_name_dict, id_rank_dict = (
        pickle.load(handle) for handle in (tree_f, id_name_f, id_rank_f)
    )


In [3]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import math
import collections
import json

# Relative locations of the dataset folders used throughout the notebook.
DATADIR = "../dataset"
WIKIDIR = f"{DATADIR}/wikidata"
PARSED_WIKI_DIR = f"{WIKIDIR}/parsed"
PROCESSED_WIKIDIR = f"{WIKIDIR}/processed"

def is_nan(value):
    '''
    check if a value is nan
    the value is first converted with float(), so strings that float()
    parses as nan (e.g. "nan") also count as nan.
    anything that cannot be converted to a float (None, ordinary strings,
    integers too large for a float, ...) is reported as not nan.
    return boolean
    '''
    try:
        return math.isnan(float(value))
    # only the conversion errors are expected here; a bare except would also
    # swallow KeyboardInterrupt / SystemExit
    except (TypeError, ValueError, OverflowError):
        return False
    

def build_tree_with_taxon_name (data, id_name_dict):
    '''
    no longer used.
    build the tree of life keyed by names instead of "Q..." codes,
    from the fully combined parsed data.
    data: DataFrame with the columns "parent_taxon", "taxon_name" and
          "species_name_value"; rows are addressed by the labels 0..len-1
    id_name_dict: dict that maps a parent_taxon code to its name
    return dict, where key is a parent name and value is the list of
    the names of its children
    '''
    children_by_parent = {}
    for row in range(len(data)):
        parent_code = data["parent_taxon"][row]
        # an entry without taxon name falls back to its species_name_value
        child_name = data["taxon_name"][row]
        if child_name is None or is_nan(child_name):
            child_name = "species_name_value_" + data["species_name_value"][row]
        # entries without a parent node are not part of the tree
        if parent_code is None or is_nan(parent_code):
            continue
        children_by_parent.setdefault(id_name_dict[parent_code], []).append(child_name)
    return children_by_parent

def build_tree (data):
    '''
    generate tree of life using the code names
    from the fully combined parsed data
    data: DataFrame with the columns "id" and "parent_taxon"
    return dict, where key is each parent code and
    values are their corresponding children codes;
    every id that is never used as a parent is added with an empty list
    '''
    # read the two columns once and walk them by position: indexing the
    # Series row by row (data["id"][i]) is label based, so it is slow and
    # fails when the index is not 0..n-1
    ids = data["id"].tolist()
    parent_codes = data["parent_taxon"].tolist()

    tree_dict = {}
    for child_id, parent_code in zip(ids, parent_codes):
        # only process entries with a certain parent node
        if parent_code is not None and not is_nan(parent_code):
            tree_dict.setdefault(parent_code, []).append(child_id)

    # ids that never appear as a parent are leaves
    for potential_leaf in ids:
        tree_dict.setdefault(potential_leaf, [])
    return tree_dict

# one pass over the ids plus one dictionary lookup per parent
def build_parent_lookup(data):
    '''
    compared to build_parent_lookup_naive, a much quicker way to
    build a lookup dict to link parent_taxon code to a name.
    Its keys are parent_taxon code ("Q....")
    and values are the corresponding taxon_name, if taxon_name is nan, then the
    values are "species_name: " + the corresponding species_name_value,
    if the parent_taxon code is not in the "id" attribute of the fully combined
    parsed data, then the corresponding values are just the original
    parent_taxon code.
    The keys are inserted in sorted order of the parent codes.
    data: DataFrame with the columns "id", "parent_taxon", "taxon_name"
          and "species_name_value"
    raise AssertionError if the "id" column contains duplicates
    return: dict
    '''
    ids = data["id"].tolist()
    # every id must be unique, otherwise a parent code is ambiguous
    assert len(ids) == len(set(ids))
    taxon_names = data["taxon_name"].tolist()
    species_names = data["species_name_value"].tolist()
    # position of each id, so a parent is resolved without scanning the ids
    row_of_id = {id_code: row for row, id_code in enumerate(ids)}

    parents = sorted(
        parent for parent in data["parent_taxon"].unique().tolist()
        if not is_nan(parent)
    )

    id_name_dict = {}
    for parent in parents:
        row = row_of_id.get(parent)
        if row is None:
            # the parent has no record of its own: its code is used as name
            id_name_dict[parent] = parent
            continue
        name = taxon_names[row]
        if is_nan(name):
            # add a "species_name: " tag in front of species name, when taxon name is not available
            name = "species_name: " + species_names[row]
        id_name_dict[parent] = name

    return id_name_dict


# it will take 2.5 hours on my computer
def build_parent_lookup_naive(data):
    '''
    a naive and slow way to perform the same function as build_parent_lookup
    no longer used
    every parent code is searched linearly in the list of ids, and the
    elapsed time is printed every 1000 parents.
    data: DataFrame with the columns "id", "parent_taxon", "taxon_name"
          and "species_name_value"
    return: dict, where keys are parent_taxon codes and values are their names
    '''
    id_name_dict = {}
    id_list = data["id"].tolist()
    taxon_names = data["taxon_name"].tolist()
    species_names = data["species_name_value"].tolist()
    # reference point of the progress messages (was never defined before,
    # which made the first progress print fail with a NameError)
    start_time = time.time()
    for i, parent in enumerate(data["parent_taxon"].unique(), start=1):
        if i % 1000 == 0:
            time_taken = round(time.time() - start_time, 2)
            print(time_taken, " seconds, processing " + str(i))
        if is_nan(parent):
            continue
        # if parent id can be found in id list then retrieve its taxon name or species_name_value
        try:
            idx = id_list.index(parent)
        # if parent id cannot be found in id list then its id = its name
        except ValueError:
            id_name_dict[parent] = parent
            continue
        name = taxon_names[idx]
        if is_nan(name):
            # add a "species_name: " tag in front of species name, when taxon name is not available
            name = "species_name: " + species_names[idx]
        id_name_dict[parent] = name
    return id_name_dict

def get_path_to_root(tree,bottom_node):
    '''
    very slow.
    walk upwards from bottom_node: at every step the whole tree is scanned
    for the first key whose children contain the current node, and that key
    becomes the next node of the path.
    When a node has several parents only the first one found is followed.
    tree: dict, where key is a parent and value is the list of its children
    return a list that starts with bottom_node and continues with its
    ancestors. The list ends in the root, or, when a node shows up a second
    time, in that node followed by the word "cycle!"
    '''
    path = [bottom_node]
    current = bottom_node
    while True:
        for candidate, children in tree.items():
            if current in children:
                # do not consider alternative path when multi-parents occur
                break
        else:
            # nobody lists the current node as a child: it is a root
            return path
        path.append(candidate)
        if path.count(candidate) > 1:
            path.append("cycle!")
            return path
        current = candidate

def get_node_parent(tree):
    '''
    get parent of all nodes in the tree
    tree: dict, where key is a parent and value is the list of its children
    return dict, where keys are the keys of the tree,
    values are the list of their corresponding parent(s), in the order in
    which the parents appear in the tree (empty list for a root).
    The tree is inverted in a single pass instead of scanning every
    children list for every node, so no progress counter is printed anymore.
    '''
    node_parent = {node: [] for node in tree}
    for parent_candidate, children in tree.items():
        # dict.fromkeys drops repeated children, so a parent is listed once
        for child in dict.fromkeys(children):
            parents = node_parent.get(child)
            # children that are not keys of the tree are not reported
            if parents is not None:
                parents.append(parent_candidate)
    return node_parent

def get_repetitive_taxon_name(data):
    '''
    count how often every taxon name occurs in the data (missing names are
    ignored) and keep the names that do not occur exactly once
    data: DataFrame with the column "taxon_name"
    return dict, where key is repetitive taxon name, value is corresponding count
    '''
    occurrences = collections.Counter(data["taxon_name"].dropna())
    return {name: times for name, times in occurrences.items() if times != 1}

def get_id_to_name_lookup(data):
    '''
    Build a look up dictionary of id codes to taxon names for all entries.
    If taxon_name is nan, then the species_name_value is used.
    data: DataFrame with the columns "id", "taxon_name" and "species_name_value"
    return: dict, where keys are id codes and values are corresponding taxon_name,
    or species_name_value
    '''
    # the columns are walked by position: looking a row up with ids[i] is
    # label based, so it is slow and fails when the index is not 0..n-1
    rows = zip(
        data["id"].tolist(),
        data["taxon_name"].tolist(),
        data["species_name_value"].tolist(),
    )
    id_name_dict = {}
    for id_code, name, species_name_value in rows:
        # use species_name_value when taxon name is not available
        id_name_dict[id_code] = species_name_value if is_nan(name) else name

    return id_name_dict

def get_id_to_rank_lookup(data):
    '''
    Build a look up dictionary of id codes to taxon ranks for all entries.
    data: DataFrame with the columns "id" and "taxon_rank"
    return: dict, where keys are id codes and values are corresponding taxon_rank,
    or the string "NA" when the rank is nan
    '''
    # the columns are walked by position: looking a row up with ids[i] is
    # label based, so it is slow and fails when the index is not 0..n-1
    id_rank_dict = {}
    for id_code, rank in zip(data["id"].tolist(), data["taxon_rank"].tolist()):
        id_rank_dict[id_code] = "NA" if is_nan(rank) else rank

    return id_rank_dict


def get_missing_parents(id_name_dict,data):
    '''
    Look for missing parents, which are parent_taxon codes that are not a
    key of id_name_dict.
    id_name_dict: dict whose keys are the known id codes
    data: DataFrame with the column "parent_taxon"
    return: list of id code of the missing parents, each code once, in order
    of first appearance
    '''
    referenced_parents = data["parent_taxon"].dropna().unique().tolist()
    return [parent for parent in referenced_parents if parent not in id_name_dict]

def build_tree_from_root(root_id, tree, id_name, id_rank):
    '''
    recursively turn the flat tree into a nested dict that starts at root_id.
    root_id: id code of the node to start from, it has to be a key of tree
    tree: dict, where key is a parent code and value is the list of its children codes
    id_name: dict of id code to name, a code without entry gets the name ""
    id_rank: dict of id code to rank, a code without entry gets the rank ""
    return dict with the keys "name", "rank", "id" and, when the node has
    children, "children" (list of dicts of the same shape).
    Siblings that share a name are replaced by one node that is appended
    after the other siblings: its id and its rank are the ids and ranks of
    the merged siblings joined with "_", its children are all their children.
    '''
    labels = {}
    for field, lookup in (("name", id_name), ("rank", id_rank)):
        try:
            labels[field] = lookup[root_id]
        except KeyError:
            labels[field] = ""

    children = [
        build_tree_from_root(child_id, tree, id_name, id_rank)
        for child_id in tree[root_id]
    ]
    rooted_tree = {"name": labels["name"], "rank": labels["rank"], "id": root_id}
    # a leaf carries no "children" key at all
    if not children:
        return rooted_tree

    # merge nodes or leaves that are siblings but have identical taxon names
    name_counts = collections.Counter(child["name"] for child in children)
    repeated_names = [name for name, times in name_counts.items() if times > 1]
    merged_children = [child for child in children if child["name"] not in repeated_names]
    for rep in repeated_names:
        merged = {"name": rep, "rank": "", "id": "", "children": []}
        for child in children:
            if child["name"] != rep:
                continue
            # an empty value is replaced, a filled one is extended with "_"
            merged["id"] = merged["id"] + "_" + child["id"] if merged["id"] else child["id"]
            merged["rank"] = merged["rank"] + "_" + child["rank"] if merged["rank"] else child["rank"]
            if "children" in child:
                merged["children"].extend(child["children"])
        merged_children.append(merged)

    rooted_tree["children"] = merged_children
    return rooted_tree

In [92]:
# combine all parsed JSON files into one file (maybe this work has already been done and saved as wikidata_records.csv)
# The folder constant is PARSED_WIKI_DIR. The frames are collected first and
# concatenated once, because concatenating inside the loop copies all rows
# gathered so far on every iteration.
parsed_frames = [
    pd.read_json(os.path.join(PARSED_WIKI_DIR, file))
    for file in os.listdir(PARSED_WIKI_DIR)
]
data = pd.concat(parsed_frames, ignore_index=True)

# break species_name into two attributes: species_name_language and species_name_value
species_name_language = [entry["language"] for entry in data["species_name"]]
species_name_value = [entry["value"] for entry in data["species_name"]]

data["species_name_value"], data["species_name_language"] = species_name_value,species_name_language
# only these columns are kept in the saved table
data = data[["id","taxon_name","taxon_rank","parent_taxon","species_name_value"]]
data.to_csv(PROCESSED_WIKIDIR + "/full_parsed.csv", index= False)


In [None]:
# When an item has no taxon name but has a parent taxon, its species_name_value
# is used as its name.
# "Neonymphon sp." and "Wainuia" are two species_name_value that equal the
# taxon names of other items.
all_taxon_names = data["taxon_name"].unique()
# set of the non-missing names: a membership test against the array above
# scans the whole array for every single species value
taxon_name_set = set(data["taxon_name"].dropna().unique())
species_values = data[data["taxon_name"].isna() & data["parent_taxon"].notna()]["species_name_value"].tolist()

print("species_name_value that is also present in taxon_names: ")
for species_value in species_values:
    if species_value in taxon_name_set:
        print(species_value)

# In cases where species_name_value names are used, these names are also not
# all unique. There are two "Bedotia spec." that use species_name_value.
print("not unique species_name_value: ")
species_count = collections.Counter(species_values)
for species_value, times in species_count.items():
    if times != 1:
        print(species_value + ": " + str(times))

In [4]:
# The two Neonymphon sp. species can be merged,
# and the Wainuia is the parent of the second Wainuia.
names_of_interest = ["Neonymphon sp.", "Wainuia"]
data[
    data["species_name_value"].isin(names_of_interest)
    | data["taxon_name"].isin(names_of_interest)
]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
1435832,Q63708474,Neonymphon sp.,Q7432,Q11845937,Neonymphon sp.
1488753,Q1905950,,Q7432,Q11845937,Neonymphon sp.
1527131,Q3076049,Wainuia,Q34740,Q3075210,Wainuia
2881841,Q7960198,,Q7432,Q3076049,Wainuia


In [93]:
# These two species can be merged.
data[data["species_name_value"].eq("Bedotia spec.")]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
973595,Q4879358,,Q7432,Q141320,Bedotia spec.
2860033,Q4879370,,Q7432,Q141320,Bedotia spec.


In [None]:
# create a dict whose keys are parent_taxon code and value is the corresponding name (parent taxon code, or taxon name, or species_name_value)
#id_name_dict = build_parent_lookup(data)
#parent_id_to_name_dict_file = open(os.path.join(WIKI_DIR,"parent_id_to_name_dict.pkl"), "wb")
#pickle.dump(id_name_dict, parent_id_to_name_dict_file)
#parent_id_to_name_dict_file.close()

# create a dict whose keys are all id codes and values are its corresponding taxon name, if null then species_name_value
# (the files are written inside "with" so they are closed even when dumping fails)
id_name_dict = get_id_to_name_lookup(data)
with open(os.path.join(PROCESSED_WIKIDIR,"id_name_lookup.pkl"), "wb") as id_name_dict_file:
    pickle.dump(id_name_dict, id_name_dict_file)

# create a dict whose keys are all id codes and values are its corresponding taxon rank, if null then "NA"
id_rank_dict = get_id_to_rank_lookup(data)
with open(os.path.join(PROCESSED_WIKIDIR,"id_rank_lookup.pkl"), "wb") as id_rank_dict_file:
    pickle.dump(id_rank_dict, id_rank_dict_file)

# There are some parent_taxon that are not in the id list.
# There are also three parent_taxon that don't have a taxon_name, therefore
# their species_name_value is used instead. They are:
#
# (Q21350434) species_name: Carabus subdiv. Procrustimorphi
# (Q21350423) species_name: Carabus div. Multistriati
# (Q21350410) species_name: Carabus div. Carabogenici

In [101]:
# These are the parents that are not present in the list of ids,
# which means they have no record in the wikidata.
#
# Earlier version, based on the lookup built by build_parent_lookup:
#
# separate_roots = []
# for code, taxon in id_name_dict.items():
#     if code == taxon:
#         separate_roots.append(code)
missing_parents = get_missing_parents(id_name_dict=id_name_dict, data=data)


In [100]:
# There are also entries with no parent information, therefore they are not
# present in any trees.
data[data["parent_taxon"].isna()].head()

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
31362,P1070,,,,PlantList-ID
37931,P1137,,,,fossil found in this unit
68300,P1772,,,,USDA PLANTS ID
70202,Q20672521,,,,Serpentine rouge
87883,Q21398146,,,,Puccinia asteris ß chrysanthemi-leucanthemi


In [None]:
# Write the missing parents and the entries whose parent_taxon is NaN to csv,
# so that query_wikidata.py can fill in the missing information.
missing_parents_frame = pd.DataFrame(data=missing_parents, columns=["id"])
missing_parents_frame.to_csv(f"{PROCESSED_WIKIDIR}/missing_parents.csv", index=False)

entries_without_parent = data[data["parent_taxon"].isna()]
entries_without_parent.to_csv(f"{PROCESSED_WIKIDIR}/no_parent_taxon.csv", index=False)

In [4]:
# Load the entries that were queried again and bring them into the same
# column layout as the main table.
with open(os.path.join(PROCESSED_WIKIDIR, "updated_entries.json")) as a, open(os.path.join(PROCESSED_WIKIDIR, "updated_entries_missing_parents.json")) as b:
    updated_entries,updated_entries_missing_parents = json.load(a), json.load(b)

# the files are only needed for json.load, the frames are built afterwards
extra_data = pd.concat([pd.DataFrame.from_dict(updated_entries_missing_parents, orient="index"),pd.DataFrame.from_dict(updated_entries, orient="index")])

# The index of extra_data holds the keys of the json objects, not 0..n-1,
# so the species_name column is iterated directly instead of being indexed
# with integers.
species_name_language = [entry["language"] for entry in extra_data["species_name"]]
species_name_value = [entry["value"] for entry in extra_data["species_name"]]

extra_data["species_name_value"], extra_data["species_name_language"] = species_name_value,species_name_language
extra_data = extra_data[["id","taxon_name","taxon_rank","parent_taxon","species_name_value"]]

In [131]:
# parents referenced by the newly queried entries that are not a key of id_name_dict
extra_missing_parents = get_missing_parents(id_name_dict=id_name_dict, data=extra_data)
    

In [180]:
# store the codes in a single "id" column
extra_missing_parents_frame = pd.DataFrame(data=extra_missing_parents, columns=["id"])
extra_missing_parents_frame.to_csv(f"{PROCESSED_WIKIDIR}/extra_missing_parents.csv", index=False)

In [182]:
# read the entries that were queried for the extra missing parents;
# every key of the json object becomes one row label
extra_entries_path = os.path.join(PROCESSED_WIKIDIR, "updated_entries_extra_missing_parents.json")
with open(extra_entries_path) as a:
    updated_entries_extra_missing_parents = json.load(a)

extra_extra_data = pd.DataFrame.from_dict(updated_entries_extra_missing_parents, orient="index")

In [185]:
# parents of the last batch of entries that are still not a key of id_name_dict
get_missing_parents(id_name_dict, extra_extra_data.dropna(subset=["parent_taxon"]))

['Q101246679', 'Q107155540', 'Q105756250']

In [None]:
# There are 9482 taxon_name values that occur twice,
# 322 that occur 3 times,
# 25 that occur 4 times,
# 6 that occur 5 times,
# and 1 that occurs 7 times.
repetitive_taxon_name = get_repetitive_taxon_name(data)

# how many names share each repetition count
collections.Counter(repetitive_taxon_name.values())

In [None]:
# Since data has repetitive taxon names, which leads to cycles in the tree
# and incorrectly merged nodes, we use unique id code to build tree instead
# of taxon names.
tree = build_tree(data)

# "with" closes the file even when pickling fails
with open(os.path.join(PROCESSED_WIKIDIR,"tree_flat.pkl"), "wb") as tree_file:
    pickle.dump(tree, tree_file)

In [99]:
# for each group of siblings, check if there are repetitions in the taxon names
# result: parent code -> {repeated name: [codes of the children with that name]}
repetitive_taxon_names = {}
for node_code, children_code in tree.items():
    try:
        children_names = [id_name_dict[child] for child in children_code]
    except KeyError:
        # a child without entry in the name lookup: skip this group of siblings
        continue
    if len(children_names) == len(set(children_names)):
        continue
    # count every name once instead of calling list.count for every child
    name_counts = collections.Counter(children_names)
    repetitive_children = {}
    for child_code, child_name in zip(children_code, children_names):
        if name_counts[child_name] > 1:
            repetitive_children.setdefault(child_name, []).append(child_code)
    repetitive_taxon_names[node_code] = repetitive_children
    # print(node_code, "has children with same taxon names: ", repetitive_children, "\n")

# for each parent, check if its children have different ranks
# result: parent code -> set of the ranks of its children
different_rank_siblings = {}
for node_code, children_code in tree.items():
    try:
        children_ranks = {id_rank_dict[child] for child in children_code}
    except KeyError:
        # a child without entry in the rank lookup: skip this group of siblings
        continue
    if len(children_ranks) > 1:
        different_rank_siblings[node_code] = children_ranks
        # print(node_code, "has children with different ranks: ", children_ranks)

In [11]:
# There are two Biota, but the second one is the root.
data[data["taxon_name"].eq("Biota")]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
1669271,Q87177320,Biota,Q34740,Q146037,Biota
2821400,Q2382443,Biota,Q22666877,,biota


In [22]:
# Nest the flat tree below Biota and store it as json.
BIOTA_ROOT_ID = "Q2382443"  # id of the Biota entry that is the root
tree_from_biota = build_tree_from_root(BIOTA_ROOT_ID, tree, id_name_dict, id_rank_dict)

tree_json_path = os.path.join(PROCESSED_WIKIDIR, "tree.json")
with open(tree_json_path, 'w') as fp:
    json.dump(tree_from_biota, fp)

In [18]:
# build the "dangling trees" for roots that are not Biota:
# one json file per missing parent, named after its id code
dangling_dir = os.path.join(PROCESSED_WIKIDIR, "dangling_trees")
# the folder is created on demand, open() does not create missing folders
os.makedirs(dangling_dir, exist_ok=True)
for missing_parent in missing_parents:
    dangling_tree = build_tree_from_root(missing_parent, tree, id_name_dict, id_rank_dict)
    file_name = os.path.join(dangling_dir, missing_parent + ".json")
    with open(file_name, 'w') as fp:
        json.dump(dangling_tree, fp, indent=4)