In [3]:
# load pre-computed files
# The imports are repeated here (they also live in the setup cell) so that this
# cell runs on a fresh kernel; re-importing a loaded module is a no-op.
import os
import pickle

import pandas as pd

WIKIDIR = "../dataset/wikidata"
FULL_PARSED_COLUMNS = ["id","taxon_name","taxon_rank","parent_taxon","species_name_value"]

# usecols avoids loading columns that are dropped immediately; the trailing
# [FULL_PARSED_COLUMNS] pins the column order, which usecols alone does not.
data = pd.read_csv(os.path.join(WIKIDIR, "full_parsed.csv"), usecols=FULL_PARSED_COLUMNS)[FULL_PARSED_COLUMNS]

# NOTE: pickle.load can execute arbitrary code; only load files that were
# produced by this notebook.
# One `with` per file so each handle is closed as soon as it has been read.
with open(os.path.join(WIKIDIR, "tree_flat.pkl"),"rb") as tree_f:
    tree = pickle.load(tree_f)
with open(os.path.join(WIKIDIR, "id_name_lookup.pkl"),"rb") as id_name_f:
    id_name_dict = pickle.load(id_name_f)
with open(os.path.join(WIKIDIR, "id_rank_lookup.pkl"),"rb") as id_rank_f:
    id_rank_dict = pickle.load(id_rank_f)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [17]:
import pandas as pd
import numpy as np
import os
import time
import pickle
import math
import collections
import json

# root folder of the datasets, given relative to the notebook's working directory
DATADIR = "../dataset"
# folder where the pickled lookups and the flat tree are written
WIKI_DIR = DATADIR + "/wikidata"
# folder whose files are read and combined into full_parsed.csv
PARSED_WIKI_DIR = WIKI_DIR + "/parsed"

def is_nan(value):
    '''
    check if a value is nan
    value: anything; it is passed through float() before the nan test, so
    numeric strings are accepted (the string "nan" counts as nan)
    return boolean, False for anything that cannot be converted to a float
    (None, ordinary strings, integers too large for a float, ...)
    '''
    try:
        return math.isnan(float(value))
    # only conversion failures mean "not nan"; KeyboardInterrupt and other
    # unexpected errors are no longer silently swallowed
    except (TypeError, ValueError, OverflowError):
        return False
    

def build_tree_with_taxon_name(data, id_name_dict):
    '''
    no longer used.
    build a parent-name -> children-names mapping from the fully combined
    parsed data, using names instead of the "Q..." codes.
    data: frame with "parent_taxon", "taxon_name" and "species_name_value"
    id_name_dict: dict used to translate a parent code into the parent's name
    a row without a taxon name is listed as "species_name_value_" followed by
    its species_name_value; rows without a parent are skipped
    return dict, where key is a parent name and value is the list of child names
    '''
    children_by_parent = {}
    for row in range(data.shape[0]):
        parent_code = data["parent_taxon"][row]
        child_name = data["taxon_name"][row]
        # fall back to the species name when the taxon name is missing
        if child_name is None or is_nan(child_name):
            child_name = "species_name_value_" + data["species_name_value"][row]
        # entries without a parent node are not part of the tree
        if parent_code is None or is_nan(parent_code):
            continue
        parent_name = id_name_dict[parent_code]
        children_by_parent.setdefault(parent_name, []).append(child_name)
    return children_by_parent

def build_tree(data):
    '''
    generate tree of life using the code names
    from the fully combined parsed data
    data: frame with the columns "id" and "parent_taxon"
    return dict, where key is each parent code and
    values are their corresponding children codes; every id of the data is a
    key, leaves map to an empty list
    '''
    tree_dict = {}
    # plain lists are iterated by position, so the result does not depend on
    # the frame's index labels and avoids one Series lookup per row
    node_ids = data["id"].tolist()
    parent_codes = data["parent_taxon"].tolist()

    for node_id, parent_code in zip(node_ids, parent_codes):
        # only process entries with a certain parent node
        if parent_code is not None and not is_nan(parent_code):
            tree_dict.setdefault(parent_code, []).append(node_id)

    # ids that never appear as a parent are leaves
    for node_id in node_ids:
        tree_dict.setdefault(node_id, [])
    return tree_dict

# the original merge-walk version took about 30 seconds on the author's
# computer; this dict-based version has not been re-timed
def build_parent_lookup(data):
    '''
    compared to build_parent_lookup_naive, a much quicker way to
    build a lookup dict to link parent_taxon code to a name.
    Its keys are parent_taxon code ("Q....")
    and values are the corresponding taxon_name, if taxon_name is nan, then the
    values are "species_name: " followed by the corresponding
    species_name_value, if the parent_taxon code
    is not in the "id" attribute of the fully combined parsed data, then the
    corresponding values are just the original parent_taxon code
    data: frame with "id", "parent_taxon", "taxon_name", "species_name_value"
    return: dict, keys are inserted in sorted parent code order
    raise AssertionError when the ids of the data are not unique
    '''
    data = data.sort_values(by=['id'],ignore_index=True)
    ids = data["id"].tolist()
    assert ids == data["id"].unique().tolist()

    # id -> (taxon_name, species_name_value); one pass instead of walking the
    # sorted id list once per parent
    names_by_id = dict(zip(
        ids,
        zip(data["taxon_name"].tolist(), data["species_name_value"].tolist()),
    ))

    # filter into a new list: removing items from a list while iterating over
    # it skips elements
    parents = sorted(
        parent for parent in data["parent_taxon"].unique().tolist()
        if not is_nan(parent)
    )

    id_name_dict = {}
    for parent in parents:
        if parent not in names_by_id:
            # the parent has no record of its own: the code doubles as the name
            id_name_dict[parent] = parent
            continue
        name, species_name_value = names_by_id[parent]
        if is_nan(name):
            # add a "species_name: " tag in front of species name, when taxon name is not available
            name = "species_name: " + species_name_value
        id_name_dict[parent] = name

    return id_name_dict


# it will take 2.5 hours on my computer
def build_parent_lookup_naive(data):
    '''
    a naive and slow way to perform the same function as build_parent_lookup
    no longer used
    data: frame with "id", "parent_taxon", "taxon_name", "species_name_value"
    prints the elapsed time after every 1000 processed parents
    return: dict, where keys are parent_taxon codes and values are the taxon
    name, or "species_name: " + species_name_value, or the code itself when
    the parent is not in the id list
    '''
    id_name_dict = {}
    id_list = data["id"].tolist()
    # reference point of the progress messages (it was never defined before,
    # so the first progress print raised NameError)
    start = time.time()
    for i, parent in enumerate(data["parent_taxon"].unique(), start=1):
        if i % 1000 == 0:
            time_taken = round(time.time() - start, 2)
            print(time_taken, " seconds, processing " + str(i))
        if is_nan(parent):
            continue
        try:
            idx = id_list.index(parent)
        # if parent id cannot be found in id list then its id = its name
        except ValueError:
            id_name_dict[parent] = parent
            continue
        # idx is a position in id_list, so the row is read by position too
        name = data["taxon_name"].iloc[idx]
        if is_nan(name):
            # add a "species_name: " tag in front of species name, when taxon name is not available
            name = "species_name: " + data["species_name_value"].iloc[idx]
        id_name_dict[parent] = name
    return id_name_dict

def get_path_to_root(tree,bottom_node):
    '''
    very slow (every step scans the whole tree).
    climb from bottom_node towards the root, one parent at a time.
    tree: dict of parent -> list of children
    when a node has several parents only the first one found (in the key
    order of the tree) is followed.
    return a list that starts with bottom_node and continues with its
    ancestors; it ends in the root, or, when a node shows up a second time,
    in that node followed by the marker "cycle!"
    '''
    path = [bottom_node]
    current = bottom_node
    while True:
        for candidate, children in tree.items():
            if current in children:
                # first matching parent wins, alternatives are ignored
                break
        else:
            # nobody lists the current node as a child: the root is reached
            return path
        path.append(candidate)
        if path.count(candidate) > 1:
            path.append("cycle!")
            return path
        current = candidate

def get_node_parent(tree):
    '''
    get parent of all nodes in the tree
    tree: dict of parent -> list of children
    return dict, where keys are the nodes (the keys of the tree, in the same
    order), values are lists of their corresponding parent(s) in the key
    order of the tree; nodes without a parent get an empty list
    the tree is inverted in one pass instead of scanning every child list
    once per node, so the progress counter is not printed any more
    '''
    node_parent = {node: [] for node in tree}
    for parent_candidate, children in tree.items():
        # dict.fromkeys drops repeated children while keeping their order, so
        # a parent is listed only once per node
        for child in dict.fromkeys(children):
            # children that are not keys of the tree get no entry
            if child in node_parent:
                node_parent[child].append(parent_candidate)
    return node_parent

def get_repetitive_taxon_name(data):
    '''
    count how often each taxon name occurs and keep the names that occur
    more than once; rows without a taxon name are ignored
    return dict, where key is repetitive taxon name, value is corresponding count
    '''
    named_rows = data[data["taxon_name"].notna()]
    occurrences = collections.Counter(named_rows["taxon_name"])
    return {taxon: times for taxon, times in occurrences.items() if times != 1}

def get_id_to_name_lookup(data):
    '''
    Build a look up dictionary of id codes to taxon names for all entries.
    If taxon_name is nan, then the species_name_value is used.
    data: frame with the columns "id", "taxon_name", "species_name_value"
    return: dict, where keys are id codes and values are corresponding taxon_name,
    or species_name_value
    '''
    id_name_dict = {}
    # plain lists are walked by position: independent of the frame's index
    # labels and without one Series lookup per row
    rows = zip(
        data["id"].tolist(),
        data["taxon_name"].tolist(),
        data["species_name_value"].tolist(),
    )
    for id_code, name, species_name_value in rows:
        if is_nan(name):
            # use_species_name_value when taxon name is not available
            name = species_name_value
        id_name_dict[id_code] = name

    return id_name_dict

def get_id_to_rank_lookup(data):
    '''
    Build a look up dictionary of id codes to taxon ranks for all entries.
    data: frame with the columns "id" and "taxon_rank"
    return: dict, where keys are id codes and values are corresponding taxon_rank,
    or the string "NA" when the rank is nan
    '''
    id_rank_dict = {}
    # plain lists are walked by position: independent of the frame's index
    # labels and without one Series lookup per row
    for id_code, rank in zip(data["id"].tolist(), data["taxon_rank"].tolist()):
        if is_nan(rank):
            rank = "NA"
        id_rank_dict[id_code] = rank

    return id_rank_dict


def get_missing_parents(id_name_dict,data):
    '''
    Collect the parent codes that are not keys of id_name_dict, i.e. parents
    without a record of their own.
    id_name_dict: lookup whose keys are the known id codes
    data: frame with the column "parent_taxon"; empty parents are ignored
    return: list of id code of the missing parents, each listed once, in
    order of first appearance.
    '''
    known_parents = data[data["parent_taxon"].notna()]["parent_taxon"]
    return [
        code for code in known_parents.unique().tolist()
        if code not in id_name_dict
    ]

def build_tree_from_root(root_id, tree, id_name, id_rank):
    '''
    recursively turn the flat tree into a nested dict rooted at root_id
    root_id: code of the node to start from, it has to be a key of tree
    tree: dict of parent code -> list of children codes
    id_name, id_rank: lookups from code to name / rank, a code that is
    missing from a lookup gets an empty string
    return dict with the keys "name", "rank", "id" and, unless the node is a
    leaf, "children" (list of nested dicts of the same form).
    siblings that share a name are merged into one node: its id and rank are
    the "_"-joined ids and ranks of the merged siblings, its children are all
    of their children (always present, possibly empty); merged nodes come
    after the siblings with unique names
    '''
    def lookup_or_blank(table):
        try:
            return table[root_id]
        except KeyError:
            return ""

    def join_codes(joined, code):
        # an empty accumulator is replaced, otherwise "_" + code is attached
        return joined + ("_" + code) if joined else code

    node = {
        "name": lookup_or_blank(id_name),
        "rank": lookup_or_blank(id_rank),
        "id": root_id,
        "children": [
            build_tree_from_root(child_id, tree, id_name, id_rank)
            for child_id in tree[root_id]
        ],
    }

    subtrees = node["children"]
    if not subtrees:
        # leaves carry no "children" entry at all
        del node["children"]
        return node

    # merge nodes or leaves that are siblings but has identical taxon names
    name_counts = collections.Counter(subtree["name"] for subtree in subtrees)
    repeated_names = [name for name, count in name_counts.items() if count > 1]
    merged_children = [
        subtree for subtree in subtrees
        if subtree["name"] not in repeated_names
    ]
    for rep in repeated_names:
        merged = {"name": rep, "rank": "", "id": "", "children": []}
        for subtree in subtrees:
            if subtree["name"] != rep:
                continue
            merged["id"] = join_codes(merged["id"], subtree["id"])
            merged["rank"] = join_codes(merged["rank"], subtree["rank"])
            if "children" in subtree:
                merged["children"].extend(subtree["children"])
        merged_children.append(merged)

    node["children"] = merged_children
    return node

In [None]:
# combine all parsed JSON files into one file (maybe this work has already been done and saved as wikidata_records.csv)
# The combined CSV is written into the same folder that is read here, so it is
# skipped on input; otherwise a second run would hand the CSV to read_json.
# sorted() makes the row order independent of the order os.listdir returns.
# NOTE(review): the loading cell reads full_parsed.csv from the wikidata folder,
# not from its parsed/ subfolder - confirm which location is intended.
parsed_frames = [
    pd.read_json(os.path.join(PARSED_WIKI_DIR,file))
    for file in sorted(os.listdir(PARSED_WIKI_DIR))
    if file != "full_parsed.csv"
]
# one concat for all files instead of growing the frame file by file
data = pd.concat(parsed_frames,ignore_index=True)

# break species_name into two attributes: species_name_language and species_name_value
species_names = data["species_name"].tolist()
data["species_name_value"] = [species_name["value"] for species_name in species_names]
data["species_name_language"] = [species_name["language"] for species_name in species_names]
data = data.drop(columns="species_name")
data.to_csv(PARSED_WIKI_DIR + "/full_parsed.csv", index= False)


In [None]:
# when an item has no taxon name but has a parent taxon, use its species_name_value as its name.
# "Neonymphon sp." and "Wainuia" are two species_name_value that equals the taxon names of other
# items
all_taxon_names = data["taxon_name"].unique()
unnamed_with_parent = data["taxon_name"].isna() & data["parent_taxon"].notna()
species_values = data[unnamed_with_parent]["species_name_value"].tolist()

print("species_name_value that is also present in taxon_names: ")
for species_value in species_values:
    if species_value in all_taxon_names:
        print(species_value)

# In cases where species_name_value names are used, these names are also not all unique.
# There are two "Bedotia spec." that use species_name_value
print("not unique species_name_value: ")
species_count = collections.Counter(species_values)
for species_value, occurrences in species_count.items():
    if occurrences != 1:
        print(species_value + ": " + str(occurrences))

In [4]:
# The two Neonymphon sp. species can be merged
# And the Wainuia is the parent of the second Wainuia
names_of_interest = ["Neonymphon sp.", "Wainuia"]
data[
    data["species_name_value"].isin(names_of_interest)
    | data["taxon_name"].isin(names_of_interest)
]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
1435832,Q63708474,Neonymphon sp.,Q7432,Q11845937,Neonymphon sp.
1488753,Q1905950,,Q7432,Q11845937,Neonymphon sp.
1527131,Q3076049,Wainuia,Q34740,Q3075210,Wainuia
2881841,Q7960198,,Q7432,Q3076049,Wainuia


In [5]:
# These two species can be merged
data.loc[data["species_name_value"].eq("Bedotia spec.")]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
973595,Q4879358,,Q7432,Q141320,Bedotia spec.
2860033,Q4879370,,Q7432,Q141320,Bedotia spec.


In [None]:
# create a dict whose keys are parent_taxon code and value is the corresponding name (parent taxon code, or taxon name, or species_name_value)
#id_name_dict = build_parent_lookup(data)
#parent_id_to_name_dict_file = open(os.path.join(WIKI_DIR,"parent_id_to_name_dict.pkl"), "wb")
#pickle.dump(id_name_dict, parent_id_to_name_dict_file)
#parent_id_to_name_dict_file.close()

# create a dict whose keys are all id codes and values are its corresponding taxon name, if null then species_name_value
id_name_dict = get_id_to_name_lookup(data)
# `with` closes the file even when pickling fails part-way
with open(os.path.join(WIKI_DIR,"id_name_lookup.pkl"), "wb") as id_name_dict_file:
    pickle.dump(id_name_dict, id_name_dict_file)

# create a dict whose keys are all id codes and values are its corresponding taxon rank, if null then "NA"
id_rank_dict = get_id_to_rank_lookup(data)
with open(os.path.join(WIKI_DIR,"id_rank_lookup.pkl"), "wb") as id_rank_dict_file:
    pickle.dump(id_rank_dict, id_rank_dict_file)
'''
There are some parent_taxon that is not in the id list 
There are also three parent_taxon who don't have a taxon_name, therefore, its species_name_value is used instead:
They are:

(Q21350434) species_name: Carabus subdiv. Procrustimorphi
(Q21350423) species_name: Carabus div. Multistriati
(Q21350410) species_name: Carabus div. Carabogenici
'''

In [6]:
# these are the parents that are not present in the list of ids
# which means they have no record in the wikidata
#
# earlier approach, based on the lookup built by build_parent_lookup:
#
# separate_roots = []
# for code, taxon in id_name_dict.items():
#     if code == taxon:
#         separate_roots.append(code)
missing_parents = get_missing_parents(id_name_dict, data)
missing_parents

['Q15869465',
 'Q99442800',
 'Q17319474',
 'Q4150646',
 'Q235536',
 'Q27110262',
 'Q99521037',
 'Q15341165',
 'Q13411080',
 'Q27973466',
 'Q5226073',
 'Q96381504',
 'Q72288',
 'Q845214',
 'Q102318370',
 'Q105134179',
 'Q105134195',
 'Q105452043',
 'Q934008',
 'Q102043669',
 'Q19409463',
 'Q17285269',
 'Q102047357',
 'Q56453547',
 'Q100582257',
 'Q105134193',
 'Q21367139',
 'Q84986611',
 'Q105130613',
 'Q14458220',
 'Q99424267']

In [None]:
# There are 9482 taxon_name repeating for twice,
# 322 taxon_name repeating 3 times,
# 25 taxon_name repeating for 4 times,
# 6 taxon_name repeating for 5 times,
# 1 taxon_name repeating for 7 times
repetitive_taxon_name = get_repetitive_taxon_name(data)

# histogram: how many names are repeated 2, 3, 4, ... times
collections.Counter(repetitive_taxon_name.values())

In [None]:
# Since data has repetitive taxon names, which leads to cycles in the tree
# and incorrectly merged nodes, we use unique id code to build tree instead
# of taxon names
tree = build_tree(data)

# `with` closes the file even when pickling fails part-way
with open(os.path.join(WIKI_DIR,"tree_flat.pkl"), "wb") as tree_file:
    pickle.dump(tree, tree_file)

In [7]:
# for each group of siblings, check if there are repetitions in the taxon names
# repetitive_taxon_names: parent code -> {repeated name: [codes of the children with that name]}
repetitive_taxon_names = {}
for node_code, children_code in tree.items():
    try:
        children_names = [id_name_dict[child] for child in children_code]
    except KeyError:
        # a child without an entry in the name lookup: the sibling group is
        # skipped; any other error is a real problem and is not hidden
        continue

    # counting once per sibling group replaces one list.count per child
    name_counts = collections.Counter(children_names)
    if len(name_counts) == len(children_names):
        continue

    repetitive_children = {}
    for child_code, child_name in zip(children_code, children_names):
        if name_counts[child_name] > 1:
            repetitive_children.setdefault(child_name, []).append(child_code)
    repetitive_taxon_names[node_code] = repetitive_children
    print(node_code, "has children with same taxon names: ", repetitive_children, "\n")

Q10771886 has children with same taxon names:  {'Roegneria shandongensis': ['Q94445016', 'Q94445021']} 

Q157634 has children with same taxon names:  {'Clematis ‘Star of India’': ['Q2978844', 'Q83638443'], 'Clematis lineariloba': ['Q93342747', 'Q93342713']} 

Q34687 has children with same taxon names:  {'Rosa ‘Ave Maria’': ['Q2873382', 'Q96278591'], 'Rosa ‘Broadway’': ['Q24192323', 'Q83690929'], 'Rosa ‘Clementine’': ['Q30893497', 'Q83666700'], 'Rosa ‘Cabana’': ['Q83646992', 'Q83738140'], 'Rosa ‘Violetta’': ['Q83670269', 'Q83690940'], 'Rosa ‘Bordeaux’': ['Q83671863', 'Q83690957'], 'Rosa ‘Golden Border’': ['Q83690731', 'Q83690886'], 'Rosa ‘Matilda’': ['Q83690853', 'Q83668974'], 'Rosa ‘Mambo’': ['Q83690866', 'Q16626382'], 'Rosa ‘Indian Summer’': ['Q83690882', 'Q83675226'], 'Rosa ‘Kathleen’': ['Q83690936', 'Q83667956'], 'Rosa ‘Bianca’': ['Q96278546', 'Q83667443'], 'Rosa ‘Elfe’': ['Q4048459', 'Q83690802'], 'Rosa ‘Silva’': ['Q83647122', 'Q83738965'], 'Rosa ‘Parks’ Yellow Tea-scented China’':

Q158501 has children with same taxon names:  {'Carex nebrascensis': ['Q5039106', 'Q65210266']} 

Q163988 has children with same taxon names:  {'Neottia subg. Listera': ['Q94537199', 'Q94537168'], 'Neottia grandiflora': ['Q94537379', 'Q94537413'], 'Neottia australis': ['Q94537238', 'Q94537260']} 

Q161173 has children with same taxon names:  {'Ipomoea ×sloteri': ['Q104485194', 'Q6065278']} 

Q141475 has children with same taxon names:  {'Orostachys ramosissima': ['Q94377165', 'Q94377212']} 

Q3283496 has children with same taxon names:  {'Pinalia connata': ['Q92667992', 'Q92668044']} 

Q157017 has children with same taxon names:  {'Magnolia soulangeana': ['Q104864635', 'Q104864638']} 

Q164454 has children with same taxon names:  {'Calliandra paganuccii': ['Q92627549', 'Q92627606']} 

Q15980308 has children with same taxon names:  {'Calonema bicalliatum': ['Q93416703', 'Q93416659'], 'Calonema flaccidum': ['Q93417844', 'Q93417813'], 'Calonema filamentosum': ['Q93417508', 'Q93417594'], 'C

Q775899 has children with same taxon names:  {'Dibrachia': ['Q21219947', 'Q51823874']} 

Q19360237 has children with same taxon names:  {'Polycoccum crespoae': ['Q21302728', 'Q105065248'], 'Polycoccum jamesii': ['Q105065249', 'Q21302684']} 

Q3919240 has children with same taxon names:  {'Lecidea arthonioides': ['Q87818246', 'Q87818397']} 

Q1933387 has children with same taxon names:  {'Conocybe candida': ['Q105051390', 'Q10458472'], 'Conocybe crispa': ['Q105051392', 'Q10458474'], 'Conocybe tenera': ['Q1641496', 'Q105051402'], 'Conocybe togularis': ['Q10458541', 'Q105051403'], 'Conocybe rickenii': ['Q105051399', 'Q5162125'], 'Conocybe blattaria': ['Q105051389', 'Q10458468'], 'Conocybe rickeniana': ['Q105051398', 'Q10458523'], 'Conocybe rugosa': ['Q105051400', 'Q10458525'], 'Conocybe apala': ['Q1632206', 'Q105051388']} 

Q5118293 has children with same taxon names:  {'Agylla virilis': ['Q7913066', 'Q99695551']} 

Q602790 has children with same taxon names:  {'Arctata': ['Q21221074', 'Q

Q305162 has children with same taxon names:  {'Puccinia cymbopogonis': ['Q105066251', 'Q1449695'], 'Puccinia pelargonii-zonalis': ['Q10646270', 'Q105066256'], 'Puccinia xanthii': ['Q288667', 'Q105066260'], 'Puccinia kenmorensis': ['Q105066252', 'Q1478613'], 'Puccinia unica': ['Q105066259', 'Q1232996'], 'Puccinia calthae': ['Q105066250', 'Q1446218']} 

Q1754819 has children with same taxon names:  {'Paraleirides': ['Q50742230', 'Q56022295'], 'Cumeres': ['Q56005944', 'Q4036769'], 'Leiromorpha': ['Q4042748', 'Q56022289'], 'Bradytulus': ['Q56005941', 'Q4035376'], 'Parapercosia': ['Q56022296', 'Q50742245'], 'Amathitis': ['Q4033872', 'Q56005935'], 'Armatoleirides': ['Q4034380', 'Q56005938'], 'Harpaloamara': ['Q4040427', 'Q56022283'], 'Hyalamara': ['Q4040846', 'Q56022286'], 'Microleirus': ['Q4044212', 'Q56022292'], 'Leuris': ['Q56022291', 'Q4042822'], 'Paracelia': ['Q56022294', 'Q16528102'], 'Zabroscelis': ['Q50742297', 'Q56033111'], 'Amarocelia': ['Q56005934', 'Q4033871'], 'Leiramara': ['Q56

Q306495 has children with same taxon names:  {'Cortinarius paleaceus': ['Q11887243', 'Q105052002'], 'Cortinarius cinnamomeus': ['Q1084167', 'Q105051865'], 'Cortinarius albidus': ['Q2998110', 'Q105051811'], 'Cortinarius croceus': ['Q4347259', 'Q105051871'], 'Cortinarius evernius': ['Q5481564', 'Q105051884'], 'Cortinarius corrosus': ['Q10437677', 'Q105051870'], 'Cortinarius jenolanensis': ['Q24916020', 'Q105051963'], 'Cortinarius sclerophyllarum': ['Q49620024', 'Q105052031'], 'Cortinarius alboviolaceus': ['Q105051812', 'Q3915839'], 'Cortinarius austroalbidus': ['Q105051824', 'Q10460044'], 'Cortinarius glaucopus': ['Q105051956', 'Q3694823'], 'Cortinarius piriformis': ['Q105052010', 'Q10460182'], 'Cortinarius venetus': ['Q105052115', 'Q10609165'], 'Cortinarius rotundisporus': ['Q5173326', 'Q105052019'], 'Cortinarius sanguineus': ['Q9387379', 'Q105052026'], 'Cortinarius largus': ['Q10570865', 'Q105051969'], 'Cortinarius fragilipes': ['Q63140788', 'Q105051887'], 'Cortinarius argyrionus': ['Q


Q10580981 has children with same taxon names:  {'Melaspilea gallowayi': ['Q21320763', 'Q105058740']} 

Q139742 has children with same taxon names:  {'Eopaederus': ['Q21360864', 'Q21439097'], 'Anomalopaederus': ['Q21360860', 'Q21439103']} 

Q7576501 has children with same taxon names:  {'Sphaerellothecium gallowayi': ['Q105066986', 'Q10674973']} 

Q10644329 has children with same taxon names:  {'Pseudosinella strinatii': ['Q14854805', 'Q71967695']} 

Q127248 has children with same taxon names:  {'Skyttea mayrhoferi': ['Q21326460', 'Q105066879']} 

Q19748 has children with same taxon names:  {'Macrolepiota mastoidea': ['Q206727', 'Q105058394'], 'Macrolepiota rhacodes': ['Q84832142', 'Q105058397'], 'Macrolepiota bonaerensis': ['Q105058390', 'Q105058391']} 

Q39275 has children with same taxon names:  {'Mus rattoides': ['Q72068322', 'Q72068243'], 'Mus cinnamomeus': ['Q72068582', 'Q72068645']} 

Q2336729 has children with same taxon names:  {'Balanus roseus': ['Q48337862', 'Q63724117']} 



Q10553033 has children with same taxon names:  {'Laetisaria marsonii': ['Q105057335', 'Q105057337']} 

Q1886599 has children with same taxon names:  {'Lepista sordida': ['Q3229951', 'Q105057799'], 'Lepista personata': ['Q105057797', 'Q25379773'], 'Lepista nuda': ['Q105057796', 'Q29362']} 

Q4801389 has children with same taxon names:  {'Pseudosesidia': ['Q21230633', 'Q56242036']} 

Q3936440 has children with same taxon names:  {'Gorgonia gracilis': ['Q74816659', 'Q3944406']} 

Q534535 has children with same taxon names:  {'Cavernicola': ['Q61785198', 'Q21616179']} 

Q3734558 has children with same taxon names:  {'Eupelmus orientalis': ['Q67173477', 'Q13623849']} 

Q3012600 has children with same taxon names:  {'Gastrochaena brevis': ['Q69336423', 'Q13640071']} 

Q1637675 has children with same taxon names:  {'Mutinus caninus': ['Q1500672', 'Q105059200'], 'Mutinus discolor': ['Q105059202', 'Q105059203']} 

Q2033960 has children with same taxon names:  {'Lentinus edodes': ['Q105057496', 

Q3015183 has children with same taxon names:  {'Genysa decorsei': ['Q1873973', 'Q2709981']} 

Q2907363 has children with same taxon names:  {'Pterocella': ['Q4263518', 'Q104401278']} 

Q5527933 has children with same taxon names:  {'Gautieria morchelliformis': ['Q2170668', 'Q105054627']} 

Q1934080 has children with same taxon names:  {'Austroboletus lacunosus': ['Q105049725', 'Q10422949']} 

Q341780 has children with same taxon names:  {'Agrocybe parasitica': ['Q10403508', 'Q105049013'], 'Agrocybe pusiola': ['Q10647200', 'Q105049017'], 'Agrocybe limonia': ['Q105049007', 'Q10403503']} 

Q10447771 has children with same taxon names:  {'Chaetopsis macroclada': ['Q5066711', 'Q105050413']} 

Q913940 has children with same taxon names:  {'Clathrus ruber': ['Q105050637', 'Q579190'], 'Clathrus cancellatus': ['Q105050630', 'Q21872369'], 'Clathrus archeri': ['Q105050629', 'Q1757951'], 'Clathrus denudatus': ['Q21872374', 'Q105050632']} 

Q3294497 has children with same taxon names:  {'Inuliphila

Q61653739 has children with same taxon names:  {'Armillariella novae-zelandiae': ['Q105049489', 'Q62079392']} 

Q5361179 has children with same taxon names:  {'Acteonimorpha': ['Q56252896', 'Q61677170']} 

Q61764291 has children with same taxon names:  {'Solenia sulphurea': ['Q105066887', 'Q61855185'], 'Solenia fasciculata': ['Q105066885', 'Q61855179'], 'Solenia candida': ['Q61764318', 'Q105066884']} 

Q61885451 has children with same taxon names:  {'Psalliota arvensis': ['Q105066070', 'Q61885815'], 'Psalliota elatior': ['Q105066077', 'Q105066076'], 'Psalliota langei': ['Q105066084', 'Q61929946'], 'Psalliota campestris': ['Q105066074', 'Q61914278'], 'Psalliota arenicola': ['Q105066069', 'Q61928264'], 'Psalliota gennadii': ['Q105066079', 'Q105066078']} 

Q25365897 has children with same taxon names:  {'Thraustochytrium proliferum': ['Q61998776', 'Q105067883']} 

Q10486300 has children with same taxon names:  {'Epilampra puncticollis': ['Q62108456', 'Q62108468']} 

Q18117279 has children

Q15869897 has children with same taxon names:  {'Cerosterna scabrator': ['Q28432070', 'Q15858557']} 

Q177879 has children with same taxon names:  {'Cobitoidea': ['Q100143625', 'Q2246081']} 

Q21342261 has children with same taxon names:  {'Inaequifurcata': ['Q21342260', 'Q61668001'], 'Aequifurcata': ['Q21342264', 'Q61668002']} 

Q3042119 has children with same taxon names:  {'Pleraplysilla': ['Q3392278', 'Q104399348']} 

Q14529689 has children with same taxon names:  {'Zygaena haberhaueri haberhaueri': ['Q21377873', 'Q83970548'], 'Zygaena haberhaueri optima': ['Q83971435', 'Q21377875'], 'Zygaena haberhaueri demangei': ['Q21377871', 'Q83970563']} 

Q3197786 has children with same taxon names:  {'Acodontaster': ['Q3198001', 'Q104913171']} 

Q134869 has children with same taxon names:  {'Dicyemida': ['Q21447146', 'Q2224427']} 

Q25365857 has children with same taxon names:  {'Cyanoptyche gloeocystis': ['Q105444383', 'Q69523820']} 

Q19939 has children with same taxon names:  {'Panthera t

In [8]:
# number of parent nodes with at least two children that share a name
len(repetitive_taxon_names)

1033

In [9]:
# for each parent, check if its children has different ranks
# different_rank_siblings: parent code -> set of the ranks found among its children
different_rank_siblings = {}
for node_code, children_code in tree.items():
    try:
        children_ranks = {id_rank_dict[child] for child in children_code}
    except KeyError:
        # a child without an entry in the rank lookup: the sibling group is
        # skipped; any other error is a real problem and is not hidden
        continue
    if len(children_ranks) > 1:
        different_rank_siblings[node_code] = children_ranks
        print(node_code, "has children with different ranks: ", children_ranks)

Q602740 has children with different ranks:  {'Q767728', 'Q3181348', 'Q3238261', 'Q5998839', 'Q7432', 'NA'}
Q157396 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3238261', 'Q3181348'}
Q157714 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3238261', 'Q3181348'}
Q157337 has children with different ranks:  {'Q3181348', 'Q7432', 'Q68947', 'Q3238261'}
Q157656 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3238261', 'Q3181348'}
Q1100242 has children with different ranks:  {'Q7432', 'Q3181348'}
Q1017395 has children with different ranks:  {'Q3025161', 'Q7432', 'Q3238261', 'Q3181348'}
Q136934 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2656869 has children with different ranks:  {'Q7432', 'Q3181348'}
Q156146 has children with different ranks:  {'Q3181348', 'Q3238261', 'Q5998839', 'Q7432', 'Q4886', 'Q3025161'}
Q159220 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3238261', 'Q3181348'}
Q131511 has children with different ranks: 

Q8191252 has children with different ranks:  {'Q7432', 'Q3181348'}
Q311454 has children with different ranks:  {'Q3181348', 'Q7432', 'Q3238261'}
Q212815 has children with different ranks:  {'Q767728', 'Q3181348', 'Q7432', 'Q3238261'}
Q2738553 has children with different ranks:  {'Q7432', 'Q4886', 'Q3181348'}
Q5527608 has children with different ranks:  {'Q7432', 'Q3181348'}
Q159328 has children with different ranks:  {'Q7432', 'Q3181348'}
Q81513 has children with different ranks:  {'NA', 'Q7432', 'Q4886', 'Q3238261'}
Q15711445 has children with different ranks:  {'Q3181348', 'Q7432', 'Q3238261'}
Q3309645 has children with different ranks:  {'Q7432', 'Q3181348'}
Q2700651 has children with different ranks:  {'Q7432', 'Q3181348'}
Q152986 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3238261', 'Q3181348'}
Q2705135 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q4675376 has children with different ranks:  {'Q7432', 'Q3181348'}
Q4907604 has children with 

Q1807600 has children with different ranks:  {'Q3025161', 'Q7432'}
Q1421617 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q164202 has children with different ranks:  {'Q7432', 'Q3181348'}
Q587958 has children with different ranks:  {'Q3181348', 'Q7432', 'Q5998839', 'Q3238261'}
Q133016 has children with different ranks:  {'Q5998839', 'Q7432', 'Q3181348'}
Q159525 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q2661782 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q44448 has children with different ranks:  {'Q3965313', 'Q227936', 'Q164280', 'Q34740'}
Q3233093 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q1140322 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2598732 has children with different ranks:  {'Q7432', 'Q3181348'}
Q209596 has children with different ranks:  {'Q3181348', 'Q3238261', 'Q13198444', 'Q7432', 'Q5998839', 'Q3025161'}
Q2255950 has children with different ranks:  {'

Q5712040 has children with different ranks:  {'Q7432', 'Q3238261'}
Q768400 has children with different ranks:  {'Q3965313', 'Q227936', 'Q7432', 'Q34740'}
Q2056201 has children with different ranks:  {'Q7432', 'Q3238261'}
Q5149175 has children with different ranks:  {'Q7432', 'Q34740'}
Q10762050 has children with different ranks:  {'Q7432', 'Q3238261'}
Q21225997 has children with different ranks:  {'Q227936', 'Q34740'}
Q896734 has children with different ranks:  {'Q164280', 'Q34740'}
Q2689942 has children with different ranks:  {'Q7432', 'Q3238261'}
Q3288921 has children with different ranks:  {'Q164280', 'Q34740'}
Q5972320 has children with different ranks:  {'Q34740', 'Q3965313'}
Q2066889 has children with different ranks:  {'Q164280', 'Q34740'}
Q842417 has children with different ranks:  {'Q7432', 'NA'}
Q13654329 has children with different ranks:  {'Q7432', 'Q3238261'}
Q13654726 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10631072 has children with different ranks:  {

Q131050 has children with different ranks:  {'Q227936', 'Q164280', 'Q35409', 'Q34740'}
Q2492757 has children with different ranks:  {'Q164280', 'Q34740'}
Q912976 has children with different ranks:  {'Q7432', 'Q3238261'}
Q7433136 has children with different ranks:  {'Q164280', 'Q34740'}
Q23390 has children with different ranks:  {'Q164280', 'NA', 'Q34740'}
Q938953 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q3430452 has children with different ranks:  {'Q164280', 'Q34740'}
Q10477938 has children with different ranks:  {'Q164280', 'Q34740'}
Q12161040 has children with different ranks:  {'Q164280', 'Q34740'}
Q772442 has children with different ranks:  {'Q35409', 'Q2136103', 'Q34740'}
Q134393 has children with different ranks:  {'Q35409', 'Q34740'}
Q15890729 has children with different ranks:  {'Q35409', 'Q2136103', 'Q34740'}
Q20722172 has children with different ranks:  {'Q227936', 'Q34740'}
Q2358824 has children with different ranks:  {'Q227936', 'Q34740'}
Q1096

Q5671359 has children with different ranks:  {'Q68947', 'Q767728'}
Q574234 has children with different ranks:  {'Q7432', 'Q3238261'}
Q148929 has children with different ranks:  {'Q68947', 'Q767728'}
Q159815 has children with different ranks:  {'Q68947', 'Q279749', 'Q767728'}
Q783863 has children with different ranks:  {'Q68947', 'Q767728'}
Q11959426 has children with different ranks:  {'Q7432', 'Q3238261'}
Q19796604 has children with different ranks:  {'Q68947', 'Q767728'}
Q19607745 has children with different ranks:  {'Q68947', 'Q767728'}
Q19607753 has children with different ranks:  {'Q68947', 'Q767728'}
Q157878 has children with different ranks:  {'Q68947', 'Q767728'}
Q476769 has children with different ranks:  {'Q68947', 'Q767728'}
Q157624 has children with different ranks:  {'Q68947', 'Q4886', 'Q279749', 'Q767728'}
Q883411 has children with different ranks:  {'Q7432', 'Q3238261'}
Q133410 has children with different ranks:  {'Q164280', 'Q34740'}
Q26726 has children with different r

Q10479166 has children with different ranks:  {'Q7432', 'Q3238261'}
Q21222624 has children with different ranks:  {'Q7432', 'Q3238261'}
Q11843466 has children with different ranks:  {'Q7432', 'Q3238261'}
Q21214562 has children with different ranks:  {'Q7432', 'Q3238261'}
Q882018 has children with different ranks:  {'Q7432', 'Q3181348'}
Q11844137 has children with different ranks:  {'Q7432', 'Q3238261'}
Q4629586 has children with different ranks:  {'Q7432', 'Q3238261'}
Q21215025 has children with different ranks:  {'Q7432', 'Q3238261'}
Q5197643 has children with different ranks:  {'Q7432', 'Q3238261'}
Q12055789 has children with different ranks:  {'Q7432', 'Q3238261'}
Q14888422 has children with different ranks:  {'Q7432', 'Q3238261'}
Q6484558 has children with different ranks:  {'Q7432', 'Q3238261'}
Q6395174 has children with different ranks:  {'Q7432', 'Q3238261'}
Q5952538 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2714090 has children with different ranks:  {'Q7432', 

Q15689531 has children with different ranks:  {'Q164280', 'Q34740'}
Q16969344 has children with different ranks:  {'Q164280', 'Q34740'}
Q895372 has children with different ranks:  {'Q7432', 'Q3238261'}
Q328082 has children with different ranks:  {'Q5867959', 'Q34740'}
Q666222 has children with different ranks:  {'Q164280', 'Q34740'}
Q9347572 has children with different ranks:  {'Q164280', 'Q34740'}
Q18060987 has children with different ranks:  {'Q164280', 'Q34740'}
Q17636633 has children with different ranks:  {'Q164280', 'Q34740'}
Q2380967 has children with different ranks:  {'Q35409', 'Q34740'}
Q16664399 has children with different ranks:  {'Q164280', 'Q34740'}
Q18477366 has children with different ranks:  {'Q164280', 'Q34740'}
Q1132966 has children with different ranks:  {'Q164280', 'Q34740'}
Q838992 has children with different ranks:  {'Q164280', 'Q34740'}
Q17400453 has children with different ranks:  {'Q5867959', 'Q35409'}
Q17400609 has children with different ranks:  {'Q5867959',

Q5881117 has children with different ranks:  {'Q164280', 'Q34740'}
Q133128 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q10933986 has children with different ranks:  {'Q767728', 'Q68947'}
Q15251391 has children with different ranks:  {'Q767728', 'Q279749'}
Q10952356 has children with different ranks:  {'Q767728', 'Q279749'}
Q10703203 has children with different ranks:  {'Q7432', 'Q3238261'}
Q163076 has children with different ranks:  {'Q767728', 'Q4886'}
Q1156537 has children with different ranks:  {'Q767728', 'Q279749'}
Q15325217 has children with different ranks:  {'Q68947', 'Q279749'}
Q5279588 has children with different ranks:  {'Q767728', 'Q68947'}
Q3015033 has children with different ranks:  {'Q7432', 'Q3238261'}
Q4036311 has children with different ranks:  {'Q7432', 'Q3238261'}
Q370516 has children with different ranks:  {'Q68947', 'Q279749'}
Q831681 has children with different ranks:  {'Q7432', 'Q3181348'}
Q3171074 has children with different ranks:  {'Q

Q3545503 has children with different ranks:  {'Q7432', 'Q3238261'}
Q3280399 has children with different ranks:  {'Q164280', 'Q34740'}
Q3542864 has children with different ranks:  {'Q164280', 'Q34740'}
Q7687255 has children with different ranks:  {'Q7432', 'Q3238261'}
Q1317200 has children with different ranks:  {'Q7432', 'Q3238261'}
Q3542992 has children with different ranks:  {'Q164280', 'Q34740'}
Q3543759 has children with different ranks:  {'Q164280', 'Q34740'}
Q3279483 has children with different ranks:  {'Q164280', 'Q34740'}
Q3273713 has children with different ranks:  {'Q164280', 'Q34740'}
Q906802 has children with different ranks:  {'Q164280', 'Q34740'}
Q136927 has children with different ranks:  {'Q164280', 'Q34740'}
Q2120695 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2196744 has children with different ranks:  {'Q7432', 'Q3238261'}
Q150866 has children with different ranks:  {'Q38348', 'Q3978005', 'Q35409'}
Q163822 has children with different ranks:  {'Q7432', 

Q206675 has children with different ranks:  {'Q7432', 'Q3238261'}
Q7442742 has children with different ranks:  {'Q35409', 'Q34740'}
Q133551 has children with different ranks:  {'Q5867051', 'Q35409', 'Q36602', 'Q34740'}
Q136846 has children with different ranks:  {'Q36602', 'Q34740'}
Q136590 has children with different ranks:  {'Q7432', 'Q3238261', 'Q3181348'}
Q146212 has children with different ranks:  {'Q767728', 'Q68947'}
Q1526537 has children with different ranks:  {'Q7432', 'Q3238261'}
Q463057 has children with different ranks:  {'Q164280', 'Q34740'}
Q869711 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2035198 has children with different ranks:  {'Q5867959', 'Q35409'}
Q157168 has children with different ranks:  {'Q164280', 'Q3238261', 'Q34740'}
Q149020 has children with different ranks:  {'Q227936', 'Q34740'}
Q867622 has children with different ranks:  {'Q164280', 'Q34740'}
Q7638745 has children with different ranks:  {'Q3965313', 'Q34740'}
Q121944 has children with d

Q915669 has children with different ranks:  {'Q164280', 'Q34740'}
Q826836 has children with different ranks:  {'Q164280', 'Q34740'}
Q148704 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2648604 has children with different ranks:  {'Q7432', 'Q3238261'}
Q106345 has children with different ranks:  {'Q35409', 'Q36602', 'Q1153785', 'Q37517', 'Q34740'}
Q20639619 has children with different ranks:  {'Q7432', 'Q3238261'}
Q5533631 has children with different ranks:  {'Q767728', 'Q279749'}
Q1931353 has children with different ranks:  {'Q68947', 'Q3238261'}
Q132255 has children with different ranks:  {'Q35409', 'Q34740'}
Q4521052 has children with different ranks:  {'Q35409', 'Q36602', 'Q34740'}
Q3127137 has children with different ranks:  {'Q7432', 'Q3238261'}
Q7887901 has children with different ranks:  {'Q227936', 'Q34740'}
Q1946464 has children with different ranks:  {'Q3965313', 'Q34740'}
Q3169619 has children with different ranks:  {'Q164280', 'Q34740'}
Q3138814 has children wi

Q5687695 has children with different ranks:  {'Q767728', 'Q68947'}
Q6540310 has children with different ranks:  {'Q767728', 'Q279749'}
Q7515560 has children with different ranks:  {'Q767728', 'Q68947'}
Q7661736 has children with different ranks:  {'Q767728', 'Q279749'}
Q311066 has children with different ranks:  {'Q767728', 'Q68947'}
Q310105 has children with different ranks:  {'Q767728', 'Q68947'}
Q1751037 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q140336 has children with different ranks:  {'Q767728', 'Q68947'}
Q6869729 has children with different ranks:  {'Q767728', 'Q68947'}
Q955003 has children with different ranks:  {'Q767728', 'Q68947'}
Q820986 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q1543836 has children with different ranks:  {'Q767728', 'Q68947'}
Q7445431 has children with different ranks:  {'Q767728', 'Q68947'}
Q7445443 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q15576904 has children with diff

Q21447774 has children with different ranks:  {'Q7432', 'Q3238261'}
Q1760811 has children with different ranks:  {'Q7432', 'Q3238261'}
Q3144140 has children with different ranks:  {'Q227936', 'Q34740'}
Q6534103 has children with different ranks:  {'Q68947', 'Q767728'}
Q134711 has children with different ranks:  {'Q164280', 'Q34740'}
Q3356801 has children with different ranks:  {'Q6311258', 'Q35409', 'Q2136103', 'Q34740'}
Q3381696 has children with different ranks:  {'Q164280', 'Q34740'}
Q47488557 has children with different ranks:  {'Q7432', 'Q3238261'}
Q1237203 has children with different ranks:  {'Q7432', 'Q3238261'}
Q5273881 has children with different ranks:  {'Q68947', 'Q767728'}
Q4731104 has children with different ranks:  {'Q227936', 'Q34740'}
Q21218506 has children with different ranks:  {'Q227936', 'Q34740'}
Q3391809 has children with different ranks:  {'Q227936', 'Q34740'}
Q21227095 has children with different ranks:  {'Q227936', 'Q34740'}
Q142119 has children with different 

Q12059153 has children with different ranks:  {'Q767728', 'Q279749'}
Q5609417 has children with different ranks:  {'Q767728', 'Q279749'}
Q5609420 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q1491422 has children with different ranks:  {'Q68947', 'Q767728'}
Q1301661 has children with different ranks:  {'Q767728', 'Q68947'}
Q134253 has children with different ranks:  {'Q767728', 'Q279749'}
Q15481782 has children with different ranks:  {'Q767728', 'Q68947'}
Q39799224 has children with different ranks:  {'Q767728', 'Q68947'}
Q49523784 has children with different ranks:  {'Q68947', 'Q767728'}
Q81284 has children with different ranks:  {'Q767728', 'Q279749'}
Q15571640 has children with different ranks:  {'Q68947', 'Q767728'}
Q276342 has children with different ranks:  {'Q68947', 'Q279749'}
Q169037 has children with different ranks:  {'Q68947', 'Q767728'}
Q15583770 has children with different ranks:  {'Q68947', 'Q767728'}
Q10657527 has children with different ranks:  

Q19851078 has children with different ranks:  {'Q227936', 'Q34740'}
Q21226285 has children with different ranks:  {'Q227936', 'Q34740'}
Q2849790 has children with different ranks:  {'Q164280', 'Q34740'}
Q4953549 has children with different ranks:  {'Q2889003', 'Q2136103'}
Q18558580 has children with different ranks:  {'Q164280', 'Q34740'}
Q20667009 has children with different ranks:  {'Q35409', 'Q2136103'}
Q3061296 has children with different ranks:  {'Q227936', 'Q34740'}
Q67363010 has children with different ranks:  {'Q767728', 'Q279749'}
Q1796255 has children with different ranks:  {'Q767728', 'Q279749'}
Q49600132 has children with different ranks:  {'Q767728', 'Q279749'}
Q49600035 has children with different ranks:  {'Q767728', 'Q279749'}
Q49631367 has children with different ranks:  {'Q767728', 'Q279749'}
Q49600018 has children with different ranks:  {'Q767728', 'Q279749'}
Q49603876 has children with different ranks:  {'Q767728', 'Q279749'}
Q49627100 has children with different ran

Q5317605 has children with different ranks:  {'Q7432', 'Q3238261'}
Q15543816 has children with different ranks:  {'Q767728', 'Q68947'}
Q671939 has children with different ranks:  {'Q767728', 'Q279749'}
Q15505372 has children with different ranks:  {'Q68947', 'Q279749', 'Q767728'}
Q15538398 has children with different ranks:  {'Q767728', 'Q68947'}
Q148137 has children with different ranks:  {'Q767728', 'Q68947'}
Q50984221 has children with different ranks:  {'Q767728', 'Q68947'}
Q159121 has children with different ranks:  {'Q767728', 'Q68947'}
Q11080319 has children with different ranks:  {'Q68947', 'Q767728'}
Q3906545 has children with different ranks:  {'Q7432', 'Q3238261'}
Q39550809 has children with different ranks:  {'Q68947', 'Q767728'}
Q218155 has children with different ranks:  {'Q68947', 'Q279749', 'Q767728'}
Q18745616 has children with different ranks:  {'Q7432', 'Q3238261'}
Q15547339 has children with different ranks:  {'Q767728', 'Q68947'}
Q1936142 has children with differen

Q18107170 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10481009 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2363186 has children with different ranks:  {'Q5867959', 'Q35409'}
Q599065 has children with different ranks:  {'Q164280', 'Q34740'}
Q10488609 has children with different ranks:  {'Q7432', 'Q3238261'}
Q20645733 has children with different ranks:  {'Q3965313', 'Q34740'}
Q10492313 has children with different ranks:  {'Q7432', 'Q3238261'}
Q4299079 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10502855 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10583642 has children with different ranks:  {'Q164280', 'Q34740'}
Q10504469 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10505503 has children with different ranks:  {'Q7432', 'Q767728'}
Q4039585 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10507932 has children with different ranks:  {'Q7432', 'Q3238261'}
Q10508551 has children with different ranks:  {'Q743

Q131768 has children with different ranks:  {'Q5867959', 'Q35409', 'Q34740'}
Q4269456 has children with different ranks:  {'Q767728', 'Q68947'}
Q15635929 has children with different ranks:  {'Q7432', 'Q3238261'}
Q21217988 has children with different ranks:  {'Q3965313', 'Q34740'}
Q7377823 has children with different ranks:  {'Q767728', 'Q68947'}
Q5783443 has children with different ranks:  {'Q68947', 'Q767728'}
Q165118 has children with different ranks:  {'Q36602', 'Q37517', 'Q34740'}
Q39936647 has children with different ranks:  {'Q68947', 'Q279749', 'Q767728'}
Q2706362 has children with different ranks:  {'Q767728', 'Q279749', 'Q68947'}
Q17241719 has children with different ranks:  {'Q4886', 'Q279749'}
Q159030 has children with different ranks:  {'Q767728', 'Q279749'}
Q15570515 has children with different ranks:  {'Q68947', 'Q767728'}
Q2392155 has children with different ranks:  {'Q68947', 'Q767728'}
Q163675 has children with different ranks:  {'Q68947', 'Q767728'}
Q15562383 has chil

Q15244959 has children with different ranks:  {'Q767728', 'Q279749'}
Q652089 has children with different ranks:  {'Q767728', 'Q68947'}
Q3470725 has children with different ranks:  {'Q767728', 'Q68947'}
Q10959168 has children with different ranks:  {'Q767728', 'Q279749'}
Q11177752 has children with different ranks:  {'Q767728', 'Q279749'}
Q2176031 has children with different ranks:  {'Q767728', 'Q68947'}
Q911666 has children with different ranks:  {'Q767728', 'Q279749'}
Q11148061 has children with different ranks:  {'Q68947', 'Q767728'}
Q11178665 has children with different ranks:  {'Q767728', 'Q279749'}
Q5381965 has children with different ranks:  {'Q35409', 'Q34740'}
Q162374 has children with different ranks:  {'Q767728', 'Q279749'}
Q21257996 has children with different ranks:  {'Q68947', 'Q767728'}
Q5548520 has children with different ranks:  {'Q227936', 'Q34740'}
Q26870 has children with different ranks:  {'Q2889003', 'Q35409'}
Q7046729 has children with different ranks:  {'Q35409',

Q944411 has children with different ranks:  {'Q36602', 'Q34740'}
Q19816401 has children with different ranks:  {'Q227936', 'Q34740'}
Q6694331 has children with different ranks:  {'Q35409', 'Q34740'}
Q3374924 has children with different ranks:  {'Q5868144', 'Q36602'}
Q18713253 has children with different ranks:  {'Q164280', 'Q35409', 'Q2136103', 'Q36602'}
Q1003460 has children with different ranks:  {'Q68947', 'NA'}
Q1027827 has children with different ranks:  {'Q35409', 'Q2136103'}
Q265488 has children with different ranks:  {'Q68947', 'NA'}
Q4050101 has children with different ranks:  {'Q227936', 'Q34740'}
Q21214877 has children with different ranks:  {'Q164280', 'Q34740'}
Q21214247 has children with different ranks:  {'Q3965313', 'Q34740'}
Q21012405 has children with different ranks:  {'Q2889003', 'Q35409'}
Q1380570 has children with different ranks:  {'Q227936', 'Q34740'}
Q3438569 has children with different ranks:  {'Q227936', 'Q34740'}
Q21227811 has children with different ranks: 

Q138921 has children with different ranks:  {'Q2889003', 'Q5867959'}
Q19754542 has children with different ranks:  {'Q35409', 'Q34740'}
Q132904 has children with different ranks:  {'Q35409', 'Q34740', 'NA'}
Q1343544 has children with different ranks:  {'Q227936', 'Q34740'}
Q24966129 has children with different ranks:  {'Q36732', 'Q2752679'}
Q3546395 has children with different ranks:  {'Q7432', 'Q3238261'}
Q2583869 has children with different ranks:  {'Q35409', 'Q36602', 'Q34740'}
Q809613 has children with different ranks:  {'Q164280', 'Q34740'}
Q134870 has children with different ranks:  {'Q38348', 'NA'}
Q2814783 has children with different ranks:  {'Q35409', 'Q34740'}
Q496013 has children with different ranks:  {'Q1153785', 'Q3504061'}
Q7129481 has children with different ranks:  {'Q35409', 'Q34740'}
Q13514368 has children with different ranks:  {'Q35409', 'Q36602', 'NA'}
Q2751478 has children with different ranks:  {'Q2889003', 'Q34740'}
Q2043179 has children with different ranks:  

Q1349838 has children with different ranks:  {'Q5867959', 'Q34740'}
Q623232 has children with different ranks:  {'Q35409', 'Q34740'}
Q14899727 has children with different ranks:  {'Q3491997', 'Q334460'}
Q139213 has children with different ranks:  {'Q35409', 'Q34740'}
Q14857444 has children with different ranks:  {'Q10861426', 'Q35409', 'Q2136103'}
Q17583501 has children with different ranks:  {'Q35409', 'Q36602'}
Q1440794 has children with different ranks:  {'Q35409', 'Q2136103'}
Q1801629 has children with different ranks:  {'NA', 'Q36602'}
Q893320 has children with different ranks:  {'Q3965313', 'Q34740'}
Q1744606 has children with different ranks:  {'Q5867959', 'Q35409'}
Q4589415 has children with different ranks:  {'Q19858692', 'NA'}
Q1051622 has children with different ranks:  {'Q5867051', 'Q36602'}
Q129012 has children with different ranks:  {'Q35409', 'Q36602'}
Q1305665 has children with different ranks:  {'Q227936', 'Q34740'}
Q10328397 has children with different ranks:  {'Q2279

In [10]:
# Number of entries collected in different_rank_siblings.
# NOTE(review): presumably one entry per parent reported above as having
# children with different ranks -- confirm against the cell that builds it.
len(different_rank_siblings)

18716

In [19]:
# Some entries carry no parent information (parent_taxon is NaN), so they are
# never attached beneath another node in any tree. The listing also includes
# the root entry Biota (Q2382443), which has no parent by definition.
data.loc[data["parent_taxon"].isna()]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
31362,P1070,,,,PlantList-ID
37931,P1137,,,,fossil found in this unit
68300,P1772,,,,USDA PLANTS ID
70202,Q20672521,,,,Serpentine rouge
87883,Q21398146,,,,Puccinia asteris ß chrysanthemi-leucanthemi
...,...,...,...,...,...
2790422,P960,,,,Tropicos ID
2794225,Q667161,Tetrao tetrix × urogallus,,,Rackelhahn
2821400,Q2382443,Biota,Q22666877,,biota
2848888,Q2005736,,,,SIG Sauer P225


In [11]:
# Two entries are named "Biota"; the second one listed (Q2382443, the one
# without a parent_taxon) is the root of the tree of life.
data.loc[data["taxon_name"] == "Biota"]

Unnamed: 0,id,taxon_name,taxon_rank,parent_taxon,species_name_value
1669271,Q87177320,Biota,Q34740,Q146037,Biota
2821400,Q2382443,Biota,Q22666877,,biota


In [None]:
# Assemble the complete tree of life rooted at Biota (id "Q2382443")
# and serialise it to tree.json in the current working directory.
tree_from_biota = build_tree_from_root(
    "Q2382443",
    tree,
    id_name_dict,
    id_rank_dict,
)
with open('tree.json', 'w') as tree_json_file:
    json.dump(tree_from_biota, tree_json_file)

In [18]:
# build the "dangling trees" for roots that are not Biota
# Each id in missing_parents is used as a root and its tree is written to
# <WIKIDIR>/dangling_trees/<id>.json.
# NOTE(review): missing_parents is defined in an earlier cell; presumably it
# holds ids that are referenced as parents but are not Biota -- confirm there.
dangling_dir = os.path.join(WIKIDIR, "dangling_trees")
# open(..., 'w') does not create missing directories, so make sure the output
# folder exists; exist_ok keeps the cell safe to re-run.
os.makedirs(dangling_dir, exist_ok=True)
for missing_parent in missing_parents:
    dangling_tree = build_tree_from_root(missing_parent, tree, id_name_dict, id_rank_dict)
    file_name = os.path.join(dangling_dir, missing_parent + ".json")
    with open(file_name, 'w') as fp:
        json.dump(dangling_tree, fp, indent=4)