In [1]:
import pandas as pd
import os
import json
import requests

# Relative path to the dataset root; the Wikidata and PBDB CSV files used
# below are read from sub-directories of this folder.
DATADIR = "../dataset"


In [9]:
# Wikidata taxa (indexed by their "id" column) and the PBDB table,
# both read from CSV files under DATADIR.
wiki_csv_path = os.path.join(DATADIR, "wikidata", "processed", "full_data.csv")
pbdb_csv_path = os.path.join(DATADIR, "pbdb", "data.csv")

wiki = pd.read_csv(wiki_csv_path, index_col="id")
pbdb = pd.read_csv(pbdb_csv_path)

In [258]:
# Distinct, non-null taxon ranks present in the Wikidata table.
all_ranks = wiki["taxon_rank"].dropna().unique().tolist()

In [257]:
# Set of every non-null Wikidata taxon name, used for fast membership tests.
all_taxon_names = set(wiki["taxon_name"].dropna().unique())

In [259]:
# Distinct values of each PBDB name column, one list per column.
# Missing values are not removed here, so a column containing NaN
# contributes a NaN entry to its list.
accepted_names = pbdb["accepted_name"].unique().tolist()
phylums = pbdb["phylum"].unique().tolist()
classes = pbdb["class"].unique().tolist()
orders = pbdb["order"].unique().tolist()
families = pbdb["family"].unique().tolist()
genuses = pbdb["genus"].unique().tolist()


In [196]:
# List the distinct values found in PBDB's "accepted_rank" column.
pbdb["accepted_rank"].unique()

array(['genus', 'species', 'phylum', 'family', 'subclass', 'kingdom',
       'class', 'order', 'subgenus', 'superphylum', 'unranked clade',
       'subphylum', 'subfamily', 'tribe', 'subkingdom', 'superfamily',
       'superorder', 'subspecies', 'suborder', 'superclass', 'infraclass',
       'infraorder', 'subtribe', 'informal'], dtype=object)

In [264]:
# For every PBDB name column, collect the distinct values that do not occur
# among the Wikidata taxon names. A NaN entry in a source list is never a
# member of all_taxon_names, so it is counted as missing too.
missing_accepted_names = [taxon for taxon in accepted_names if taxon not in all_taxon_names]
missing_phylums = [taxon for taxon in phylums if taxon not in all_taxon_names]
missing_classes = [taxon for taxon in classes if taxon not in all_taxon_names]
missing_orders = [taxon for taxon in orders if taxon not in all_taxon_names]
missing_families = [taxon for taxon in families if taxon not in all_taxon_names]
missing_genuses = [taxon for taxon in genuses if taxon not in all_taxon_names]

# Report "missing / all" per column, one line each.
coverage_rows = (
    ("accepted names", missing_accepted_names, accepted_names),
    ("phylums", missing_phylums, phylums),
    ("classes", missing_classes, classes),
    ("orders", missing_orders, orders),
    ("families", missing_families, families),
    ("genuses", missing_genuses, genuses),
)
for label, missing_values, all_values in coverage_rows:
    print(label + ": missing: ", len(missing_values), ", all: ", len(all_values))

accepted names: missing:  132707 , all:  180470
phylums: missing:  16 , all:  61
classes: missing:  31 , all:  166
orders: missing:  183 , all:  1148
families: missing:  1271 , all:  8575
genuses: missing:  34576 , all:  63780


In [337]:
# Rows of PBDB for which none of the name columns (accepted name, phylum,
# class, order, family, genus) appears among the Wikidata taxon names.
taxon_columns = ["accepted_name", "phylum", "class", "order", "family", "genus"]

# OR the per-column membership masks together, then negate once
# (equivalent to AND-ing the negated masks).
found_in_wiki = pbdb[taxon_columns[0]].isin(all_taxon_names)
for column in taxon_columns[1:]:
    found_in_wiki = found_in_wiki | pbdb[column].isin(all_taxon_names)

pbdb_no_match = pbdb[~found_in_wiki]

In [409]:
# (rows, columns) of the PBDB subset with no name found among the Wikidata taxon names.
pbdb_no_match.shape

(7127, 18)

In [369]:
def _no_match_unique(column):
    """Return the distinct values of ``pbdb_no_match[column]`` whose string form is not "nan"."""
    return [
        value
        for value in pbdb_no_match[column].unique().tolist()
        if str(value) != "nan"
    ]


# Distinct values per name column, restricted to the unmatched PBDB rows.
no_match_accepted_names = _no_match_unique("accepted_name")
no_match_phylums = _no_match_unique("phylum")
no_match_classes = _no_match_unique("class")
no_match_orders = _no_match_unique("order")
no_match_families = _no_match_unique("family")
no_match_genuses = _no_match_unique("genus")

In [388]:
# Break every unmatched accepted name into its whitespace-separated words
# and keep each distinct word once.
no_match_accpeted_names_split = list(
    {word for full_name in no_match_accepted_names for word in full_name.split()}
)

In [389]:
# Union of every candidate name to look up: the individual words of the
# unmatched accepted names plus the unmatched higher-rank names.
all_no_match_names = set().union(
    no_match_accpeted_names_split,
    no_match_phylums,
    no_match_classes,
    no_match_orders,
    no_match_families,
    no_match_genuses,
)

In [315]:
# Distinct accepted ranks among the unmatched PBDB rows. `.unique()` is needed
# to get the compact array of ranks recorded as this cell's output; without it
# the cell displays the whole column instead.
pbdb_no_match["accepted_rank"].unique()

array(['species', 'genus', 'kingdom', 'phylum', 'unranked clade',
       'subgenus', 'subkingdom', 'class', 'family'], dtype=object)

In [393]:
def query_data(my_query, timeout=30):
    """Search Wikidata entities matching ``my_query`` via ``wbsearchentities``.

    Parameters
    ----------
    my_query : str
        Text to search for.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list
        The value stored under the ``"search"`` key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server replies with an error status code.
    KeyError
        If the JSON response has no ``"search"`` key.
    """
    # Pass the query through `params` so that requests percent-encodes it;
    # splicing it into the URL by hand breaks on characters such as "&" or "#".
    response = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "search": my_query,
            "language": "en",
            "format": "json",
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["search"]


In [None]:
# On-disk cache of Wikidata search responses keyed by query string, so that
# re-running this cell only queries the API for names not seen before.
cache_path = os.path.join(DATADIR, "wikidata", "processed", "cache_for_pbdb_names.json")
flush_every = 50  # write the cache to disk after this many new API responses


def _save_cache(cache_to_save, path):
    """Write the cache as JSON to ``path`` without ever leaving a truncated file."""
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(cache_to_save, f)
    # Swap the complete file into place; an interrupted dump only damages the temp file.
    os.replace(tmp_path, path)


cache = dict()
if os.path.isfile(cache_path):
    with open(cache_path, "r") as f:
        cache = json.load(f)

search_result = dict()
unsaved = 0
try:
    for query in all_no_match_names:
        if query not in cache:
            query_result = query_data(query)
            cache[query] = query_result
            unsaved += 1
            # Flush periodically instead of rewriting the whole, growing file
            # after every single request.
            if unsaved >= flush_every:
                _save_cache(cache, cache_path)
                unsaved = 0
        else:
            query_result = cache[query]

        search_result[query] = query_result
finally:
    # Persist whatever was fetched, even if the loop was interrupted or a request failed.
    if unsaved:
        _save_cache(cache, cache_path)

In [463]:
# Ranks used by PBDB; a search hit is kept only if its description mentions
# one of them (plain substring test).
all_pbdb_ranks = pbdb["accepted_rank"].unique().tolist()

# Names whose search returned at least one hit.
effective_names = [name for name, hits in search_result.items() if hits != []]

for name in effective_names:
    # Hits without a "description" field are dropped.
    search_result[name] = [
        hit
        for hit in search_result[name]
        if "description" in hit
        and any(rank in hit["description"] for rank in all_pbdb_ranks)
    ]

In [460]:
# Map each name to a Wikidata id, but only when exactly one search hit is
# left for it, i.e. the match is unambiguous.
name_id_dict = {
    name: hits[0]["id"]
    for name, hits in search_result.items()
    if len(hits) == 1
}

In [471]:
# Preview the first rows of the PBDB table to check its columns.
pbdb.head()

Unnamed: 0,occurrence_no,identified_name,accepted_name,identified_rank,accepted_rank,phylum,class,order,family,genus,max_ma,min_ma,lng,lat,paleomodel,paleolng,paleolat,geoplate
0,3285,Tetradium cellulosum,Tetradium,species,genus,Rhodophyta,,,,Tetradium,460.9,449.5,-75.456108,43.212776,gp_mid,-105.36,-17.34,101
1,3679,Tetradium sp.,Tetradium,genus,genus,Rhodophyta,,,,Tetradium,470.0,458.4,-75.456108,43.212776,gp_mid,-101.77,-8.16,101
2,4409,Tetradium sp.,Tetradium,genus,genus,Rhodophyta,,,,Tetradium,449.5,443.7,-83.828613,39.445278,gp_mid,-111.61,-25.18,101
3,4534,Tetradium ontario,Tetradium huronense,species,species,Rhodophyta,,,,Tetradium,449.5,443.7,-85.012779,39.423058,gp_mid,-112.52,-24.78,101
4,5487,Tetradium sp.,Tetradium,genus,genus,Rhodophyta,,,,Tetradium,449.5,443.7,-84.0,39.0,gp_mid,-111.97,-25.52,101


In [481]:
# Count the accepted names made of exactly two words that are also present
# among the Wikidata taxon names.
sum(
    1
    for accepted in accepted_names
    if len(accepted.split()) == 2 and accepted in all_taxon_names
)

26984