In [None]:
import json
import csv
import zipfile
import re
import requests
import os

# Step 1. Excluding entities – based on 'instance of', 'subclass of'
* Entities that have certain values of "instance of" and "subclass of" ("to_exclude.json")
* Entities without any entity terms

In [None]:
# Reading the file with the P31 and P279 values to exclude.
# The file was created manually, based on the analysis of the search results.
# NOTE(review): the markdown above refers to "to_exclude.json", while this cell opens
# 'to_exlude.json' — confirm which spelling the file on disk actually has.

with open('to_exlude.json') as jf_exlude:
    to_exclude = json.load(jf_exlude)

### EN

In [None]:
# unzipping the search results to a local directory
with zipfile.ZipFile('wikidata_search_results_en.json.zip','r') as unzip:
    # set your path (~62 MB)
    unzip.extractall('')

In [None]:
# Reading the EN search results file.
# Set your path.
# NOTE(review): the archive extracted above is 'wikidata_search_results_en.json.zip' and the
# NL results are read from a file ending in '_nl.json', but this path has no '_en' suffix —
# confirm the name of the extracted file.
with open('/wikidata_search_results.json') as jf:
    wd_search_results_en = json.load(jf)

In [None]:
# Reading the file with the EN query terms and putting them in a list.
# The bracket/quote stripping below suggests the file holds a printed Python list,
# e.g. ['term one', 'term two'] — TODO confirm against the file that was actually saved.

with open('wikidata_query_terms_en.txt','r') as txt_file:
    terms_en = txt_file.read()

# Surrounding whitespace is removed first: with a trailing newline the closing bracket
# would not be the last character, rstrip("]") would leave it in place and the last term
# would keep the "']" remainder.
terms_body_en = terms_en.strip().lstrip("[").rstrip("]")

# Empty pieces are dropped, so an empty list in the file gives no terms at all
# instead of one empty-string term.
queried_terms_en = [term for term in (s.strip("\'") for s in terms_body_en.split(", ")) if term]

In [None]:
# storing all the entities to exclude from the results in a list

entities_to_exlude = []

In [None]:
# Iterating over the claim files.
# Claims for every term are stored in separate files
# (they were saved in the previous step, 'query_wikidata_en_nl.ipynb').
# Claims are stored in a list of dicts; the number of dicts corresponds to the number of
# requests made to retrieve the claims.

for term in queried_terms_en:
    print(term)
    # set your path to the claims directory
    with open(f'/claims_en/{term}_claims.json','r') as jf:
        claims_file = json.load(jf)

    if claims_file["entities"] != []: # if there are claims

        for i in claims_file["entities"]:
            for e_id, values in i.items():
                if 'claims' not in values:
                    continue
                # 'claims' is keyed by property id, so the statements of P31 and P279 are
                # fetched directly instead of scanning the statements of every property.
                for prop in ('P31', 'P279'):
                    for l in values['claims'].get(prop, []):
                        # a statement without 'datavalue' has no target id to compare
                        if 'datavalue' in l['mainsnak'] \
                        and l['mainsnak']['datavalue']['value']['id'] in to_exclude:
                            # one entry per matching statement: the same entity id
                            # can be appended more than once
                            entities_to_exlude.append(e_id)
        print(len(entities_to_exlude),"\n")

In [None]:
# ~ 69K entities
len(entities_to_exlude)

### NL

In [None]:
# unzipping the search results to a local directory
with zipfile.ZipFile('wikidata_search_results_nl.json.zip','r') as unzip:
    # set your path (~39 MB)
    # NOTE(review): this is a user-specific absolute directory, while the next cell reads
    # '/wikidata_search_results_nl.json' — adjust both so that they point at the same place.
    unzip.extractall('/Users/anesterov/reps/testing')

In [None]:
# reading the results file
# set your path
with open('/wikidata_search_results_nl.json','r') as jf:
    wd_search_results_nl = json.load(jf)

In [None]:
# Reading the file with the NL query terms (one term per line) and putting them in a list.

with open('wikidata_query_terms_nl_with_hits.txt','r') as txt_file:
    terms_nl = txt_file.read()

# Empty lines (typically the result of a trailing newline) are dropped: an empty term
# would make the claims loop look for a file named '_claims.json'.
queried_terms_nl = [line for line in terms_nl.split('\n') if line]

In [None]:
# storing all the NL entities to exclude from the results in a list

entities_to_exclude_nl = []

In [None]:
# Iterating over the claim files.
# Claims for every term are stored in separate files
# (they were saved in the previous step, 'query_wikidata_en_nl.ipynb').
# Claims are stored in a list of dicts; the number of dicts corresponds to the number of
# requests made to retrieve the claims.

for term in queried_terms_nl:
    print(term)
    # set your path to the claims directory
    with open(f'/claims_nl/{term}_claims.json','r') as jf:
        claims_file = json.load(jf)

    if claims_file["entities"] != []: # if there are claims

        for i in claims_file["entities"]:
            for e_id, values in i.items():
                if 'claims' not in values:
                    continue
                # 'claims' is keyed by property id, so the statements of P31 and P279 are
                # fetched directly instead of scanning the statements of every property.
                for prop in ('P31', 'P279'):
                    for l in values['claims'].get(prop, []):
                        # a statement without 'datavalue' has no target id to compare
                        if 'datavalue' in l['mainsnak'] \
                        and l['mainsnak']['datavalue']['value']['id'] in to_exclude:
                            # one entry per matching statement: the same entity id
                            # can be appended to 'entities_to_exclude_nl' more than once
                            entities_to_exclude_nl.append(e_id)
        print(len(entities_to_exclude_nl),"\n")

In [None]:
# ~ 16K entities
len(entities_to_exclude_nl)

## Step 1.1. Saving the clean version
Also excluding entities without entity terms

### EN

In [None]:
# Keeping only the search hits that have a title and entity terms and are not excluded.
# The exclusion list is turned into a set once, so every membership test is a
# constant-time lookup instead of a linear scan over the whole list.
excluded_ids_en = set(entities_to_exlude)

filtered_results_en = {}
for key,value in wd_search_results_en.items():
    filtered_results_en[key] = {
        k: v for k,v in value.items()
        if 'title' in v and 'entityterms' in v and v['title'] not in excluded_ids_en
    }

In [None]:
# set your path (~47 MB)
with open('/wikidata_search_results_en_clean.json','w') as jf:
    json.dump(filtered_results_en, jf)

### NL

In [None]:
# merging the lists of EN and NL entities to exclude
exclude_all = entities_to_exlude + entities_to_exclude_nl

In [None]:
# Keeping only the search hits that have a title and entity terms and are not excluded
# (neither by the EN nor by the NL exclusion list).
# The merged exclusion list is turned into a set once, so every membership test is a
# constant-time lookup instead of a linear scan over the whole list.
excluded_ids_all = set(exclude_all)

filtered_results_nl = {}
for key,value in wd_search_results_nl.items():
    filtered_results_nl[key] = {
        k: v for k,v in value.items()
        if 'title' in v and 'entityterms' in v and v['title'] not in excluded_ids_all
    }

In [None]:
# set your path (~28 MB)
with open('/wikidata_search_results_nl_clean.json','w') as jf:
    json.dump(filtered_results_nl, jf)

## Step 1.2. Requesting additional claims
* Not all the entities have claims, additional requests are needed
* Check if an entity has claims, if not send a request 

### EN

#### checking which entities have missing claims

In [None]:
# reading the cleaned file or reuse the dict 'filtered_results_en'
with open('/wikidata_search_results_en_clean.json','r') as jf:
    wd_en_clean = json.load(jf)

In [None]:
# Collecting the ids of all entities that have claims (across all the EN claim files).

all_entities_with_claims = []

for query_term in wd_en_clean:
    # set your path to the EN claims directory
    claims_path = f'/claims_en/{query_term}_claims.json'
    with open(claims_path,'r') as jf:
        claims_file = json.load(jf)

    # every element of "entities" is a dict whose keys are entity ids
    all_entities_with_claims.extend(
        entity_id for claim_list in claims_file["entities"] for entity_id in claim_list
    )

In [None]:
# dict {"query_word":[entities_without_claims]}

no_claims = {}

# Built once, outside the loops, so that every membership test below is a
# constant-time set lookup.
entities_with_claims_set = set(all_entities_with_claims)

for query_term,value in wd_en_clean.items():
    # list of entities per query word
    list_of_entities_per_term = [i['title'] for i in value.values()]
    print("term: ", query_term, "|", "total entities: ", len(list_of_entities_per_term))

    # checking which entities do not have claims
    entities_without_claims = [i for i in list_of_entities_per_term
                               if i not in entities_with_claims_set]

    print("no claims: ", len(entities_without_claims))

    # only terms that actually miss claims get an entry
    if entities_without_claims != []:
        no_claims[query_term] = entities_without_claims

In [None]:
# 22 terms have missing claims
# this file was created for documentation, and was not used further
with open('no_claims.json','w') as json_write:
    json.dump(no_claims, json_write)

### NL

#### checking which entities have missing claims

In [None]:
# reading the cleaned file or reuse the dict 'filtered_results_nl'
with open('/wikidata_search_results_nl_clean.json','r') as jf:
    wd_nl_clean = json.load(jf)

In [None]:
# Collecting the ids of all entities that have claims (across all the NL claim files).

all_entities_with_claims_nl = []

for query_term in wd_nl_clean:
    # set your path to the NL claims directory
    claims_path = f'/claims_nl/{query_term}_claims.json'
    with open(claims_path,'r') as jf:
        claims_file = json.load(jf)

    # every element of "entities" is a dict whose keys are entity ids
    all_entities_with_claims_nl.extend(
        entity_id for claim_list in claims_file["entities"] for entity_id in claim_list
    )

In [None]:
# dict {"query_word":[entities_without_claims]}

no_claims_nl = {}

# Built once, outside the loops, so that every membership test below is a
# constant-time set lookup.
entities_with_claims_nl_set = set(all_entities_with_claims_nl)

for query_term,value in wd_nl_clean.items():
    # list of entities per query word
    list_of_entities_per_term = [i['title'] for i in value.values()]
    print("term: ", query_term, "|", "total entities: ", len(list_of_entities_per_term))

    # checking which entities do not have claims
    entities_without_claims = [i for i in list_of_entities_per_term
                               if i not in entities_with_claims_nl_set]

    print("no claims: ", len(entities_without_claims))

    # only terms that actually miss claims get an entry
    if entities_without_claims != []:
        no_claims_nl[query_term] = entities_without_claims

In [None]:
# Checking whether the claims missing for NL are available among the EN claims;
# only the entities found in neither still need to be requested.

# set: constant-time lookups instead of scanning the EN list for every NL entity
en_entities_with_claims_set = set(all_entities_with_claims)

no_claims_nl_request = {}
for key,list_of_entities in no_claims_nl.items():
    not_found = [i for i in list_of_entities if i not in en_entities_with_claims_set]
    if not_found != []:
        no_claims_nl_request[key] = not_found

In [None]:
# 19 NL terms are missing claims
len(no_claims_nl_request)

### Requesting the missing claims

### EN

In [None]:
# Constant settings for the 'wbgetentities' requests.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    action="wbgetentities",
    ids="",  # string of entities (max=50) goes here
    props="claims",
    languages="en",
    format="json",
)
# set your header
headers = {
    "user-agent": "bot getting claims of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
}

In [None]:
list_of_requested_entities = []

In [None]:
# Requesting the claims of the EN entities that have none yet.
# Result: {term: [entities dict of request 1, entities dict of request 2, ...]}
additional_claims = {}

# the ids are sent in slices of at most 50 per request
batch_size = 50

for term, no_claims_list in no_claims.items():

    # for debugging
    print("term:",term,"|","entities:",len(no_claims_list))

    # Number of slices: ceiling division, at least 1. A single formula covers every length,
    # including exactly 50 ids, which a chain of "> 50" / "< 50" tests leaves unassigned.
    loops = max(1, (len(no_claims_list) + batch_size - 1) // batch_size)

    # for debugging
    print("loops:",loops)

    query_result_list = []

    for i in range(0,loops):
        start = i * batch_size

        # ids of this slice that were not requested before (for this or another term)
        batch = []
        for q in no_claims_list[start:start + batch_size]:
            if q not in list_of_requested_entities:
                batch.append(q)
                list_of_requested_entities.append(q) # remembering requested entities to prevent duplicates

        # nothing new in this slice: a request with an empty 'ids' string is pointless
        if not batch:
            continue

        # updating params: ids are joined with '|'
        params["ids"] = "|".join(batch)

        # sending a request
        d = requests.get(url,params=params,headers=headers)
        claims = d.json() # claims per request

        # responses without an 'entities' key are ignored
        if 'entities' in claims:
            query_result_list.append(claims['entities']) # saving all claims

    # for debugging
    print("actual_results:",len(query_result_list),[len(i) for i in query_result_list])

    # saving results
    additional_claims[term] = query_result_list

### Merging EN claims

In [None]:
# set your path to the EN claims directory
path_to_json = '/claims_en/'
# Only the per-term files ('{term}_claims.json') are listed. A plain '.json' filter would
# also pick up 'merged_claims_en.json' once it has been saved into this same directory,
# and that file is keyed by term instead of holding an "entities" list.
json_files = [path_to_json + jf for jf in os.listdir(path_to_json) if jf.endswith('_claims.json')]

In [None]:
# Loading every per-term claims file into one dict: {term: list of claim dicts}.
claims_suffix = '_claims.json'

merged_en = {}
for jf in json_files:
    file_name = os.path.basename(jf)
    # anything that is not a per-term claims file (e.g. a previously saved merged file) is skipped
    if not file_name.endswith(claims_suffix):
        continue
    with open(jf,'r') as claim_file:
        claims_per_term = json.load(claim_file)
    # The term is the file name without the '_claims.json' suffix; cutting at the first '_'
    # would truncate a term that itself contains an underscore.
    merged_en[file_name[:-len(claims_suffix)]] = claims_per_term['entities']

In [None]:
# 104
len(merged_en)

In [None]:
# Appending the additionally requested claims to the claims loaded from the files.
for term in additional_claims:
    extra_claims = additional_claims[term]
    # terms for which no request returned anything are left as they are
    if extra_claims:
        merged_en[term].extend(extra_claims)

In [None]:
# saving all the claims
# set your path (~3,89 GB)
with open('/claims_en/merged_claims_en.json', 'w') as output_file:
    json.dump(merged_en, output_file)

### NL

In [None]:
# Constant settings for the 'wbgetentities' requests.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    action="wbgetentities",
    ids="",  # string of entities (max=50) goes here
    props="claims",
    languages="en",
    format="json",
)
# set your header
headers = {
    "user-agent": "bot getting claims of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
}

In [None]:
list_of_requested_entities_nl = []

In [None]:
# Requesting the claims of the NL entities that have none yet.
# Result: {term: [entities dict of request 1, entities dict of request 2, ...]}
additional_claims_nl = {}

# the ids are sent in slices of at most 50 per request
batch_size = 50

for term, no_claims_list in no_claims_nl_request.items():

    # for debugging
    print("term:",term,"|","entities:",len(no_claims_list))

    # Number of slices: ceiling division, at least 1. A single formula covers every length,
    # including exactly 50 ids, which a chain of "> 50" / "< 50" tests leaves unassigned.
    loops = max(1, (len(no_claims_list) + batch_size - 1) // batch_size)

    # for debugging
    print("loops:",loops)

    query_result_list = []

    for i in range(0,loops):
        start = i * batch_size

        # ids of this slice that were not requested before (for this or another term)
        batch = []
        for q in no_claims_list[start:start + batch_size]:
            if q not in list_of_requested_entities_nl:
                batch.append(q)
                list_of_requested_entities_nl.append(q) # remembering requested entities to prevent duplicates

        # nothing new in this slice: a request with an empty 'ids' string is pointless
        if not batch:
            continue

        # updating params: ids are joined with '|'
        params["ids"] = "|".join(batch)

        # sending a request
        d = requests.get(url,params=params,headers=headers)
        claims = d.json() # claims per request

        # responses without an 'entities' key are ignored
        if 'entities' in claims:
            query_result_list.append(claims['entities']) # saving all claims

    # for debugging
    print("actual_results:",len(query_result_list),[len(i) for i in query_result_list])

    # saving results
    additional_claims_nl[term] = query_result_list

### Merging NL claims

In [None]:
# set your path to the NL claims directory
path_to_json = '/claims_nl/'
# Only the per-term files ('{term}_claims.json') are listed. A plain '.json' filter would
# also pick up 'merged_claims_nl.json' once it has been saved into this same directory,
# and that file is keyed by term instead of holding an "entities" list.
json_files = [path_to_json + jf for jf in os.listdir(path_to_json) if jf.endswith('_claims.json')]

In [None]:
# Loading every per-term claims file into one dict: {term: list of claim dicts}.
claims_suffix = '_claims.json'

merged_nl = {}
for jf in json_files:
    file_name = os.path.basename(jf)
    # anything that is not a per-term claims file (e.g. a previously saved merged file) is skipped
    if not file_name.endswith(claims_suffix):
        continue
    with open(jf,'r') as claim_file:
        claims_per_term = json.load(claim_file)
    # The term is the file name without the '_claims.json' suffix; cutting at the first '_'
    # would truncate a term that itself contains an underscore.
    merged_nl[file_name[:-len(claims_suffix)]] = claims_per_term['entities']

In [None]:
# 104
len(merged_nl)

In [None]:
# Appending the additionally requested claims to the claims loaded from the files.
for term in additional_claims_nl:
    extra_claims = additional_claims_nl[term]
    # terms for which no request returned anything are left as they are
    if extra_claims:
        merged_nl[term].extend(extra_claims)

In [None]:
# saving all the claims
# set your path (~1,24 GB)
with open('/claims_nl/merged_claims_nl.json', 'w') as output_file:
    json.dump(merged_nl, output_file)

## Step 1.3. Re-running Step 1 with new claims

### EN

In [None]:
# Re-running the exclusion check on the additionally requested EN claims.
# 'additional_claims' maps every term to a list of per-request dicts {entity id: entity data}.
for term, lists in additional_claims.items():
    for i in lists:
        for e_id, claims in i.items():
            # The claims are read from the entity of the current iteration.
            # .get(): an entity returned without a 'claims' key contributes nothing.
            for v in claims.get('claims', {}).values():
                for l in v:
                    if 'datavalue' in l['mainsnak']:
                        # if the values of P31 or P279 match the ids to exclude,
                        # add the entity to the list 'entities_to_exlude'
                        if (l['mainsnak']['property'] == 'P31' or l['mainsnak']['property'] == 'P279') \
                        and l['mainsnak']['datavalue']['value']['id'] in to_exclude.keys():
                            entities_to_exlude.append(e_id)

In [None]:
len(entities_to_exlude)

In [None]:
# saving the list of entities that will be filtered out
# this file was created for documentation, and was not used further

with open('exclude_entities_en.txt','w') as txt_file:
    for i in entities_to_exlude:
        # write() emits the whole line in one call; writelines() expects an iterable of
        # lines and would walk through a str character by character
        txt_file.write(f"{i},\n")

### NL

In [None]:
# Re-running the exclusion check on the additionally requested NL claims.
# 'additional_claims_nl' maps every term to a list of per-request dicts {entity id: entity data}.
for term, lists in additional_claims_nl.items():
    for i in lists:
        for e_id, claims in i.items():
            # .get(): an entity returned without a 'claims' key contributes nothing
            # instead of raising a KeyError
            for v in claims.get('claims', {}).values():
                for l in v:
                    if 'datavalue' in l['mainsnak']:
                        # if the values of P31 or P279 match the ids to exclude,
                        # add the entity to the list 'entities_to_exclude_nl'
                        if (l['mainsnak']['property'] == 'P31' or l['mainsnak']['property'] == 'P279') \
                        and l['mainsnak']['datavalue']['value']['id'] in to_exclude.keys():
                            entities_to_exclude_nl.append(e_id)

In [None]:
len(entities_to_exclude_nl)

In [None]:
# saving the list of entities that will be filtered out
# this file was created for documentation, and was not used further

with open('exclude_entities_nl.txt','w') as txt_file:
    for i in entities_to_exclude_nl:
        # write() emits the whole line in one call; writelines() expects an iterable of
        # lines and would walk through a str character by character
        txt_file.write(f"{i},\n")

## Step 1.4 Rewriting the clean version

### EN

In [None]:
# Rebuilding the filtered EN results with the extended exclusion list.
# The list is turned into a set once, so every membership test is a constant-time
# lookup instead of a linear scan over the whole list.
excluded_ids_en = set(entities_to_exlude)

filtered_results_en = {}
for key,value in wd_search_results_en.items():
    filtered_results_en[key] = {
        k: v for k,v in value.items()
        if 'title' in v and 'entityterms' in v and v['title'] not in excluded_ids_en
    }

In [None]:
# set your path
with open('/wikidata_search_results_en_clean.json','w') as jf:
    json.dump(filtered_results_en, jf)

### NL

In [None]:
# merging the lists of EN and NL entities to exclude
exclude_all = entities_to_exlude + entities_to_exclude_nl

In [None]:
# Rebuilding the filtered NL results with the extended (EN + NL) exclusion list.
# The list is turned into a set once, so every membership test is a constant-time
# lookup instead of a linear scan over the whole list.
excluded_ids_all = set(exclude_all)

filtered_results_nl = {}
for key,value in wd_search_results_nl.items():
    filtered_results_nl[key] = {
        k: v for k,v in value.items()
        if 'title' in v and 'entityterms' in v and v['title'] not in excluded_ids_all
    }

In [None]:
# set your path
with open('/wikidata_search_results_nl_clean.json','w') as jf:
    json.dump(filtered_results_nl, jf)

# Step 2. Excluding entities – proper names
* Excluding names: if the target word is used in a label with uppercase AND if an entity has the properties "family name" and "given name"

### EN

In [None]:
# Files with entity literals:
# filtered_results_en
# filtered_results_nl

# Files with claims:
# all_claims – English
# all_claims_nl – Dutch

# properties
# P735 – "given name"
# P734 – "family name"

In [None]:
# Collecting the EN entities whose first label contains the capitalized target word.
names_en = []

# iterating over the "filtered_results_en" dict
# that was exported to a file 'wikidata_search_results_en_clean.json' in the previous step

for target_word, entityterms in filtered_results_en.items():

    # taking unigrams only
    # n-grams are put in "", and complex words have '-'
    if '"' not in target_word and '-' not in target_word:

        # The target word is plain text, not a regular expression: a substring test keeps
        # characters such as '.', '(' or '+' from being interpreted as regex syntax.
        capitalized_word = target_word.capitalize()

        for page_id, values in entityterms.items():
            # only the first label is inspected
            if 'label' in values['entityterms'] and \
            capitalized_word in values['entityterms']['label'][0]:
                names_en.append(values['title'])
                

In [None]:
# 70K entities have capitalized target terms in their labels
len(names_en)

In [None]:
# Collecting the EN entities that carry a capitalized target word in their label
# AND have the property "family name" (P734) or "given name" (P735).
names_to_filter = []

# set: constant-time membership tests instead of scanning the names list for every entity
names_en_set = set(names_en)

# NOTE(review): 'all_claims' is not assigned in the visible cells; presumably it is the merged
# EN claims dict ({term: [{entity id: entity data}, ...]}) — confirm before running.
for target_word, claims_lists in all_claims.items():
    for claims in claims_lists:
        for e_id, entity in claims.items():
            if e_id not in names_en_set:
                continue
            # .get(): entities stored without a 'claims' key are treated as having no properties
            entity_claims = entity.get('claims', {})
            # if there are the properties "family name" or "given name"
            if 'P734' in entity_claims or 'P735' in entity_claims:
                names_to_filter.append(e_id)

In [None]:
# excluding the entities from the results

# set: constant-time membership tests instead of scanning the list for every search hit
names_to_filter_set = set(names_to_filter)

clean_en = {}
for target_word, entityterms in filtered_results_en.items():
    clean_en[target_word] = {
        page_id: values for page_id, values in entityterms.items()
        if values['title'] not in names_to_filter_set
    }

In [None]:
# rewriting the clean file
# set your path
with open('/wikidata_search_results_en_clean.json','w') as jf:
    json.dump(clean_en, jf)

### NL

In [None]:
# Collecting the NL entities whose first label contains the capitalized target word.
names_nl = []

# iterating over the "filtered_results_nl" dict
# that was exported to a file 'wikidata_search_results_nl_clean.json' in the previous step

for target_word, entityterms in filtered_results_nl.items():

    # taking unigrams only
    # n-grams are put in "", and complex words have '-'
    if '"' not in target_word and '-' not in target_word:

        # The target word is plain text, not a regular expression: a substring test keeps
        # characters such as '.', '(' or '+' from being interpreted as regex syntax.
        capitalized_word = target_word.capitalize()

        for page_id, values in entityterms.items():
            # only the first label is inspected
            if 'label' in values['entityterms'] and \
            capitalized_word in values['entityterms']['label'][0]:
                names_nl.append(values['title'])

In [None]:
# NL entities with capitalized target terms
len(names_nl)

In [None]:
# Collecting the NL entities that carry a capitalized target word in their label
# AND have the property "family name" (P734) or "given name" (P735).
names_to_filter_nl = []

# set: constant-time membership tests instead of scanning the names list for every entity
names_nl_set = set(names_nl)

# checking both EN and NL claims (English claims first, then Dutch claims)
# NOTE(review): 'all_claims' and 'all_claims_nl' are not assigned in the visible cells;
# presumably they are the merged EN / NL claims dicts — confirm before running.
for claims_by_term in (all_claims, all_claims_nl):
    for target_word, claims_lists in claims_by_term.items():
        for claims in claims_lists:
            for e_id, entity in claims.items():
                if e_id not in names_nl_set:
                    continue
                # .get(): entities stored without a 'claims' key are treated as having no properties
                entity_claims = entity.get('claims', {})
                # if there are the properties "family name" or "given name"
                if 'P734' in entity_claims or 'P735' in entity_claims:
                    names_to_filter_nl.append(e_id)

In [None]:
len(names_to_filter_nl)

In [None]:
# excluding the entities from the results

# set: constant-time membership tests instead of scanning the list for every search hit
names_to_filter_nl_set = set(names_to_filter_nl)

clean_nl = {}
for target_word, entityterms in filtered_results_nl.items():
    clean_nl[target_word] = {
        page_id: values for page_id, values in entityterms.items()
        if values['title'] not in names_to_filter_nl_set
    }

In [None]:
# rewriting the clean file
# set your path
with open('/wikidata_search_results_nl_clean.json','w') as jf:
    json.dump(clean_nl, jf)