# Required packages
[qwikidata](https://qwikidata.readthedocs.io/en/stable/)
[json](https://docs.python.org/3/library/json.html)
[bz2](https://docs.python.org/3/library/bz2.html)
# Required data
[wikidata dump](https://dumps.wikimedia.org/wikidatawiki/entities/)

In [None]:
import time
import json
import bz2
import sys
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.utils import dump_entities_to_json

In [None]:
WIKI_DATA_FULL = '../latest-all.json.bz2' #path to the full Wikidata dump that is read
WIKI_DATA_FILTERED = '../filtered_politician.json.bz2' #path the filtered politician dump is written to
P_OCCUPATION = "P106" #occupation property ID in Wikidata
Q_POLITICIAN = "Q82955" #politician item ID in Wikidata
P_COUNTRY = "P27" #country of recognized citizenship property ID in Wikidata

In [None]:
def has_occupation_politician(item: WikidataItem, truthy: bool = True) -> bool:
    """Tell whether *item* lists politician among its occupations.

    Args:
        item: The Wikidata item to inspect.
        truthy: When True (the default) the occupation claims are read with
            ``get_truthy_claim_group``; otherwise with ``get_claim_group``.

    Returns:
        True if ``Q_POLITICIAN`` is the value id of at least one
        ``P_OCCUPATION`` claim whose main snak has snaktype ``"value"``,
        False otherwise.
    """
    fetch_claims = item.get_truthy_claim_group if truthy else item.get_claim_group

    found_qids = []
    for occupation_claim in fetch_claims(P_OCCUPATION):
        # Only snaks of type "value" are inspected for an id.
        if occupation_claim.mainsnak.snaktype != "value":
            continue
        found_qids.append(occupation_claim.mainsnak.datavalue.value["id"])

    return Q_POLITICIAN in found_qids

In [None]:
# Create a WikidataJsonDump for the full dump located at WIKI_DATA_FULL.
# The path is also kept in wjd_dump_path.
wjd_dump_path = WIKI_DATA_FULL
wjd = WikidataJsonDump(wjd_dump_path)

In [None]:
# Re-create the WikidataJsonDump reader so this cell can be run on its own.
wjd_dump_path = WIKI_DATA_FULL
wjd = WikidataJsonDump(wjd_dump_path)

P_DATE_OF_DEATH = "P570"  # items carrying this claim are skipped
REPORT_EVERY = 10000  # entities between progress reports / flushes to disk


def _flush_politicians(out_file, batch):
    """Write *batch* to *out_file* as UTF-8 JSON lines and return its size.

    An empty batch writes nothing, so the output never contains blank lines.
    """
    if batch:
        out_file.write(("\n".join(map(json.dumps, batch)) + "\n").encode("utf-8"))
    return len(batch)


# Stream the full dump once and write one JSON line (qid, English label,
# nationality) for every item that has occupation politician and no P570 claim.
politicians = []  # records collected since the last flush
n_dumped = 0  # records already written to WIKI_DATA_FILTERED
t1 = time.time()
with bz2.open(WIKI_DATA_FILTERED, "wb") as d_file:
    for ii, entity_dict in enumerate(wjd):
        if entity_dict["type"] == "item":
            entity = WikidataItem(entity_dict)
            try:
                if (
                    P_DATE_OF_DEATH not in entity_dict["claims"]
                    and has_occupation_politician(entity)
                ):
                    try:
                        politicians.append(
                            {
                                "qid": entity_dict["id"],
                                "name": entity_dict["labels"]["en"]["value"],
                                # the last listed citizenship claim is used
                                "nationality": entity_dict["claims"][P_COUNTRY][-1][
                                    "mainsnak"
                                ]["datavalue"]["value"]["id"],
                            }
                        )
                    except KeyError as e:
                        # A key needed for the record (English label,
                        # citizenship claim or its value) is missing: skip it.
                        print(e, entity_dict["id"])
            except Exception as exc:
                # Best effort: report the actual error (not the Exception
                # class) together with the entity id, then keep scanning.
                print(repr(exc), entity_dict.get("id"))
        if ii % REPORT_EVERY == 0:
            n_dumped += _flush_politicians(d_file, politicians)
            politicians = []
            dt = time.time() - t1
            print(
                "Dumped {} politicians among {} entities [entities/s: {:.2f}]".format(
                    # cumulative count; the rate is guarded against dt == 0
                    n_dumped, ii, ii / dt if dt > 0 else 0.0
                )
            )
        # To smoke-test on a prefix of the dump, add here:
        #     if ii > 10000:
        #         break

    # Write the records collected after the last progress report; without
    # this the tail of the dump would never reach the output file.
    n_dumped += _flush_politicians(d_file, politicians)
    politicians = []

print("Finished: dumped {} politicians".format(n_dumped))

In [5]:


from qwikidata.sparql import return_sparql_query_results

# Send a SPARQL query to the Wikidata query service and get the full result back.
# The service can also be used directly at https://query.wikidata.org/
# NOTE(review): the author expected this to be much faster than scanning the
# dump; that has not been measured here.

# The query is adapted from the Wikidata example
# "List of countries by age of the head of government".
# Note that head of state and head of government are different; this query
# selects the head of government (P6).
# Only the current leader is selected: statements that carry a pq:P582
# qualifier are filtered out.
sparql_query = """
# head of government
SELECT DISTINCT ?country ?countryLabel ?hgovernment ?hgovernmentLabel
{
  ?country wdt:P31 wd:Q3624078 .
  FILTER NOT EXISTS {?country wdt:P31 wd:Q3024240}
  ?country p:P6 ?statement .    
  ?statement ps:P6 ?hgovernment .
  ?country wdt:P6 ?hgovernment .
  FILTER NOT EXISTS { ?statement pq:P582 ?x } 
#   ?hgovernment wdt:P570 wd:Q543287 . # only living people TODO
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""
# NOTE(review): res presumably holds the parsed JSON response (a dict), not a
# file on disk; confirm against the qwikidata documentation.
# The query service website can also export results in other formats.
res = return_sparql_query_results(sparql_query)