In [13]:
import csv
from time import sleep
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON


endpoint_url = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(endpoint_url)

def get_metadata(wiki_id):
    # Construct SPARQL query for the given batch of individuals
    sparql_query = """
    SELECT ?subject ?subjectLabel 
        ?occupation ?occupationLabel 
        ?dateOfBirth ?dateOfDeath 
        ?gender ?genderLabel 
        ?countryOfBirth ?countryOfBirthLabel 
        ?countryOfDeath ?countryOfDeathLabel 
        ?countryOfCitizenship ?countryOfCitizenshipLabel 
        ?birthCity ?birthCityLabel 
        ?deathCity ?deathCityLabel 
        ?birthCityPlace ?birthCityPlaceLabel 
        ?deathCityPlace ?deathCityPlaceLabel 
    WHERE {
    BIND(wd:%s AS ?subject)
    
    OPTIONAL { ?subject wdt:P106 ?occupation. }
    OPTIONAL { ?subject wdt:P569 ?dateOfBirth. }
    OPTIONAL { ?subject wdt:P570 ?dateOfDeath. }
    OPTIONAL { ?subject wdt:P21 ?gender. }
    OPTIONAL { ?subject wdt:P19 ?countryOfBirth. }
    OPTIONAL { ?subject wdt:P20 ?countryOfDeath. }
    OPTIONAL { ?subject wdt:P27 ?countryOfCitizenship. }
    OPTIONAL { ?subject wdt:P19 ?birthCity. }
    OPTIONAL { ?subject wdt:P20 ?deathCity. }
    OPTIONAL { ?subject wdt:P19 ?birthCityPlace. }
    OPTIONAL { ?subject wdt:P20 ?deathCityPlace. }
    
    SERVICE wikibase:label { bd:serviceParam wikibase:language 'en'. }
    }
    """% (wiki_id)

    return sparql_query



In [18]:
def extract_values(data):
    """
    Extracts only the values from the given dictionary.

    Args:
        data (dict): Dictionary containing Wikidata information.

    Returns:
        dict: Dictionary containing only the values.
    """
    extracted_data = {}
    for key, value in data.items():
        if isinstance(value, dict) and 'value' in value:
            extracted_data[key] = value['value']
    return extracted_data



def extract_info(wiki_id):
    # Set query and retrieve results
    sparql.setQuery(get_metadata(wiki_id))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    results = results["results"]["bindings"]
    # results = [
    #         result["item"]["value"] for result in results["results"]["bindings"]
    #     ]

    results = [extract_values(result) for result in results]
    
    return results

results = extract_info("Q859")

In [25]:
from multiprocessing import Pool


chunk = ["Q859", "Q859"]

with Pool(8) as p:
    results = list(tqdm(p.imap(extract_info, chunk), total=len(chunk)))

with open(f"raw_data/extracts/results_{chunk_index}.json", "w") as f:
    json.dump(results, f)

Process SpawnPoolWorker-9:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.13_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/homebrew/Cellar/python@3.10/3.10.13_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/homebrew/Cellar/python@3.10/3.10.13_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/homebrew/Cellar/python@3.10/3.10.13_1/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'extract_info' on <module '__main__' (built-in)>
Process SpawnPoolWorker-10:
Traceback (most recent call last):
  File "/opt/homebrew/Cellar/python@3.10/3.10.13_1/F

KeyboardInterrupt: 

In [24]:
def process_chunk(chunk, chunk_index):
    with Pool(8) as p:
        results = list(tqdm(p.imap(final_function, chunk), total=len(chunk)))
    with open(f"raw_data/extracts/results_{chunk_index}.json", "w") as f:
        json.dump(results, f)


['Q859', 'Q859']

  0%|          | 0/2 [00:00<?, ?it/s]


TypeError: 'list' object is not callable

In [22]:
def process_chunk(chunk, chunk_index):
  
    with open(f"raw_data/extracts/results_{chunk_index}.json", "w") as f:
        json.dump(results, f)

[{'subject': 'http://www.wikidata.org/entity/Q859',
  'occupation': 'http://www.wikidata.org/entity/Q36180',
  'dateOfBirth': '-0428-01-01T00:00:00Z',
  'dateOfDeath': '-0348-01-01T00:00:00Z',
  'gender': 'http://www.wikidata.org/entity/Q6581097',
  'countryOfBirth': 'http://www.wikidata.org/entity/Q1524',
  'countryOfDeath': 'http://www.wikidata.org/entity/Q1524',
  'countryOfCitizenship': 'http://www.wikidata.org/entity/Q844930',
  'birthCity': 'http://www.wikidata.org/entity/Q1524',
  'deathCity': 'http://www.wikidata.org/entity/Q1524',
  'birthCityPlace': 'http://www.wikidata.org/entity/Q1524',
  'deathCityPlace': 'http://www.wikidata.org/entity/Q1524',
  'subjectLabel': 'Plato',
  'occupationLabel': 'writer',
  'genderLabel': 'male',
  'countryOfBirthLabel': 'Athens',
  'countryOfDeathLabel': 'Athens',
  'countryOfCitizenshipLabel': 'Classical Athens',
  'birthCityLabel': 'Athens',
  'deathCityLabel': 'Athens',
  'birthCityPlaceLabel': 'Athens',
  'deathCityPlaceLabel': 'Athens'},