In [None]:
# Supplement Wikidata items about Met's objects

# Script to supplement P31 statements for Met objects in Wikidata
# Start by getting all P31 statements
# Look up the objectName in the Met database

# SPARQL query to list Met items without the P1932 qualifier set:
# https://w.wiki/6NiH

# Consider adding references
# Example: https://www.wikidata.org/wiki/Q18177495
# retrieved - 14 May 2015
# reference URL - http://www.metmuseum.org/collection/the-collection-online/search/435976

# Written to run on JupyterLab via https://paws.wmcloud.org

import pywikibot
import pandas as pd
from pywikibot import pagegenerators as pg
from pywikibot.data.sparql import SparqlQuery
import logging

from types import SimpleNamespace

import requests
import re
import numpy as np

from joblib import Parallel, delayed

from time import perf_counter
from tqdm.notebook import tqdm

def stringify_pywikibot_target(intarget) -> "str | None":
    '''
    Take a pywikibot target object and return a plain-string version of it.

    Parameters:
        intarget: the target value to convert. Handled types are str,
                  pywikibot.page.ItemPage and pywikibot.WbTime.

    Returns:
        - the string itself for a str target
        - the QID (the .id attribute) for an ItemPage target
        - str(intarget) for a WbTime target
        - None for None or for any other, unsupported type
    '''
    # Nothing to stringify; answer before any type checks are needed
    if intarget is None:
        return None
    if isinstance(intarget, str):
        return intarget
    # Use the same public class path as the annotations elsewhere in this
    # file, rather than the private pywikibot.page._wikibase module
    if isinstance(intarget, pywikibot.page.ItemPage):
        return intarget.id
    if isinstance(intarget, pywikibot.WbTime):
        return str(intarget)
    return None

def retrieve_claim_propqual(item:pywikibot.page.ItemPage, inclaimprop, inclaimtarget=None, inqualprop=None, inqualvalue=None) -> list:
    ''' Retrieve values from an item's Wikidata claims if property/qualifier match
        Only works with returning strings or items, which get returned as QIDs
        (values are converted with stringify_pywikibot_target)

        The item is re-fetched with item.get(force=True) on every call.

        Parameters:
            item:          the Wikidata item to read
            inclaimprop:   property ID of the statements to look at, e.g. 'P31'
            inclaimtarget: optional QID; only statements with this target are used
            inqualprop:    optional qualifier property ID to read from matching statements
            inqualvalue:   optional string; only qualifier values equal to it are returned

        Returns a list:
            - inclaimtarget not given: the stringified target of every
              inclaimprop statement
            - inclaimtarget given, inqualprop not given: the target QID of each
              statement whose target matches inclaimtarget
            - inclaimtarget and inqualprop given: the stringified inqualprop
              qualifier values of each matching statement, limited to those
              equal to inqualvalue when that is given
            - empty list if the item has no inclaimprop statements
    '''
    returnlist = []

    item.get(force=True)
    statements = item.claims.get(inclaimprop)
    if not statements:
        return returnlist

    for statement in statements:
        if not inclaimtarget:
            returnlist.append(stringify_pywikibot_target(statement.target))
            continue

        # A target without an .id (a string, a date, or no target at all)
        # cannot match a QID: skip it rather than raise AttributeError
        if getattr(statement.target, 'id', None) != inclaimtarget:
            continue

        if not inqualprop:
            # inclaimprop-inclaimtarget matched, but no qualifier specified
            # Then just return the inclaimtarget QID
            returnlist.append(stringify_pywikibot_target(statement.target))
            continue

        # Process qualifiers, if they exist
        if inqualprop in statement.qualifiers:
            for qual in statement.qualifiers[inqualprop]:
                returnqualstring = stringify_pywikibot_target(qual.target)
                # With no inqualvalue every qualifier value is wanted
                if not inqualvalue or inqualvalue == returnqualstring:
                    returnlist.append(returnqualstring)
    return returnlist

def metid_to_objectName (in_df, metid:str) -> "str | None":
    '''
    Look up a Met id in the Dataframe loaded from the Met CSV

    Parameters:
        in_df: Dataframe with 'Object ID' and 'Object Name' columns
        metid: the Met object ID; it is converted with int() before being
               compared against the 'Object ID' column

    Returns:
        The 'Object Name' of the first matching row with surrounding
        whitespace stripped, or None when:
            - metid is missing or is not a whole number
            - no row has that Object ID
            - the Object Name found is not a string (e.g. an empty CSV cell)
    '''
    try:
        wanted_id = int(metid)
    except (TypeError, ValueError):
        # A missing or malformed ID cannot match any row; report "not found"
        # instead of raising and stopping the caller's whole run
        return None

    foundrow = in_df.loc[in_df['Object ID'] == wanted_id]
    if foundrow.empty:
        return None

    result = foundrow.at[foundrow.index[0], 'Object Name']
    return result.strip() if isinstance(result, str) else None


def handle_item (item:pywikibot.page.ItemPage) -> bool:
    '''
    Function to read in the P31 statements for the item, and add a qualifier
    to each of them:
    P1932 -> <Met object name>

    This uses module-level globals set up by the main script:
        metdf   - Dataframe that has been read in from the Met CSV
        repo    - data repository used to create the qualifier claim
        counter - running totals (total, supplemented, statements)
        pbar    - progress bar, used for status output

    It will abort and return False if:
        - The item does not have exactly one usable Met object ID (P3634);
          more than one probably means the item needs splitting up
        - Bad string data from The Met, if the object name is just missing
        - Bad string data from The Met, such as vertical whitespace or other odd characters
        - The object name is refused as a qualifier value (ValueError)

    A P31 statement that already has a P1932 qualifier is reported and left
    untouched, to be safe; the item's other P31 statements are still handled.

    Returns True when the item was processed without one of the errors above.
    counter.supplemented is only incremented when at least one qualifier was
    actually added.
    '''
    counter.total += 1

    # Grab Met ID, which should return exactly one value.
    # This call re-fetches the item, so item.claims is current from here on.
    metidlist = retrieve_claim_propqual(item, 'P3634')

    # A value that could not be turned into a string comes back as None,
    # which is just as unusable as a missing or duplicated ID
    if len(metidlist) != 1 or metidlist[0] is None:
        tqdm.write(f"  {item.id}: Error, Met ID should be exactly one. Instead: {metidlist}")
        return False

    metid = metidlist[0]  # Extract what should be solo Met ID

    # Look up object name in the Met data
    object_name = metid_to_objectName (metdf, metid)
    if not object_name:
        tqdm.write(f"  {item.id}: {metid}, error object name lookup with Met database returned empty")
        return False

    # Wikidata doesn't like leading or trailing whitespace, so get rid of it
    try:
        object_name = object_name.strip()
    except AttributeError:
        tqdm.write(f"{item.id}: AttributeError for object_name: {object_name}")
        return False

    # Wikidata will not allow vertical whitespace, abort and report if found
    forbidden_chars_list = ['\n', '\r', '\f', '\t', '\v']
    if any(c in object_name for c in forbidden_chars_list):
        tqdm.write(f"{item.id}: Vertical or forbidden whitespace in object_name: {object_name}")
        return False

    added_any = False

    # Walk the P31 claims; an item without any simply yields nothing to do
    for statement in item.claims.get('P31', []):

        # The target may lack an .id (e.g. no target at all); only used for messages
        target_id = getattr(statement.target, 'id', None)

        if 'P1932' in statement.qualifiers:
            for qual in statement.qualifiers['P1932']: #iterate over all
                tqdm.write(f"{item.id}: Error, found unexpected P31|{target_id}|P1932 claims: {qual}")
            continue

        # Output status message
        pbar.set_postfix_str(f"{item.id}: {metid}, should be {object_name}")

        tqdm.write(f"Add {item.id}: {metid}, {target_id}, '{object_name}'")

        # Add qualifier with objectName string from Met
        qualifier = pywikibot.Claim(repo, 'P1932')
        try:
            qualifier.setTarget(object_name)
        except ValueError:
            tqdm.write(f"  could not set object name to {object_name}")
            return False
        statement.addQualifier(qualifier) # summary=u'Adding a qualifier.'

        counter.statements += 1
        added_any = True

    # Only count the item as supplemented if something was really added
    if added_any:
        counter.supplemented += 1
    return True

if __name__ == '__main__':

    # Timestamps and running tallies, every one starting at zero
    counterdict = dict.fromkeys(
        ('metdf_start_time', 'metdf_end_time',
         'sparql_start_time', 'sparql_end_time',
         'total', 'supplemented', 'statements',
         'start_time', 'end_time'),
        0,
    )
    counter = SimpleNamespace(**counterdict)

    # Optional file logging, currently switched off:
    # LOG = "met-supplement-p31.log"
    # logging.basicConfig(filename=LOG, filemode="w", level=logging.DEBUG)

    # Keep the Met dataframe from an earlier run if it is still defined;
    # otherwise read the CSV and report how long that took
    try:
        metdf
    except NameError:
        counter.metdf_start_time = perf_counter()
        metdf = pd.read_csv('MetObjects.csv', low_memory=False)
        counter.metdf_end_time = perf_counter()
        load_secs = counter.metdf_end_time - counter.metdf_start_time
        tqdm.write(f"Met database load time: {load_secs:.2f} secs")
    else:
        tqdm.write("Met database already loaded, not reloading")

    # Items that have a Met ID (P3634) and a P31 statement which does not
    # carry a P1932 qualifier; the result is capped at 30000 rows
    QUERY = '''
    SELECT ?item ?thing ?metid WHERE {
      ?item wdt:P3634 ?metid .
      MINUS { ?item p:P31 [ps:P31 ?thing; pq:P1932 [] ] }
      ?item wdt:P31 ?thing .
    } LIMIT 30000
    '''

    # Site and repository objects, needed by the SPARQL generator and for edits
    wikidata_site = pywikibot.Site("wikidata", "wikidata")
    repo = wikidata_site.data_repository()

    # Build the item generator from the query and time that step
    counter.sparql_start_time = perf_counter()
    generator = pg.WikidataSPARQLPageGenerator(QUERY, site=wikidata_site)
    counter.sparql_end_time = perf_counter()
    sparql_secs = counter.sparql_end_time - counter.sparql_start_time
    tqdm.write(f"SPARQL query time: {sparql_secs:.2f} secs")

    # Process each item in turn, with a progress bar wrapped around the generator
    pbar = tqdm(generator)
    counter.start_time = perf_counter()
    for item in pbar:
        handle_item(item)
    counter.end_time = perf_counter()

    # A threaded joblib run was tried and gave no speedup, probably because of
    # pywikibot blocking or the GIL:
    # Parallel(n_jobs=8, prefer="threads")(delayed(handle_item)(i) for i in generator)

    # Final report, one line per figure
    run_secs = counter.end_time - counter.start_time
    report_lines = (
        "Finished",
        f"Time elapsed: {run_secs:.2f} secs",
        f"Items examined: {counter.total}",
        f"Items supplemented: {counter.supplemented}",
        f"Statements added: {counter.statements}",
    )
    for report_line in report_lines:
        tqdm.write(report_line)