In [60]:
# Supplement Wikidata items about Met's objects

# Script to supplement P31 statements for Met objects in Wikidata
# Start with getting all P31 statements
# Lookup the objectName in the Met database

# SPARQL query to list Met items without the P1932 set:
# https://w.wiki/6NiH

# Consider adding references
# Example: https://www.wikidata.org/wiki/Q18177495
# retrieved - 14 May 2015
# reference URL - http://www.metmuseum.org/collection/the-collection-online/search/435976

# Written to run on JupyterLab via https://paws.wmcloud.org

import pywikibot
import pandas as pd
from pywikibot import pagegenerators as pg
from pywikibot.data.sparql import SparqlQuery
import logging

from types import SimpleNamespace

import requests
import re
import numpy as np

from joblib import Parallel, delayed

from time import perf_counter
from tqdm.notebook import tqdm

def stringify_pywikibot_target(intarget) -> str:
    '''
    Take pywikibot target object and return a string version, or QID, or printed date object
    '''
    returnstring = ''
    # print ('Type', type(intarget))
    if isinstance(intarget, str):
        return intarget
    elif isinstance(intarget, pywikibot.page._wikibase.ItemPage):
        return intarget.id
    elif isinstance(intarget, pywikibot.WbTime):
        return str(intarget)
    return None

def retrieve_claim_propqual(item, inclaimprop, inclaimtarget=None, inqualprop=None, inqualvalue=None) -> list:
    ''' Retrieve an entire Wikidata claim if property/qualifier match 
        Only works with returning strings or items, which get returned as QIDs
    '''
    returnlist = []

    item.get(force=True)
    if not item.claims.get(inclaimprop):
        return returnlist
    for statement in item.claims[inclaimprop]:
        if not inclaimtarget:
            returnlist += [stringify_pywikibot_target(statement.target)]
        elif statement.target.id == inclaimtarget:
            # Process qualifiers, if they exist
            if inqualprop and inqualprop in statement.qualifiers:
                for qual in statement.qualifiers[inqualprop]: #iterate over all P1932
                    returnqualstring = stringify_pywikibot_target(qual.target)
                    if inqualvalue:
                        if inqualvalue == returnqualstring:
                            returnlist.append(returnqualstring)
                    else:
                        returnlist.append(returnqualstring)
            elif not inqualprop:
                # inclaimprop-inclaimtarget triple matched, but no qualifier specified
                # Then just return the inclaimtarget QID
                returnlist.append(stringify_pywikibot_target(statement.target))
    return returnlist

def metid_to_objectName (in_df, metid:str) -> str:
    '''Lookup met id (integer) in Dataframe loaded from Met CSV'''
    result = None
    foundrow = in_df.loc[in_df['Object ID'] == int(metid)]
    if not foundrow.empty:
        result = foundrow.at[foundrow.index[0],'Object Name']
    return result

def handle_item (item:pywikibot.page.ItemPage):
    # global counter
    
    object_name = None
    matched_qids = None

    counter.total += 1

    # tqdm.write(f"{item.id}")

    # Grab Met ID, which should return exactly one value
    metidlist    = retrieve_claim_propqual(item, 'P3634')
    instancelist = retrieve_claim_propqual(item, 'P31')
    if len(metidlist) != 1:
        tqdm.write(f"  {item.id}: Error, Met ID should be exactly one. Instead: {metidlist}")
        return

    metid = metidlist[0]  # Extract the lone Met ID

    # Look up object name, make sure to strip whitespace!
    object_name = metid_to_objectName (metdf, metid)
    if not object_name:
        tqdm.write(f"  {item.id}: Error, object name lookup with Met database returned empty")
        return
    try:
        object_name = object_name.strip()
    except AttributeError:
        tqdm.write(f"{item.id}: AttributeError for object_name: {object_name}")
        return

    # Grab Wikidata P31 claims
    item.claims.get('P31')
    for statement in item.claims.get('P31'):  # Get instance of

        if 'P1932' in statement.qualifiers:
            for qual in statement.qualifiers['P1932']: #iterate over all
                tqdm.write(f"{item.id}: Error, found unexpected P31|{statement.target.id}|P1932 claims: {qual}")
                # object_name = stringify_pywikibot_target(qual.target)
            continue

        # Output status message
        pbar.set_postfix_str(f"{item.id}: {metid}, should be {object_name}")

        tqdm.write(f"Add {item.id}: {metid}, {statement.target.id}, '{object_name}'")

        # Add qualifier with objectName string from Met
        qualifier = pywikibot.Claim(repo, 'P1932')
        try:
            qualifier.setTarget(object_name)
        except ValueError:
            tqdm.write(f"  could not set object name to {object_name}")
            return
        statement.addQualifier(qualifier) # summary=u'Adding a qualifier.'

        counter.statements += 1

    counter.supplemented += 1
    return

if __name__ == '__main__':

    # Init counters
    counterdict = {
        'total': 0,
        'supplemented': 0,
        'statements': 0,
        'start_time': 0,
        'end_time': 0
    }
    counter = SimpleNamespace(**counterdict)

    # LOG = "met-supplement-p31.log"
    # logging.basicConfig(filename=LOG, filemode="w", level=logging.DEBUG)

    # LOAD Met CSV
    if metdf.empty:
        metdf = pd.read_csv('MetObjects.csv',low_memory=False)

    # Do SPARQL query to grab all entries of P31->Q18593264 and no qualifier
    # Return QID, Met ID
    QUERY = '''
    SELECT ?item ?thing ?metid WHERE {
      ?item wdt:P3634 ?metid .
      MINUS { ?item p:P31 [ps:P31 ?thing; pq:P1932 [] ] }
      ?item wdt:P31 ?thing .
    } 
    '''

    # For SPARQL generator
    wikidata_site = pywikibot.Site("wikidata", "wikidata")
    repo = wikidata_site.data_repository()
    generator = pg.WikidataSPARQLPageGenerator(QUERY, site=wikidata_site)

    pbar = tqdm(generator) # For progress bar
    counter.start_time = perf_counter()
    for item in pbar:
        handle_item(item)

    # Possible joblib speedup? Not happening
    # Parallel(n_jobs=8, prefer="threads")(delayed(handle_item)(i) for i in generator)
    counter.end_time = perf_counter()

    # Output final report
    tqdm.write(f"Finished")
    tqdm.write(f"Time elapsed: {counter.end_time-counter.start_time} secs")
    tqdm.write(f"Items examined: {counter.total}")
    tqdm.write(f"Items supplemented: {counter.supplemented}")
    tqdm.write(f"Statements added: {counter.statements}")

0it [00:00, ?it/s]

  Q20172025: Error, object name lookup with Met database returned empty
Q97013416: AttributeError for object_name: nan
Add Q96184317: 364154, Q11060274, 'Print'
Add Q78619827: 670942, Q3305213, 'Hanging scroll'
Add Q116244673: 451492, Q6916434, 'Mosque lamp'
Add Q116410533: 468598, Q1264081, 'Censer'
Add Q104413027: 359882, Q11060274, 'Print'
Add Q116329992: 14495, Q191851, 'Vase'
Add Q116410564: 469930, Q1066288, 'Statuette'
Add Q96802968: 271932, Q125191, 'Photograph'
Add Q19925198: 12703, Q3305213, 'Painting, miniature'
Add Q116398959: 197569, Q245005, 'Teapot'
Add Q116352455: 6069, Q57216, 'Plate'
Add Q98821665: 787917, Q125191, 'Photograph'
Add Q116371127: 2656, Q60310748, 'Cream pot'
Add Q116313540: 49694, Q4364339, 'Plaque'
Add Q116235479: 490684, Q132397, 'Pitcher'
Add Q78683300: 75079, Q3305213, 'Folio'
Add Q116445313: 823315, Q81727, 'Cup'
Add Q104413108: 751510, Q2568536, 'Cradleboard'
Add Q116273230: 454675, Q1348059, 'Panel'
Add Q116364971: 6154, Q57216, 'Plate'
Add Q11625

ERROR: Retrying failed OAuth authentication for wikidata:wikidata: The authorization headers in your request are not valid: Nonce already used: 21654220307546377761677402127


Add Q78712102: 73299, Q3305213, 'Handscrolls'
Add Q116281220: 461283, Q2537127, 'Plaquette'
Add Q20200241: 482449, Q3305213, 'Drawing'
Add Q19923564: 486720, Q3305213, 'Drawing'
Add Q116434136: 559301, Q1422576, 'Model dish, Perneb'
Add Q116289741: 248393, Q860861, 'Head of a woman'
Add Q116295944: 248777, Q1066288, 'Statuette of an actor'
Add Q116366608: 4671, Q193358, 'Ladle'
Add Q116311602: 188779, Q1066288, 'Statuette'
Add Q98821644: 716313, Q125191, 'Photograph'
Add Q116390629: 1367, Q151771, 'Candlestick'
Add Q97852593: 775514, Q11060274, 'Print'
Add Q116242861: 452027, Q860861, 'Figural vessel'
Add Q116246220: 591035, Q726826, 'Shabti, Henettawy C'
Add Q20194803: 488695, Q3305213, 'Painting'
Add Q20197879: 491747, Q22075301, 'Textile piece'
Add Q20197879: 491747, Q29639251, 'Textile piece'
Add Q116306994: 206455, Q57216, 'Plate'
Add Q20183472: 459108, Q3305213, 'Painting'
Add Q78827925: 49131, Q3305213, 'Album'
Add Q78827925: 49131, Q1173065, 'Album'
Add Q19924960: 18113, Q33052



APIError: modification-failed: String should not start or end with whitespace nor include vertical whitespace or tabs: Painting Frigate Presiden Frigate President and H.B.M. Endymion
[messages: [{'name': 'wikibase-validator-illegal-string-chars', 'parameters': ['Painting\r\nFrigate Presiden\r\nFrigate President and\r\nH.B.M. Endymion'], 'html': {'*': 'String should not start or end with whitespace nor include vertical whitespace or tabs: Painting\r\nFrigate Presiden\r\nFrigate President and\r\nH.B.M. Endymion'}}];
 servedby: mw1465;
 help: See https://www.wikidata.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes.]

In [None]:
print (type(item))

<class 'pywikibot.page._wikibase.ItemPage'>


In [60]:
# Supplement Wikidata items about Met's objects

# Script to supplement P31 statements for Met objects in Wikidata
# Start with getting all P31 statements
# Lookup the objectName in the Met database

# SPARQL query to list Met items without the P1932 set:
# https://w.wiki/6NiH

# Consider adding references
# Example: https://www.wikidata.org/wiki/Q18177495
# retrieved - 14 May 2015
# reference URL - http://www.metmuseum.org/collection/the-collection-online/search/435976

# Written to run on JupyterLab via https://paws.wmcloud.org

import pywikibot
import pandas as pd
from pywikibot import pagegenerators as pg
from pywikibot.data.sparql import SparqlQuery
import logging

from types import SimpleNamespace

import requests
import re
import numpy as np

from joblib import Parallel, delayed

from time import perf_counter
from tqdm.notebook import tqdm

def stringify_pywikibot_target(intarget) -> str:
    '''
    Take pywikibot target object and return a string version, or QID, or printed date object
    '''
    returnstring = ''
    # print ('Type', type(intarget))
    if isinstance(intarget, str):
        return intarget
    elif isinstance(intarget, pywikibot.page._wikibase.ItemPage):
        return intarget.id
    elif isinstance(intarget, pywikibot.WbTime):
        return str(intarget)
    return None

def retrieve_claim_propqual(item, inclaimprop, inclaimtarget=None, inqualprop=None, inqualvalue=None) -> list:
    ''' Retrieve an entire Wikidata claim if property/qualifier match 
        Only works with returning strings or items, which get returned as QIDs
    '''
    returnlist = []

    item.get(force=True)
    if not item.claims.get(inclaimprop):
        return returnlist
    for statement in item.claims[inclaimprop]:
        if not inclaimtarget:
            returnlist += [stringify_pywikibot_target(statement.target)]
        elif statement.target.id == inclaimtarget:
            # Process qualifiers, if they exist
            if inqualprop and inqualprop in statement.qualifiers:
                for qual in statement.qualifiers[inqualprop]: #iterate over all P1932
                    returnqualstring = stringify_pywikibot_target(qual.target)
                    if inqualvalue:
                        if inqualvalue == returnqualstring:
                            returnlist.append(returnqualstring)
                    else:
                        returnlist.append(returnqualstring)
            elif not inqualprop:
                # inclaimprop-inclaimtarget triple matched, but no qualifier specified
                # Then just return the inclaimtarget QID
                returnlist.append(stringify_pywikibot_target(statement.target))
    return returnlist

def metid_to_objectName (in_df, metid:str) -> str:
    '''Lookup met id (integer) in Dataframe loaded from Met CSV'''
    result = None
    foundrow = in_df.loc[in_df['Object ID'] == int(metid)]
    if not foundrow.empty:
        result = foundrow.at[foundrow.index[0],'Object Name']
    return result

def handle_item (item:pywikibot.page.ItemPage):
    # global counter
    
    object_name = None
    matched_qids = None

    counter.total += 1

    # tqdm.write(f"{item.id}")

    # Grab Met ID, which should return exactly one value
    metidlist    = retrieve_claim_propqual(item, 'P3634')
    instancelist = retrieve_claim_propqual(item, 'P31')
    if len(metidlist) != 1:
        tqdm.write(f"  {item.id}: Error, Met ID should be exactly one. Instead: {metidlist}")
        return

    metid = metidlist[0]  # Extract the lone Met ID

    # Look up object name, make sure to strip whitespace!
    object_name = metid_to_objectName (metdf, metid)
    if not object_name:
        tqdm.write(f"  {item.id}: Error, object name lookup with Met database returned empty")
        return
    try:
        object_name = object_name.strip()
    except AttributeError:
        tqdm.write(f"{item.id}: AttributeError for object_name: {object_name}")
        return

    # Grab Wikidata P31 claims
    item.claims.get('P31')
    for statement in item.claims.get('P31'):  # Get instance of

        if 'P1932' in statement.qualifiers:
            for qual in statement.qualifiers['P1932']: #iterate over all
                tqdm.write(f"{item.id}: Error, found unexpected P31|{statement.target.id}|P1932 claims: {qual}")
                # object_name = stringify_pywikibot_target(qual.target)
            continue

        # Output status message
        pbar.set_postfix_str(f"{item.id}: {metid}, should be {object_name}")

        tqdm.write(f"Add {item.id}: {metid}, {statement.target.id}, '{object_name}'")

        # Add qualifier with objectName string from Met
        qualifier = pywikibot.Claim(repo, 'P1932')
        try:
            qualifier.setTarget(object_name)
        except ValueError:
            tqdm.write(f"  could not set object name to {object_name}")
            return
        statement.addQualifier(qualifier) # summary=u'Adding a qualifier.'

        counter.statements += 1

    counter.supplemented += 1
    return

if __name__ == '__main__':

    # Init counters
    counterdict = {
        'total': 0,
        'supplemented': 0,
        'statements': 0,
        'start_time': 0,
        'end_time': 0
    }
    counter = SimpleNamespace(**counterdict)

    # LOG = "met-supplement-p31.log"
    # logging.basicConfig(filename=LOG, filemode="w", level=logging.DEBUG)

    # LOAD Met CSV
    if metdf.empty:
        metdf = pd.read_csv('MetObjects.csv',low_memory=False)

    # Do SPARQL query to grab all entries of P31->Q18593264 and no qualifier
    # Return QID, Met ID
    QUERY = '''
    SELECT ?item ?thing ?metid WHERE {
      ?item wdt:P3634 ?metid .
      MINUS { ?item p:P31 [ps:P31 ?thing; pq:P1932 [] ] }
      ?item wdt:P31 ?thing .
    } 
    '''

    # For SPARQL generator
    wikidata_site = pywikibot.Site("wikidata", "wikidata")
    repo = wikidata_site.data_repository()
    generator = pg.WikidataSPARQLPageGenerator(QUERY, site=wikidata_site)

    pbar = tqdm(generator) # For progress bar
    counter.start_time = perf_counter()
    for item in pbar:
        handle_item(item)

    # Possible joblib speedup? Not happening
    # Parallel(n_jobs=8, prefer="threads")(delayed(handle_item)(i) for i in generator)
    counter.end_time = perf_counter()

    # Output final report
    tqdm.write(f"Finished")
    tqdm.write(f"Time elapsed: {counter.end_time-counter.start_time} secs")
    tqdm.write(f"Items examined: {counter.total}")
    tqdm.write(f"Items supplemented: {counter.supplemented}")
    tqdm.write(f"Statements added: {counter.statements}")

0it [00:00, ?it/s]

  Q20172025: Error, object name lookup with Met database returned empty
Q97013416: AttributeError for object_name: nan
Add Q96184317: 364154, Q11060274, 'Print'
Add Q78619827: 670942, Q3305213, 'Hanging scroll'
Add Q116244673: 451492, Q6916434, 'Mosque lamp'
Add Q116410533: 468598, Q1264081, 'Censer'
Add Q104413027: 359882, Q11060274, 'Print'
Add Q116329992: 14495, Q191851, 'Vase'
Add Q116410564: 469930, Q1066288, 'Statuette'
Add Q96802968: 271932, Q125191, 'Photograph'
Add Q19925198: 12703, Q3305213, 'Painting, miniature'
Add Q116398959: 197569, Q245005, 'Teapot'
Add Q116352455: 6069, Q57216, 'Plate'
Add Q98821665: 787917, Q125191, 'Photograph'
Add Q116371127: 2656, Q60310748, 'Cream pot'
Add Q116313540: 49694, Q4364339, 'Plaque'
Add Q116235479: 490684, Q132397, 'Pitcher'
Add Q78683300: 75079, Q3305213, 'Folio'
Add Q116445313: 823315, Q81727, 'Cup'
Add Q104413108: 751510, Q2568536, 'Cradleboard'
Add Q116273230: 454675, Q1348059, 'Panel'
Add Q116364971: 6154, Q57216, 'Plate'
Add Q11625

ERROR: Retrying failed OAuth authentication for wikidata:wikidata: The authorization headers in your request are not valid: Nonce already used: 21654220307546377761677402127


Add Q78712102: 73299, Q3305213, 'Handscrolls'
Add Q116281220: 461283, Q2537127, 'Plaquette'
Add Q20200241: 482449, Q3305213, 'Drawing'
Add Q19923564: 486720, Q3305213, 'Drawing'
Add Q116434136: 559301, Q1422576, 'Model dish, Perneb'
Add Q116289741: 248393, Q860861, 'Head of a woman'
Add Q116295944: 248777, Q1066288, 'Statuette of an actor'
Add Q116366608: 4671, Q193358, 'Ladle'
Add Q116311602: 188779, Q1066288, 'Statuette'
Add Q98821644: 716313, Q125191, 'Photograph'
Add Q116390629: 1367, Q151771, 'Candlestick'
Add Q97852593: 775514, Q11060274, 'Print'
Add Q116242861: 452027, Q860861, 'Figural vessel'
Add Q116246220: 591035, Q726826, 'Shabti, Henettawy C'
Add Q20194803: 488695, Q3305213, 'Painting'
Add Q20197879: 491747, Q22075301, 'Textile piece'
Add Q20197879: 491747, Q29639251, 'Textile piece'
Add Q116306994: 206455, Q57216, 'Plate'
Add Q20183472: 459108, Q3305213, 'Painting'
Add Q78827925: 49131, Q3305213, 'Album'
Add Q78827925: 49131, Q1173065, 'Album'
Add Q19924960: 18113, Q33052



APIError: modification-failed: String should not start or end with whitespace nor include vertical whitespace or tabs: Painting Frigate Presiden Frigate President and H.B.M. Endymion
[messages: [{'name': 'wikibase-validator-illegal-string-chars', 'parameters': ['Painting\r\nFrigate Presiden\r\nFrigate President and\r\nH.B.M. Endymion'], 'html': {'*': 'String should not start or end with whitespace nor include vertical whitespace or tabs: Painting\r\nFrigate Presiden\r\nFrigate President and\r\nH.B.M. Endymion'}}];
 servedby: mw1465;
 help: See https://www.wikidata.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes.]