In [93]:
# Resolve and replace Wikidata items about Met's objects

# Script to resolve and replace P31 statements for Met objects in Wikidata
# Start with getting all P31 -> item of collection or exhibition (Q18593264)
# Assume that there is a qualifier such as object named as (P1932) -> "Halberd"
# (A separate bot has been designed to use Met CSV database to add these)

# SPARQL query to list these generic objects that have P1932 set:
# https://w.wiki/6Ndi

# This script will use two crosswalk databases, stored on Wikidata:
# https://www.wikidata.org/wiki/Wikidata:GLAM/Metropolitan_Museum_of_Art/glamingest/objectName
# https://www.wikidata.org/wiki/Wikidata:GLAM/Metropolitan_Museum_of_Art/glamingest/objectName_regex

# A report on a run done on February 24, 2023: https://gitlab.wikimedia.org/-/snippets/60

# Code to read in a wikitable of a crosswalk database for Met object name to Wikidata item matching
# From: https://github.com/fuzheado/glamingest/blob/master/pywikibottest.py

# For running in JupyterLab at https://paws.wmcloud.org
# !pip install wikitables
# !pip install tabulate

import pywikibot
import pandas as pd
from pywikibot import pagegenerators as pg
from pywikibot.data.sparql import SparqlQuery
import logging

import requests
import re
import numpy as np
import mwparserfromhell as mwp

from tabulate import tabulate
from wikitables import import_tables, WikiTable
from wikitables.util import ftag

from tqdm.notebook import tqdm

# Wikidata pages that hold the crosswalk wikitables: one keyed on exact Met
# object names, one keyed on regular expressions.
crosswalk_page_name = 'Wikidata:GLAM/Metropolitan_Museum_of_Art/glamingest/objectName'
crosswalk_regex_page_name = 'Wikidata:GLAM/Metropolitan Museum of Art/glamingest/objectName regex'

# MediaWiki API endpoint the crosswalk pages are fetched from.
wikidata_api_url = 'https://www.wikidata.org/w/api.php'

def import_tables_from_wikitext(wikitext, title=None):
    '''
    Parse every wikitable found in a string of wikitext.

    Each table tag is wrapped in a WikiTable named "<title>[<index>]", where
    <index> is the table's position in the text. When no title is given the
    name prefix falls back to 'generic'.

    Returns a list of WikiTable objects (empty when the text has no tables).
    '''
    label = 'generic' if title is None else title

    # Pull out the table tags, then wrap each one together with its position
    table_tags = mwp.parse(wikitext).filter_tags(matches=ftag('table'))
    return [
        WikiTable('%s[%s]' % (label, position), tag)
        for position, tag in enumerate(table_tags)
    ]

def import_tables_from_url(api_url, title):
    '''
    Fetch a wiki page through the MediaWiki API and parse its wikitables.

    api_url - URL of the wiki's api.php endpoint
    title   - title of the page to fetch

    Returns the list of WikiTable objects built by import_tables_from_wikitext.

    Raises requests.HTTPError when the API answers with an error status, and
    wikitables' ArticleNotFound when the API reports no page for the title.
    '''
    # The module never imports ArticleNotFound, so the raise below used to die
    # with a NameError; pull it from the wikitables client module here.
    from wikitables.client import ArticleNotFound

    params = {'prop': 'revisions',
              'format': 'json',
              'action': 'query',
              'explaintext': '',
              'titles': title,
              'rvprop': 'content'}

    r = requests.get(api_url, params=params)
    r.raise_for_status()
    pages = r.json()["query"]["pages"]

    # use key from first result in 'pages' array
    pageid = list(pages.keys())[0]
    if pageid == '-1':
        raise ArticleNotFound('no matching articles returned')

    page = pages[pageid]
    # Wikitext of the first revision entry in the response
    body = page['revisions'][0]['*']

    return import_tables_from_wikitext(body, page['title'])


def import_tables_from_url_full(api_url, title):
    '''
    Self-contained variant of import_tables_from_url: fetch a wiki page through
    the MediaWiki API and parse its wikitables inline.

    api_url - URL of the wiki's api.php endpoint
    title   - title of the page to fetch

    Returns a list of WikiTable objects named "<page title>[<index>]".

    Raises requests.HTTPError when the API answers with an error status, and
    wikitables' ArticleNotFound when the API reports no page for the title.
    '''
    # The module never imports ArticleNotFound, so the raise below used to die
    # with a NameError; pull it from the wikitables client module here.
    from wikitables.client import ArticleNotFound

    params = {'prop': 'revisions',
              'format': 'json',
              'action': 'query',
              'explaintext': '',
              'titles': title,
              'rvprop': 'content'}

    r = requests.get(api_url, params=params)
    r.raise_for_status()
    pages = r.json()["query"]["pages"]

    # use key from first result in 'pages' array
    pageid = list(pages.keys())[0]
    if pageid == '-1':
        raise ArticleNotFound('no matching articles returned')

    page = pages[pageid]
    # Wikitext of the first revision entry in the response
    body = page['revisions'][0]['*']

    ## parse for tables
    raw_tables = mwp.parse(body).filter_tags(matches=ftag('table'))

    return [
        WikiTable('%s[%s]' % (page['title'], idx), table)
        for idx, table in enumerate(raw_tables)
    ]


def cross_lookup_object(in_df:pd.DataFrame, objectName:str, regex_search=False):
    '''
    Lookup a Met object name in crosswalk database that is a Dataframe
    Should have the following columns: Object Name, qid, extrastatement, extraqualifier (mostly unused)

    in_df        - crosswalk Dataframe
    objectName   - Met object name to look up
    regex_search - when True, treat each 'Object Name' cell as a regular
                   expression matched (re.match, i.e. anchored at the start)
                   against objectName; otherwise compare for equality

    Returns {'qid': [<QID>, ...]} when something matched, else an empty dict.
    In regex mode the list comes from the LAST matching row only; in exact
    mode it is every matching row's qid followed by every matching row's
    extrastatement. Non-string cells (NaN) are dropped from the list.
    '''
    return_dict = {}
    found_qids = None

    if regex_search:
        # re.match raises TypeError on a non-string subject, and an object
        # without a name cannot match any pattern anyway.
        if isinstance(objectName, str):
            for _, row in in_df.iterrows():
                regex = row['Object Name']
                # Blank crosswalk cells arrive as NaN; they are not patterns
                if not isinstance(regex, str):
                    continue
                if re.match(regex, objectName):
                    # Second and third columns by position (qid, extrastatement).
                    # .iloc keeps this positional; plain row[1] is deprecated
                    # on a label-indexed Series.
                    found_qids = [item for item in (row.iloc[1], row.iloc[2]) if isinstance(item, str)]
    else:
        # Make new DF from crosswalk matches to objectName, but may be more than one match or qid+extrastatement
        sliced_df = in_df[in_df['Object Name'] == objectName]
        # Make list out of qids, with possible nan values
        found_qids = sliced_df['qid'].values.tolist() + sliced_df['extrastatement'].values.tolist()
        # Clean the list of NaN values
        found_qids = [item for item in found_qids if isinstance(item, str)]

    if found_qids:
        return_dict['qid'] = found_qids
        # TODO: handle possible qualifiers

    return return_dict

def cross_lookup_object_combined(exact_df:pd.DataFrame, regex_df:pd.DataFrame, objectName:str):
    '''
    Look an object name up in both crosswalks, exact matches taking priority
        exact_df - consulted first, by equality on the object name
        regex_df - consulted only when the exact lookup comes back empty,
                   using regular expression matching
    Returns whatever cross_lookup_object returned for the lookup that was used
    '''
    exact_hit = cross_lookup_object(exact_df, objectName)
    if exact_hit:
        return exact_hit
    return cross_lookup_object(regex_df, objectName, regex_search=True)

def stringify_pywikibot_target(intarget):
    '''
    Take pywikibot target object and return a string version, or QID, or printed date object

    - str is returned unchanged
    - pywikibot.ItemPage is returned as its id attribute
    - pywikibot.WbTime is returned as str(intarget)
    - anything else (including None) returns None
    '''
    if isinstance(intarget, str):
        return intarget
    # Use the public class name rather than the private
    # pywikibot.page._wikibase module path.
    if isinstance(intarget, pywikibot.ItemPage):
        return intarget.id
    if isinstance(intarget, pywikibot.WbTime):
        return str(intarget)
    return None

def retrieve_claim_propqual(item, inclaimprop, inclaimtarget=None, inqualprop=None, inqualvalue=None):
    ''' Retrieve an entire Wikidata claim if property/qualifier match
        Only works with returning strings or items, which get returned as QIDs

        item          - pywikibot item; item.get(force=True) is called first
        inclaimprop   - property ID of the statements to inspect
        inclaimtarget - if given, only statements whose target id equals this
        inqualprop    - if given, return values of this qualifier instead of
                        the statement target
        inqualvalue   - if given, only qualifier values equal to this

        Returns a list:
          - no inclaimtarget: the stringified target of every statement
          - inclaimtarget, no inqualprop: the stringified target of each
            matching statement
          - inclaimtarget and inqualprop: the stringified qualifier values of
            each matching statement (filtered by inqualvalue when given)
        Empty list when the item has no statements for inclaimprop.
    '''
    returnlist = []

    item.get(force=True)
    statements = item.claims.get(inclaimprop)
    if not statements:
        return returnlist

    for statement in statements:
        if not inclaimtarget:
            returnlist.append(stringify_pywikibot_target(statement.target))
            continue

        # A target may be None or a non-item value with no .id attribute;
        # treat those as "does not match" instead of raising AttributeError.
        if getattr(statement.target, 'id', None) != inclaimtarget:
            continue

        if not inqualprop:
            # inclaimprop-inclaimtarget triple matched, but no qualifier specified
            # Then just return the inclaimtarget QID
            returnlist.append(stringify_pywikibot_target(statement.target))
        elif inqualprop in statement.qualifiers:
            # Process qualifiers, if they exist
            for qual in statement.qualifiers[inqualprop]:
                returnqualstring = stringify_pywikibot_target(qual.target)
                if not inqualvalue or inqualvalue == returnqualstring:
                    returnlist.append(returnqualstring)
    return returnlist
    
if __name__ == '__main__':
    # Script body: load both crosswalks, walk a random sample of items whose
    # P31 is the generic Q18593264, and for every item whose P1932 object name
    # maps to one or more QIDs add specific P31 statements and drop the generic one.

    # pandas deprecates passing a literal JSON string to read_json; a file-like
    # wrapper is accepted by old and new versions alike.
    from io import StringIO

    # Load crosswalk tables via API call, JSON
    cross_tables = import_tables_from_url(wikidata_api_url, crosswalk_page_name)
    cross_regex_tables = import_tables_from_url(wikidata_api_url, crosswalk_regex_page_name)

    # Load as a Dataframe, but replace QID with qid as column name
    # Blank / whitespace-only cells become NaN
    cross_df = pd.read_json(StringIO(cross_tables[0].json())).rename(columns={'QID': 'qid'}).replace(r'^\s*$', np.nan, regex=True)
    cross_regex_df = pd.read_json(StringIO(cross_regex_tables[0].json())).rename(columns={'QID': 'qid'}).replace(r'^\s*$', np.nan, regex=True)

    # print (cross_df.sample(3))
    # print (cross_regex_df.sample(3))
    # print ('Test cross_lookup_object_combined')
    # testobjects = [
    #     'Foo', 'Bar', 'Jar', 'Jug', 'Wine cooler', 'Linen fragment', 'Figurine of Sa', 'Jug with waterspout'
    # ]
    # for obj in testobjects:
    #     print (obj, cross_lookup_object_combined(cross_df, cross_regex_df, obj))

    # Do SPARQL query to grab a random sample of entries of P31->Q18593264
    # that also carry a Met ID (P3634)
    # Return QID, Met ID
    # Create generator
    QUERY = '''
    SELECT ?item ?metid WHERE {
      ?item wdt:P31 wd:Q18593264 .
      ?item wdt:P3634 ?metid .
    } ORDER BY RAND() LIMIT 1000
    '''

    # For SPARQL generator
    wikidata_site = pywikibot.Site("wikidata", "wikidata")
    repo = wikidata_site.data_repository()
    generator = pg.WikidataSPARQLPageGenerator(QUERY, site=wikidata_site)

    # Other method to use SPARQL in one shot
    # wikiquery = SparqlQuery()
    # data = wikiquery.select(QUERY)

    total_counter = 0
    replaced_counter = 0

    pbar = tqdm(generator)
    for item in pbar:
        object_name = None
        matched_qids = None
        generic_statement = None

        # Grab Met ID, which should return exactly one value
        # (this call also refreshes item.claims via item.get(force=True))
        metidlist = retrieve_claim_propqual(item, 'P3634')
        if len(metidlist) != 1:
            # Report the list itself: metid is not assigned yet at this point
            tqdm.write(f"{item.id}: Error, Met ID should be exactly one. Instead: {metidlist}")
            continue

        metid = metidlist[0]  # Extract the lone Met ID

        total_counter += 1

        # Retrieve object name from qualifier
        # NOW DONE MORE EFFICIENTLY BELOW
        # object_result = retrieve_claim_propqual(item, 'P31', 'Q18593264', 'P1932')
        # object_name = object_result[0] if len(object_result) == 1 else None

        # Grab Wikidata P31 claims; default to an empty list so an item that
        # has lost its P31 since the query ran is skipped instead of crashing
        for statement in item.claims.get('P31', []):  # Get instance of
            # See if it's item of collection or exhibition; a statement whose
            # target is None has no id and is ignored
            if getattr(statement.target, 'id', None) != 'Q18593264':
                continue

            # Check for existing one
            if 'P1932' in statement.qualifiers:
                for qual in statement.qualifiers['P1932']: #iterate over all
                    object_name = stringify_pywikibot_target(qual.target)

            # Output status message
            pbar.set_postfix_str(f"{item.id}: {metid}, {object_name}")

            # Without an object name there is nothing to look up, and the
            # regex lookup cannot match against None
            if object_name is None:
                continue

            # Lookup in exact/regex crosswalks for a P31 mapping
            lookup_result = cross_lookup_object_combined(cross_df, cross_regex_df, object_name)
            matched_qids = lookup_result.get('qid')
            generic_statement = statement # Remember this statement so we can remove later

        # Add new statement specific for P31, if matched
        if not matched_qids:
            continue # Skip if no match

        for qid in matched_qids:
            tqdm.write(f"{item.id}: {metid}, {object_name} -> {qid}")

            # Add new P31 specific claim
            new_statement = pywikibot.Claim(repo, 'P31')
            target = pywikibot.ItemPage(repo, qid)
            new_statement.setTarget(target)
            item.addClaim(new_statement)

            # Add qualifier with objectName string from Met
            qualifier = pywikibot.Claim(repo, 'P1932')
            qualifier.setTarget(object_name)
            new_statement.addQualifier(qualifier) # summary=u'Adding a qualifier.'

        # Remove generic statement
        if generic_statement:
            item.removeClaims(generic_statement)
            replaced_counter += 1

    # Output final report
    tqdm.write("Finished")
    tqdm.write(f"Total examined: {total_counter}")
    tqdm.write(f"Replaced: {replaced_counter}")

0it [00:00, ?it/s]

Q116297331: 242177, Statuette of a lion -> Q16738862
Q116293073: 246424, Statuette of a youth -> Q16738862
Q116287353: 251629, Statuette of a warrior, 10 -> Q16738862
Q116290591: 249180, Statuette of a woman reclining -> Q16738862
Q116287941: 257430, Statuette of a bearded man -> Q16738862
Q116290458: 241173, Statuette of a woman carrying a jar -> Q16738862
Q116390387: 967, Brandy tumbler -> Q16180001
Q116297691: 242066, Statuette of a priest (?) wearing bull mask -> Q16738862
Q116287364: 251618, Statuette of Athena, 5 -> Q16738862
Q116287202: 251656, Statuette of Poseidon, 4 -> Q16738862
Q116390392: 969, Brandy tumbler -> Q16180001
Q116403314: 251615, Statuette of Athena -> Q16738862
Q116402375: 247574, Statuette of standing woman -> Q16738862
Q116390375: 965, Brandy tumbler -> Q16180001
Q116291641: 254769, Statuette of a goddess -> Q16738862
Q116390369: 962, Brandy tumbler -> Q16180001
Q116390389: 968, Brandy tumbler -> Q16180001
Q116390371: 964, Brandy tumbler -> Q16180001
Q11639037