# In [None]:
# Bot to examine Wikimedia Commons category and add structured data (SDC) info for The Met Museum:
#   Met Object ID (P3634) -> object id
#   collection (P195) -> Q160236
#
# For well-structured uploads to Commons (mostly done in 2017 with GLAM Wiki Toolset)
#   the object id can be determined by looking at the template "source" field
#   for a URL of pattern:
#   https://www.metmuseum.org/art/collection/search/12345
#
# Most of the institutional uploads are located in:
#   https://commons.wikimedia.org/wiki/Category:Metropolitan_Museum_of_Art_by_department
#
# Author: Andrew Lih (User:Fuzheado), based on SDC routines from Botmultichill

import pywikibot
from pywikibot import pagegenerators
from pywikibot.comms import http

import json
import re
from itertools import islice

# Parameters specific to the institution
PARAMS = {
    # When True, addClaim only prints what it would post instead of editing.
    'test_mode': False,
    # Commons category whose member pages are processed.
    'category': u'Category:Japanese prints in the Metropolitan Museum of Art',
    # Passed as ``recurse`` to the category page generator.
    'recurse': False,
    # Regex applied to the page wikitext; group 1 captures the object ID.
    # Dots in the host name are escaped and the scheme is limited to
    # http/https so that malformed URLs are not accepted.
    'search_string': r'source[ ]*=[ ]*https?://www\.metmuseum\.org/art/collection/search/(\d+)',
    # Property that receives the extracted object ID (string claim).
    'pid': 'P3634',
    'pid_summary': u'importing Met Object ID from source info from Commons template',
    # Property/item pair for the collection claim (item claim).
    'institution_pid': 'P195',
    'institution_qid': 'Q160236',
    'institution_summary': u'adding collection based on Met Object ID',
    # NOTE(review): not read anywhere in the visible code - confirm whether
    # it is still needed.
    'http_retries': 3,
}

# Tallies printed in the final report; every tally starts at zero.
#   id / institution           -> claims for which addClaim returned > 0
#   id_skip / institution_skip -> claims for which addClaim returned <= 0
#   unmatched                  -> pages where the source pattern was not found
counter = dict.fromkeys(
    ('id', 'institution', 'id_skip', 'institution_skip', 'unmatched'),
    0,
)

def addClaim(mediaid, pid, instring, summary='', claimtype='qid') -> int:
    """Add one structured-data claim to a Commons file unless one already exists.

    Relies on the module-level ``site`` and ``PARAMS`` objects.

    :param mediaid: MID of Commons file (eg. M23456)
    :param pid: Wikidata property (eg. P195)
    :param instring: either QID for an object (eg. Q160236) or a text string
    :param summary: edit summary to append to automated summary
    :param claimtype: should be 'qid' or 'string'
    :return: 1 if the claim was posted (or would have been, in test mode);
        0 if the file already has a statement for ``pid`` and was skipped;
        -1 if ``claimtype`` is not recognised;
        -2 if the HTTP request failed or the API response contained an error.
    """
    pywikibot.output(u'Edit %s: %s->%s. %s' % (mediaid, pid, instring, summary))

    # Check for existing entry - if it exists at all, skip and honor existing entry
    request = site._simple_request(action='wbgetentities', ids=mediaid)
    data = request.submit()

    try:
        if data.get(u'entities').get(mediaid).get(u'statements').get(pid):
            pywikibot.output(u'  Existing entry: skipping to be safe.')
            return 0  # Skip
    except AttributeError:
        # One of the lookups did not return a dict (for instance when the
        # file has no SDC statements), so there is nothing to honor.
        pass

    # Build the JSON-encoded claim value before doing any further requests.
    if claimtype == 'qid':
        postvalue = json.dumps(
            {"entity-type": "item", "numeric-id": instring.replace(u'Q', u'')}
        )
    elif claimtype == 'string':
        # json.dumps escapes quotes and backslashes, which plain
        # concatenation with '"' would not.
        postvalue = json.dumps(instring)
    else:
        # Bad claimtype passed in, should never get here
        pywikibot.output(u'  Error: improper claimtype passed %s' % (claimtype,))
        return -1

    tokenrequest = http.fetch(
        u'https://commons.wikimedia.org/w/api.php'
        u'?action=query&meta=tokens&type=csrf&format=json'
    )
    tokendata = json.loads(tokenrequest.text)
    token = tokendata.get(u'query').get(u'tokens').get(u'csrftoken')

    postdata = {
        u'action': u'wbcreateclaim',
        u'format': u'json',
        u'entity': mediaid,
        u'property': pid,
        u'snaktype': u'value',
        u'value': postvalue,
        u'token': token,
        u'summary': summary,
    }

    if PARAMS['test_mode']:
        pywikibot.output('TEST_MODE: addClaim: %s, %s' % (claimtype, postdata))
        return 1  # Reported as a success so the counters reflect the dry run

    apipage = http.fetch(u'https://commons.wikimedia.org/w/api.php',
                         method='POST', data=postdata)
    if not apipage.ok:
        pywikibot.output('  Error http status code %s: %s %s'
                         % (apipage.status_code, claimtype, postdata))
        return -2

    # The MediaWiki API reports most failures (bad token, permission denied,
    # invalid value, ...) with HTTP 200 and an "error" object in the body,
    # so the status code alone does not prove the claim was created.
    try:
        response = json.loads(apipage.text)
    except ValueError:
        response = None  # Body was not JSON; nothing further can be checked
    if isinstance(response, dict) and 'error' in response:
        pywikibot.output('  Error from API: %s' % (response['error'],))
        return -2

    return 1  # Successful add


def media_id(page: pywikibot.page.FilePage) -> str:
    """Build the MID for a page: the letter M followed by its ``pageid``."""
    return u'M' + str(page.pageid)


# Start execution

# Set this manually if you want to override the default, e.g.:
# PARAMS['category'] = u'Category:Bathing suits in the Metropolitan Museum of Art'

# Shouldn't need to touch these
site = pywikibot.Site(u'commons', u'commons')
cat = pywikibot.Category(site, PARAMS['category'])
gen = pagegenerators.CategorizedPageGenerator(cat, recurse=PARAMS['recurse'])
patt = re.compile(PARAMS['search_string'])

# Start processing items in category
# For a limited trial run, iterate over islice(gen, 10) instead of gen.
for page in gen:

    text = page.text  # Grab commons page text
    mediaid = media_id(page)  # Determine MID of form M23456

    m = patt.search(text)  # Find URL string that contains object number
    if not m:
        counter['unmatched'] += 1
        pywikibot.output(u'Unmatched %s' % (mediaid, ))
        continue

    # Object ID claim (string value) from regex match group 1
    id_string = m.group(1)
    summary = PARAMS['pid_summary']
    result = addClaim(mediaid, PARAMS['pid'], id_string, summary, claimtype='string')

    # Any non-positive result (existing statement or error) counts as a skip
    if result > 0:
        counter['id'] += 1
    else:
        counter['id_skip'] += 1

    # No institution property configured: move on to the next page.
    # (A bare `next` here is a no-op expression, not a loop control statement.)
    if 'institution_pid' not in PARAMS:
        continue

    # Collection claim (item value)
    institution = PARAMS['institution_qid']  # Met QID
    summary = PARAMS['institution_summary']
    result = addClaim(mediaid, PARAMS['institution_pid'], institution, summary, claimtype='qid')

    if result > 0:
        counter['institution'] += 1
    else:
        counter['institution_skip'] += 1

# Output report
pywikibot.output('Final report')
pywikibot.output(PARAMS['category'])
for x in counter:
    pywikibot.output('%s: %s' % (x, counter[x]))