# In [None]:  (Jupyter notebook cell marker)
# Bot to examine Wikimedia Commons category and add structured data (SDC) info for The Met Museum:
#   Met Object ID (P3634) -> object id
#   collection (P195) -> Q160236
#
# For well-structured uploads to Commons (mostly done in 2017 with GLAM Wiki Toolset)
#   the object id can be determined by looking at the template "source" field
#   for a URL of pattern:
#   https://www.metmuseum.org/art/collection/search/12345
#
# Most of the institutional uploads are located in:
#   https://commons.wikimedia.org/wiki/Category:Metropolitan_Museum_of_Art_by_department
#
# Author: Andrew Lih (User:Fuzheado), based on SDC routines from BotMultichill

import pywikibot
from pywikibot import pagegenerators
from pywikibot.comms import http

import json
import re
from itertools import islice

# Parameters specific to the institution
PARAMS = {
    # When True, claims are checked and logged but nothing is posted to Commons
    'test_mode': False,
    # Commons category whose member pages are examined
    'category': u'Category:Chinese furniture in the Metropolitan Museum of Art',
    # Passed as the ``recurse`` argument of the category page generator
    'recurse': True,
    # Regex run against each page's wikitext; group 1 captures the object id.
    # Dots in the host name are escaped so they only match a literal '.',
    # and the scheme is limited to http:// or https://.
    'search_string': r'source[ ]*=[ ]*https?://www\.metmuseum\.org/art/collection/search/(\d+)',
    # Property that receives the captured object id (string value)
    'pid': 'P3634',
    'pid_summary': u'importing Met Object ID from source info from Commons template',
    # Property/item pair added as the collection claim (item value)
    'institution_pid': 'P195',
    'institution_qid': 'Q160236',
    'institution_summary': u'adding collection based on Met Object ID',
}

# Tallies printed in the final report; every tally starts at zero
counter = dict.fromkeys(
    (
        'id',                # object-id claims added
        'institution',       # collection claims added
        'id_skip',           # object-id claims skipped
        'institution_skip',  # collection claims skipped
        'unmatched',         # pages whose text did not match the search pattern
    ),
    0,
)

def addClaim(mediaid, pid, qid, summary='') -> int:
    """addClaim - add a Wikibase claim for a property/pid that is a Q object

    The claim is only created when the file has no statement at all for
    ``pid``; any existing statement is left untouched.

    :param mediaid: MID of Commons file
    :param pid: Wikidata property (eg. P195)
    :param qid: QID for the object (eg. Q160236)
    :param summary: edit summary to append to automated summary
    :return: 1 if the claim was added (or would have been, in test mode),
        0 if it was skipped or the API reported a failure
    :raises ValueError: if ``qid`` is not a ``Q`` followed by digits
    """
    pywikibot.output(u'Adding %s->%s to %s. %s' % (pid, qid, mediaid, summary))

    # Check for existing entry - if it exists at all, skip and honor existing entry
    request = site._simple_request(action='wbgetentities', ids=mediaid)
    data = request.submit()
    entity = (data.get(u'entities') or {}).get(mediaid) or {}
    statements = entity.get(u'statements')
    # The entity may come back without a 'statements' mapping (for example
    # when the file has no structured data yet); only a dict can hold an
    # existing statement, anything else means there is nothing to honor.
    if isinstance(statements, dict) and statements.get(pid):
        pywikibot.output(u'  Existing entry: skipping to be safe.')
        return 0  # Skip

    # Item values are sent as a JSON object with an integer numeric id
    postvalue = {"entity-type": "item", "numeric-id": int(qid.replace(u'Q', u''))}

    if PARAMS['test_mode']:
        # Dry run: report success without fetching a token or writing anything
        return 1

    tokenrequest = http.fetch(u'https://commons.wikimedia.org/w/api.php?action=query&meta=tokens&type=csrf&format=json')
    tokendata = json.loads(tokenrequest.text)
    token = tokendata.get(u'query').get(u'tokens').get(u'csrftoken')

    postdata = {u'action': u'wbcreateclaim',
                u'format': u'json',
                u'entity': mediaid,
                u'property': pid,
                u'snaktype': u'value',
                u'value': json.dumps(postvalue),
                u'token': token,
                u'summary': summary
                }

    response = http.fetch(u'https://commons.wikimedia.org/w/api.php', method='POST', data=postdata)

    # Do not count a rejected edit as a successful add
    try:
        result = json.loads(response.text)
    except ValueError:
        result = None
    if not isinstance(result, dict) or u'error' in result:
        pywikibot.output(u'  Edit failed: %s' % (result,))
        return 0

    return 1  # Successful add

def addClaimString(mediaid, pid, instring, summary='') -> int:
    """addClaimString - add a Wikibase claim for a property/pid that is a string object

    The claim is only created when the file has no statement at all for
    ``pid``; any existing statement is left untouched.

    :param mediaid: MID of Commons file
    :param pid: Wikidata property (eg. P3634)
    :param instring: string for above property
    :param summary: edit summary to append to automated summary
    :return: 1 if the claim was added (or would have been, in test mode),
        0 if it was skipped or the API reported a failure
    """
    pywikibot.output(u'Adding %s->%s to %s. %s' % (pid, instring, mediaid, summary))

    # Check for existing entry - if it exists at all, skip and honor existing entry
    request = site._simple_request(action='wbgetentities', ids=mediaid)
    data = request.submit()
    entity = (data.get(u'entities') or {}).get(mediaid) or {}
    statements = entity.get(u'statements')
    # The entity may come back without a 'statements' mapping (for example
    # when the file has no structured data yet); only a dict can hold an
    # existing statement, anything else means there is nothing to honor.
    if isinstance(statements, dict) and statements.get(pid):
        pywikibot.output(u'  Existing entry: skipping to be safe.')
        return 0

    if PARAMS['test_mode']:
        # Dry run: report success without fetching a token or writing anything
        return 1

    tokenrequest = http.fetch(u'https://commons.wikimedia.org/w/api.php?action=query&meta=tokens&type=csrf&format=json')
    tokendata = json.loads(tokenrequest.text)
    token = tokendata.get(u'query').get(u'tokens').get(u'csrftoken')

    postdata = {u'action': u'wbcreateclaim',
                u'format': u'json',
                u'entity': mediaid,
                u'property': pid,
                u'snaktype': u'value',
                # json.dumps yields a valid JSON string literal even when
                # the value contains quotes or backslashes
                u'value': json.dumps(instring),
                u'token': token,
                u'summary': summary
                }

    response = http.fetch(u'https://commons.wikimedia.org/w/api.php', method='POST', data=postdata)

    # Do not count a rejected edit as a successful add
    try:
        result = json.loads(response.text)
    except ValueError:
        result = None
    if not isinstance(result, dict) or u'error' in result:
        pywikibot.output(u'  Edit failed: %s' % (result,))
        return 0

    return 1


def media_id(page: pywikibot.page.FilePage) -> str:
    """Build the MID for a page: the letter M followed by its page id."""
    page_number = str(page.pageid)
    return u'M' + page_number


# Start execution

# Set this manually if you want to override the default.
# NOTE: no trailing comma here - a trailing comma would store a 1-tuple
# instead of the category title string.
PARAMS['category'] = u'Category:Bathing suits in the Metropolitan Museum of Art'

# Shouldn't need to touch these
site = pywikibot.Site(u'commons', u'commons')
cat = pywikibot.Category(site, PARAMS['category'])
gen = pagegenerators.CategorizedPageGenerator(cat, recurse=PARAMS['recurse'])
patt = re.compile(PARAMS['search_string'])

# Cap on the number of pages handled per run (10 unless configured).
# Set PARAMS['max_pages'] = None to walk the whole generator.
max_pages = PARAMS.get('max_pages', 10)

for page in islice(gen, max_pages):
    text = page.text
    mediaid = media_id(page)

    # Look for the institution's object id in the page wikitext
    m = patt.search(text)
    if m:
        id_string = m.group(1)
        summary = PARAMS['pid_summary']
        result = addClaimString(mediaid, PARAMS['pid'], id_string, summary)
        if result > 0:
            counter['id'] += 1
        else:
            counter['id_skip'] += 1

        # The collection claim is optional: only attempted when configured
        if 'institution_pid' in PARAMS:
            institution = PARAMS['institution_qid']  # Met
            summary = PARAMS['institution_summary']
            result = addClaim(mediaid, PARAMS['institution_pid'], institution, summary)
            if result > 0:
                counter['institution'] += 1
            else:
                counter['institution_skip'] += 1
    else:
        counter['unmatched'] += 1
        pywikibot.output(u'Unmatched %s' % (mediaid, ))

    # Blank line between pages in the console output
    print()

print('Final report')
for name, count in counter.items():
    print('%s: %s' % (name, count))