In [4]:
import requests, re

### Data Gathering

In [186]:
def get_revisions(Property,rvcontinue=None):
    """
    Fetch one batch (at most 500) of revisions of a Wikidata property page.

    Input:
    Property   -- property id, interpolated into the title "Property:<id>".
    rvcontinue -- continuation token from a previous call, or None to start
                  at the oldest revision (the query uses rvdir="newer").

    Output:
    revisions  -- list with one entry per page in the response; each entry is
                  that page's list of revision dicts, or an empty list when
                  the page has no 'revisions' key.
    rvcontinue -- token to pass to the next call, or None when the response
                  carries no continuation token.

    Raises:
    requests.HTTPError if the API answers with a 4xx/5xx status.
    """
    URL = "https://www.wikidata.org/w/api.php"
    # defining a params dict for the parameters to be sent to the API
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "rvlimit":"500",
              "titles":"Property:{}".format(Property),
              "rvdir":"newer"}
    # Only send a continuation token when there is one.
    if rvcontinue is not None:
        PARAMS["rvcontinue"] = rvcontinue

    # A timeout keeps a stalled connection from hanging the kernel forever.
    r = requests.get(url = URL, params = PARAMS, timeout = 30)
    # Fail loudly on HTTP errors instead of tripping over the body later.
    r.raise_for_status()

    # extracting data in json format
    data = r.json()

    # Get revisions; a page without a 'revisions' key yields an empty list
    # rather than a KeyError.
    pages = data['query']['pages']
    revisions = [page.get('revisions', []) for page in pages.values()]
    rvcontinue = data.get('continue', {}).get('rvcontinue')

    return revisions, rvcontinue

In [6]:
def get_all_revisions(Property,rvcontinue=None):
    """
    Collect every revision of a property by following continuation tokens.

    Input:
    Property   -- property id forwarded to get_revisions.
    rvcontinue -- optional token to resume from; None starts at the beginning.

    Output:
    Flat list of revision dicts, taken from the first page entry of each
    batch that get_revisions returns.
    """
    collected = []
    token = rvcontinue
    finished = False

    while not finished:
        batch, token = get_revisions(Property, token)
        # Only the first page entry of the batch is used.
        collected.extend(batch[0])
        # A None token means the last batch has been read.
        finished = token is None

    return collected

In [201]:
# Download the complete revision history of property P856 (network call).
all_revisions = get_all_revisions(Property='P856')

### Data parsing


|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

**Table:** Edit-summary tags for changes to the labels, descriptions and aliases of Wikidata properties.

In [192]:
# Regular expressions used to take an edit summary apart. They are raw
# strings so that "\/", "\*", "\|" and "\]" reach the regex engine unchanged
# instead of being flagged as invalid string escapes.

# Everything between the first "/*" and the last "*/" (greedy).
tag_pattern = r"(?<=\/\*)(.*)(?=\*\/)"
# The run of non-"]" characters that directly follows "*/".
description_pattern = r"(?<=\*\/)[^\]]+"
# The run of non-"]" characters that directly follows "|".
language_pattern = r"(?<=\|)[^\]]+"

In [193]:
# Attribute name -> prefix of the edit-summary tag that touches it.
attributes_dict = dict(
    label="wbsetlabel",
    description="wbsetdescription",
    alias="wbsetaliases",
)

# Tag suffix -> normalised action name (insertion order is kept as-is).
actions_dict = dict([
    ("-add", "add"),
    ("-set", "update"),
    ("-add-remove", "update"),
    ("-remove", "remove"),
])

In [202]:
def _parse_edit_summary(comment):
    """Split an edit summary into attribute, action, description and language.

    Returns a dict with those four keys, or None when the summary has no
    /* ... */ tag or the tag is not a combination of a prefix from
    attributes_dict and a suffix from actions_dict.
    """
    tags = re.findall(tag_pattern, comment)
    if not tags:
        return None
    tag = tags[0]

    # The message key is whatever precedes the first ':' or '|' in the tag
    # (assumed tag shape: " wbsetlabel-add:1|en " -- TODO confirm).
    message_key = re.split(r"[:|]", tag.strip(), maxsplit=1)[0].strip()

    for attribute, prefix in attributes_dict.items():
        suffix = message_key[len(prefix):]
        # Compare the suffix exactly: a substring test would let "-add" win
        # for "-add-remove" and report alias updates as additions.
        if message_key.startswith(prefix) and suffix in actions_dict:
            action = actions_dict[suffix]
            break
    else:
        return None

    descriptions = re.findall(description_pattern, comment)
    languages = re.findall(language_pattern, tag)
    return {
        'attribute': attribute,
        'action': action,
        # Fall back to '' instead of raising when a part is absent.
        'description': descriptions[0].strip() if descriptions else '',
        # Strip so the language code carries no padding from the tag.
        'language': languages[0].strip() if languages else '',
    }


# Keep only label/description/alias edits and replace each raw 'comment' with
# the parsed fields. Results are gathered in a new list, which avoids the
# quadratic cost of calling list.remove() inside the loop.
parsed_revisions = []
for revision in all_revisions:
    if 'comment' not in revision:
        # Nothing to parse. Rows already processed by an earlier run of this
        # cell (they carry 'attribute') are kept, so re-running is harmless;
        # any other row without a comment is dropped.
        if 'attribute' in revision:
            parsed_revisions.append(revision)
        continue

    fields = _parse_edit_summary(revision['comment'])
    if fields is None:
        continue

    revision.update(fields)
    revision.pop('comment', None)
    revision.pop('anon', None)
    parsed_revisions.append(revision)

# Update the existing list object in place, as the original loop did.
all_revisions[:] = parsed_revisions

In [203]:
import csv

# Header = every key that occurs in any row, in first-seen order. Taking the
# keys of the first row only would make DictWriter raise ValueError for a
# later row with an extra key, and IndexError when there are no rows at all.
keys = list(dict.fromkeys(key for revision in all_revisions for key in revision))

# newline='' leaves line endings to the csv module (no blank rows on
# Windows); utf-8 is explicit because the labels are multilingual.
with open('output.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_revisions)

In [176]:
# Number of revision records currently held in all_revisions.
len(all_revisions)

15

In [184]:
# Same length check, recorded from a later execution (In [184]).
len(all_revisions)

15

In [195]:
# Display the revision records; the recorded output shows the parsed
# attribute / action / description / language fields.
all_revisions

[{'revid': 69576084,
  'parentid': 69573941,
  'user': '*Youngjin',
  'timestamp': '2013-09-10T22:59:51Z',
  'attribute': 'label',
  'action': 'add',
  'description': '공식 홈페이지',
  'language': 'ko'},
 {'revid': 69580593,
  'parentid': 69576084,
  'user': 'Laddo',
  'timestamp': '2013-09-10T23:37:22Z',
  'attribute': 'label',
  'action': 'add',
  'description': 'site internet officiel',
  'language': 'fr'},
 {'revid': 69590945,
  'parentid': 69580692,
  'user': 'Ricordisamoa',
  'timestamp': '2013-09-11T01:09:22Z',
  'attribute': 'label',
  'action': 'add',
  'description': 'sito ufficiale',
  'language': 'it'},
 {'revid': 69592864,
  'parentid': 69590945,
  'user': 'SPQRobin',
  'timestamp': '2013-09-11T01:28:08Z',
  'attribute': 'description',
  'action': 'add',
  'description': 'URL to the website of this item',
  'language': 'en'},
 {'revid': 69592878,
  'parentid': 69592864,
  'user': 'SPQRobin',
  'timestamp': '2013-09-11T01:28:13Z',
  'attribute': 'label',
  'action': 'add',
  'de