In [2]:
import requests, re, random
from SPARQLWrapper import SPARQLWrapper, JSON

### Data Gathering

In [3]:
def get_revisions(Property,rvcontinue=None):
    """
    Input:
    Property,rvcontinue
    
    Output:
    revisions, rvcontinue
    """
    URL = "https://www.wikidata.org/w/api.php"
    # defining a params dict for the parameters to be sent to the API
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "rvlimit":"500",
              "titles":"Property:{}".format(Property),
              "rvdir":"newer",
              "rvcontinue": rvcontinue}
    
    # sending get request and saving the response as response object
    r = requests.get(url = URL,params = PARAMS)

    # extracting data in json format
    data = r.json()
    
    # Get revisions
    pages = data['query']['pages']
    keys = pages.keys()
    revisions = [pages[k]['revisions'] for k in keys]
    rvcontinue = data['continue']['rvcontinue'] if 'continue' in data else None
    
    return revisions, rvcontinue

In [4]:
def get_all_revisions(Property,rvcontinue=None):
    
    all_revisions = []
        
    while True:
        revisions, rvcontinue = get_revisions(Property,rvcontinue)
        all_revisions += revisions[0]
        
        if rvcontinue is None:
            break

    return all_revisions

In [5]:
all_revisions = get_all_revisions('P856')

### Data parsing


|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

wbsetlabeldescriptionaliases
wbeditentity-create

In [6]:
tag_pattern = "(?<=\/\*)(.*)(?=\*\/)"
description_pattern = "(?<=\*\/)[^\]]+"
language_pattern = "(?<=\|)[^\]]+"

In [7]:
attributes_dict = {"wbsetlabel-":["label"],
                   "wbeditentity-create":["label"],
                   "wbsetdescription-":["description"],
                   "wbsetaliases-":["alias"],
                   "wbsetlabeldescriptionaliases":["label","description","alias"]}

actions_dict = {"-add":"add",
                "-update":"update",
                "-set":"update",
                "-add-remove":"update",
                "-remove":"remove"}

In [8]:
def parse_revisions(all_revisions):
    for revision in all_revisions.copy():
        comment = revision['comment']
        tag = re.findall(tag_pattern, comment)
        if(not tag):
            all_revisions.remove(revision)
            continue

        tag =tag[0]
        attributes = [v for k,v in attributes_dict.items() if k in tag]

        # If there is no attribute, we delete the element from the list
        if(not attributes):
            all_revisions.remove(revision)
            continue
        attributes = attributes[0]

        action = [v for k,v in actions_dict.items() if k in tag]
        if not action:
            if 'wbeditentity-create' in tag:
                action = ['add']
            elif 'wbsetlabeldescriptionaliases' in tag:
                action = ['uptdate']
        #description =re.findall(description_pattern, comment)[0].strip()
        language = re.findall(language_pattern,tag)[0].strip()

        revision['label'] = language if 'label' in attributes else None
        revision['description'] = language if 'description' in attributes else None
        revision['alias'] = language if 'alias' in attributes else None

        revision['action'] = action[0]
        #revision['description'] = description
        revision['language'] = language

        revision.pop('comment', None)
        revision.pop('parentid',None)
        revision.pop('user',None)
        if('anon' in revision):
            revision.pop('anon',None)
    
    return all_revisions

### Gather data from several properties

In [11]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT DISTINCT ?property
    WHERE
    {
      ?property rdf:type wikibase:Property.
    }
    ORDER by ?property""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

property_name_list = []
for result in results["results"]["bindings"]:
    property_name_list.append(re.findall('([^\/]*)$',result['property']['value'])[0])

In [12]:
random.seed(1000)
property_number = 500
sample_property_name_list = random.choices(property_name_list,k=property_number - 1)
sample_property_name_list += ['P856']

In [13]:
sample_property_name_list = list(set(sample_property_name_list))

In [15]:
output = []
for property_name in sample_property_name_list:
    print("Get {} revisions".format(property_name))
    all_revisions = get_all_revisions(property_name)
    all_revisions = parse_revisions(all_revisions)
    for r in all_revisions:
        r['property'] = property_name
    output += all_revisions

Get P127 revisions
Get P2827 revisions
Get P506 revisions
Get P542 revisions
Get P4993 revisions
Get P3975 revisions
Get P667 revisions
Get P962 revisions
Get P2917 revisions
Get P1799 revisions
Get P791 revisions
Get P2144 revisions
Get P5222 revisions
Get P3159 revisions
Get P4256 revisions
Get P2154 revisions
Get P4280 revisions
Get P4965 revisions
Get P4216 revisions
Get P2522 revisions
Get P1979 revisions
Get P4859 revisions
Get P4997 revisions
Get P4390 revisions
Get P3569 revisions
Get P3559 revisions
Get P3524 revisions
Get P5616 revisions
Get P2957 revisions
Get P180 revisions
Get P2201 revisions
Get P5226 revisions
Get P3434 revisions
Get P1735 revisions
Get P4729 revisions
Get P3113 revisions
Get P4516 revisions
Get P2212 revisions
Get P4873 revisions
Get P4887 revisions
Get P4036 revisions
Get P599 revisions
Get P2954 revisions
Get P4695 revisions
Get P4164 revisions
Get P5359 revisions
Get P2361 revisions
Get P2635 revisions
Get P610 revisions
Get P1056 revisions
Get P852 

### Save data into a CSV

In [16]:
import csv

keys = all_revisions[0].keys()

with open('500_revisions.csv', 'w+') as output_file: # wb if python <3.x 
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)