In [388]:
import requests, re, random
from SPARQLWrapper import SPARQLWrapper, JSON

### Data Gathering

In [186]:
def get_revisions(Property,rvcontinue=None):
    """
    Fetch one batch (at most 500) of revisions of a Wikidata property page.

    Input:
    Property   -- property id without the namespace prefix, e.g. 'P856'
    rvcontinue -- continuation token returned by a previous call, or None
                  to start from the oldest revision

    Output:
    revisions  -- list with one entry per page of the response; each entry is
                  the list of revision dicts of that page (an empty list when
                  the page carries no 'revisions', e.g. a missing property)
    rvcontinue -- token to pass to the next call, or None when the response
                  holds no 'continue' section

    Raises:
    requests.HTTPError when the API answers with an HTTP error status.
    KeyError when the JSON body has no 'query'/'pages' section.
    """
    URL = "https://www.wikidata.org/w/api.php"
    # defining a params dict for the parameters to be sent to the API
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "rvlimit":"500",
              "titles":"Property:{}".format(Property),
              "rvdir":"newer"}
    # Only send the continuation token when there is one.
    if rvcontinue is not None:
        PARAMS["rvcontinue"] = rvcontinue

    # requests has no default timeout, so a stalled connection would hang
    # the notebook forever without one.
    r = requests.get(url = URL,params = PARAMS,timeout = 30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()

    # extracting data in json format
    data = r.json()

    # Get revisions; a page without a 'revisions' key yields an empty list
    # so callers can still index the per-page result.
    pages = data['query']['pages']
    revisions = [page.get('revisions', []) for page in pages.values()]
    rvcontinue = data['continue']['rvcontinue'] if 'continue' in data else None

    return revisions, rvcontinue

In [6]:
def get_all_revisions(Property,rvcontinue=None):
    """
    Collect every revision of a property by following the continuation
    token returned by get_revisions until it comes back as None.

    Input:
    Property   -- property id forwarded to get_revisions
    rvcontinue -- optional token to resume from; None starts at the beginning

    Output:
    flat list of the revision dicts of the first page of each batch
    """
    collected = []
    token = rvcontinue
    finished = False

    while not finished:
        batch, token = get_revisions(Property, token)
        # Only one title is requested, so the first page holds the revisions.
        collected.extend(batch[0])
        finished = token is None

    return collected

In [324]:
# Fetch every revision of property P856 (all continuation batches).
all_revisions = get_all_revisions('P856')

### Data parsing


|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

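Two further tags carry no `-add` / `-set` / `-remove` suffix and are handled separately:

- `wbsetlabeldescriptionaliases` — treated as an update of the label, description and alias
- `wbeditentity-create` — treated as adding a label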

In [192]:
# Raw strings: the backslashes belong to the regex, not to Python string
# escapes (non-raw "\/" and "\*" are invalid escape sequences and warn).

# Everything between the "/*" and "*/" markers of an edit summary.
tag_pattern = r"(?<=\/\*)(.*)(?=\*\/)"
# Text following the closing "*/" marker, up to the first "]".
description_pattern = r"(?<=\*\/)[^\]]+"
# Text following a "|", up to the first "]" (used on the tag to get the language).
language_pattern = r"(?<=\|)[^\]]+"

In [293]:
# Maps a substring of the edit-summary tag to the attribute(s) it touches.
attributes_dict = {"wbsetlabel-":["label"],
                   "wbeditentity-create":["label"],
                   "wbsetdescription-":["description"],
                   "wbsetaliases-":["alias"],
                   "wbsetlabeldescriptionaliases":["label","description","alias"]}

# Maps a tag suffix to the normalised action name.
# Order matters for consumers that take the first substring match:
# "-add-remove" must come before "-add" (and "-remove"), otherwise a
# "wbsetaliases-add-remove" tag is reported as "add" instead of "update".
actions_dict = {"-add-remove":"update",
                "-add":"add",
                "-update":"update",
                "-set":"update",
                "-remove":"remove"}

In [398]:
def parse_revisions(all_revisions):
    """
    Keep only label/description/alias revisions and normalise their fields.

    The tag between "/*" and "*/" in each revision's comment is matched
    against attributes_dict and actions_dict. Revisions without a comment,
    without a tag, without a known attribute, or without a recognisable
    action are dropped.

    Each kept revision dict is modified in place:
      - 'label', 'description', 'alias' are set to the language code when the
        attribute is affected, otherwise None
      - 'action' is set to the normalised action ('add', 'update', 'remove')
      - 'language' is set to the text after '|' in the tag (None if absent)
      - 'comment', 'parentid', 'user' and 'anon' are removed

    Input:
    all_revisions -- list of revision dicts; the list itself is filtered
                     in place

    Output:
    the same list object, now holding only the kept revisions
    """
    kept = []
    for revision in all_revisions:
        # 'comment' can be absent (e.g. a hidden edit summary); treat it as
        # an untagged revision instead of raising KeyError.
        comment = revision.get('comment') or ''
        tag = re.findall(tag_pattern, comment)
        if not tag:
            continue
        tag = tag[0]

        attributes = [v for k, v in attributes_dict.items() if k in tag]
        # If there is no attribute, the revision is dropped
        if not attributes:
            continue
        attributes = attributes[0]

        # Several suffixes can match as substrings ("-add" and "-remove" are
        # both inside "-add-remove"); the longest one is the real action.
        matching = [k for k in actions_dict if k in tag]
        if matching:
            action = actions_dict[max(matching, key=len)]
        elif 'wbeditentity-create' in tag:
            action = 'add'
        elif 'wbsetlabeldescriptionaliases' in tag:
            action = 'update'
        else:
            # Known attribute but no recognisable action: nothing to report.
            continue

        languages = re.findall(language_pattern, tag)
        language = languages[0].strip() if languages else None

        revision['label'] = language if 'label' in attributes else None
        revision['description'] = language if 'description' in attributes else None
        revision['alias'] = language if 'alias' in attributes else None

        revision['action'] = action
        revision['language'] = language

        for field in ('comment', 'parentid', 'user', 'anon'):
            revision.pop(field, None)

        kept.append(revision)

    # Preserve the in-place contract without the quadratic list.remove calls.
    all_revisions[:] = kept
    return all_revisions

### Gather data from several properties

In [340]:
# Ask the Wikidata query service for every entity typed wikibase:Property.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT DISTINCT ?property
    WHERE
    {
      ?property rdf:type wikibase:Property.
    }
    ORDER by ?property""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Each binding value is a URI; keep only what follows its last "/".
# (Plain string split instead of a regex whose "\/" was an invalid escape
# in a non-raw string literal.)
property_name_list = [
    binding['property']['value'].rsplit('/', 1)[-1]
    for binding in results["results"]["bindings"]
]

In [401]:
# Fixed seed so the same properties are drawn for the same input list.
random.seed(1000)
property_number = 50
# Draw from the list built by the SPARQL cell (`property_name_list`); the
# previous name `property_list` is not defined anywhere in the notebook.
# random.choices samples with replacement, so duplicates are possible.
sample_property_name_list = random.choices(property_name_list,k=property_number - 1)
# P856 is always part of the sample.
sample_property_name_list += ['P856']

In [402]:
# Drop duplicates while keeping the drawn order: dict keys preserve insertion
# order, whereas set() iteration order for strings can differ between
# interpreter runs and would undo the reproducibility of the seeded sample.
sample_property_name_list = list(dict.fromkeys(sample_property_name_list))

In [405]:
# Accumulate the parsed revisions of every sampled property in one flat list.
output = []
for property_name in sample_property_name_list:
    print("Get {} revisions".format(property_name))
    # Download the full history, then reduce it to label/description/alias edits.
    all_revisions = parse_revisions(get_all_revisions(property_name))
    # Tag each row with the property it belongs to.
    for r in all_revisions:
        r.update(property=property_name)
    output.extend(all_revisions)

Get P4212 revisions
Get P3862 revisions
Get P1055 revisions
Get P1363 revisions
Get P4401 revisions
Get P2525 revisions
Get P3573 revisions
Get P2914 revisions
Get P1453 revisions
Get P4989 revisions
Get P2531 revisions
Get P210 revisions
Get P3471 revisions
Get P2540 revisions
Get P2198 revisions
Get P5524 revisions
Get P467 revisions
Get P5254 revisions
Get P1895 revisions
Get P5429 revisions
Get P860 revisions
Get P4457 revisions
Get P3421 revisions
Get P887 revisions
Get P2655 revisions
Get P675 revisions
Get P3184 revisions
Get P2682 revisions
Get P5147 revisions
Get P3796 revisions
Get P4976 revisions
Get P1798 revisions
Get P2854 revisions
Get P3566 revisions
Get P836 revisions
Get P856 revisions
Get P1999 revisions
Get P4439 revisions
Get P249 revisions
Get P3469 revisions
Get P2046 revisions
Get P5353 revisions
Get P1614 revisions
Get P2178 revisions
Get P1399 revisions
Get P4464 revisions
Get P3438 revisions
Get P4596 revisions
Get P3420 revisions
Get P5559 revisions


### Save data into a CSV

In [406]:
import csv

# Build the header from the accumulated `output` rows rather than from the
# last property's revisions (which may be empty). The union of keys, in
# first-seen order, keeps DictWriter from raising on a row with an extra field;
# rows lacking a field get an empty cell.
keys = list(dict.fromkeys(key for row in output for key in row))

# newline='' lets the csv module control line endings (no blank rows on
# Windows); an explicit encoding keeps the file independent of the locale.
with open('output.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)