In [1]:
import requests, re, random
from SPARQLWrapper import SPARQLWrapper, JSON

### Data Gathering

In [2]:
def get_revisions(Property,rvcontinue=None):
    """
    Fetch one batch of revisions of a Wikidata property page.

    Input:
    Property   -- property identifier (e.g. 'P54'); queried as the page
                  title 'Property:<Property>'
    rvcontinue -- continuation token returned by a previous call, or None
                  for the first batch

    Output:
    revisions  -- list with one entry per page of the response; each entry
                  is that page's list of revision dicts (an empty list when
                  the page carries no 'revisions' key)
    rvcontinue -- token to pass to the next call, or None when the response
                  holds no continuation

    Raises:
    requests.HTTPError if the API answers with an HTTP error status.
    """
    URL = "https://www.wikidata.org/w/api.php"
    # Parameters sent to the API. requests leaves out entries whose value
    # is None, so the first call carries no rvcontinue at all.
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "rvlimit":"500",
              "titles":"Property:{}".format(Property),
              "rvdir":"newer",
              "rvcontinue": rvcontinue}

    r = requests.get(url = URL,params = PARAMS)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page.
    r.raise_for_status()

    data = r.json()

    # A page without revisions (for instance one the API reports as
    # missing) has no 'revisions' key; report it as an empty history.
    pages = data['query']['pages']
    revisions = [page.get('revisions', []) for page in pages.values()]

    # The 'continue' object is only present while more batches remain.
    rvcontinue = data.get('continue', {}).get('rvcontinue')

    return revisions, rvcontinue

In [3]:
def get_all_revisions(Property,rvcontinue=None):
    """
    Collect every revision of a property by calling get_revisions until it
    stops returning a continuation token.

    Input:
    Property   -- property identifier forwarded to get_revisions
    rvcontinue -- optional continuation token to start from

    Output:
    flat list made of the first page's revisions of every batch
    """
    collected = []
    token = rvcontinue
    finished = False

    while not finished:
        batch, token = get_revisions(Property, token)
        # Only the first page of each batch is kept.
        collected.extend(batch[0])
        finished = token is None

    return collected

In [12]:
# Fetch the complete revision history of a single property, P54.
# The name `all_revisions` is reassigned later by the multi-property loop.
all_revisions = get_all_revisions('P54')

### Data parsing


|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

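Two further tags are handled separately: `wbsetlabeldescriptionaliases` (label, description and alias edited in one revision) and `wbeditentity-create` (creation of the entity).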

In [5]:
# Regular expressions applied to a revision comment.
# Raw strings keep every backslash literal; the previous plain strings relied
# on Python passing unknown escapes such as "\/" through unchanged, which
# emits an invalid-escape warning. The pattern values are unchanged.
tag_pattern = r"(?<=\/\*)(.*)(?=\*\/)"        # text between "/*" and "*/"
description_pattern = r"(?<=\*\/)[^\]]+"      # text following "*/"
language_pattern = r"(?<=\|)[^\]]+"           # text following "|"

In [6]:
# Maps a fragment of the edit-summary tag to the term attribute(s) the
# revision touched. Entries are listed in lookup order.
attributes_dict = dict([
    ("wbsetlabel-", ["label"]),
    ("wbeditentity-create", ["label"]),
    ("wbsetdescription-", ["description"]),
    ("wbsetaliases-", ["alias"]),
    ("wbsetlabeldescriptionaliases", ["label", "description", "alias"]),
])

# Maps the action suffix of an edit-summary tag to a normalised action name.
# Order matters: parse_revisions keeps the first entry whose key occurs in the
# tag, and "-add-remove" contains both "-add" and "-remove". It therefore has
# to come first, otherwise alias updates are reported as "add".
actions_dict = {"-add-remove":"update",
                "-add":"add",
                "-update":"update",
                "-set":"update",
                "-remove":"remove"}

In [22]:
def _resolve_action(tag):
    """Return the normalised action for an edit-summary tag, or None if unknown."""
    suffixes = [suffix for suffix in actions_dict if suffix in tag]
    if suffixes:
        # "-add-remove" also contains "-add" and "-remove"; the longest
        # matching suffix is the most specific one.
        return actions_dict[max(suffixes, key=len)]
    # Tags without an action suffix.
    if 'wbeditentity-create' in tag:
        return 'add'
    if 'wbsetlabeldescriptionaliases' in tag:
        return 'update'
    return None


def parse_revisions(all_revisions):
    """
    Keep only the revisions that edit a label, description or alias and
    annotate them with the attribute, action and language found in the
    edit-summary tag (the text between "/*" and "*/" of the comment).

    Input:
    all_revisions -- list of revision dicts; filtered and annotated in place

    Output:
    the same list object, holding only the retained revisions. Each retained
    dict gains the keys 'label', 'description', 'alias' (the language when
    that attribute was edited, else None), 'action' and 'language', and
    loses 'comment', 'parentid', 'user' and 'anon'.

    A revision is dropped when it has no comment, no tag, a tag that matches
    no entry of attributes_dict, or a tag whose action cannot be determined.
    'language' is None when the tag carries no "|<language>" part.
    """
    kept = []

    for revision in all_revisions:
        tags = re.findall(tag_pattern, revision.get('comment') or '')
        if not tags:
            continue
        tag = tags[0]

        # First attributes_dict entry whose key occurs in the tag.
        attributes = next((v for k, v in attributes_dict.items() if k in tag), None)
        if attributes is None:
            continue

        action = _resolve_action(tag)
        if action is None:
            continue

        languages = re.findall(language_pattern, tag)
        language = languages[0].strip() if languages else None

        revision['label'] = language if 'label' in attributes else None
        revision['description'] = language if 'description' in attributes else None
        revision['alias'] = language if 'alias' in attributes else None
        revision['action'] = action
        revision['language'] = language

        for key in ('comment', 'parentid', 'user', 'anon'):
            revision.pop(key, None)

        kept.append(revision)

    # Slice assignment keeps the in-place contract of the previous version
    # without the quadratic cost of repeated list.remove calls.
    all_revisions[:] = kept
    return all_revisions

### Gather data from several properties

In [8]:
# Query the Wikidata SPARQL endpoint for every entity typed as a property.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT DISTINCT ?property
    WHERE
    {
      ?property rdf:type wikibase:Property.
    }
    ORDER by ?property""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Keep only the text after the last "/" of each returned value
# (presumably an entity URI ending in the property id -- verify).
# rsplit replaces the former regex, whose non-raw "\/" was an invalid escape.
property_name_list = [
    binding['property']['value'].rsplit('/', 1)[-1]
    for binding in results["results"]["bindings"]
]

In [9]:
random.seed(1000)
property_number = 500

# P856 is always added to the sample, so it is excluded from the draw to
# avoid picking it twice.
candidates = [name for name in property_name_list if name != 'P856']

# random.sample draws without replacement; random.choices (used before)
# draws with replacement and therefore returned duplicate properties,
# leaving fewer than property_number distinct names.
sample_property_name_list = random.sample(
    candidates, k=min(property_number - 1, len(candidates)))
sample_property_name_list += ['P856']

In [13]:
# Drop duplicates while keeping first-seen order. Going through set() gave an
# order that changes between interpreter runs (string hashing is randomised),
# which defeats the fixed random seed.
sample_property_name_list = list(dict.fromkeys(sample_property_name_list))

In [66]:
# Parsed revisions of every processed property, each tagged with its property.
output = []

# NOTE(review): the 3659 offset skips the first 3659 names -- presumably to
# resume an interrupted run; confirm before a fresh execution.
for property_name in property_name_list[3659:]:  # or sample_property_name_list
    all_revisions = parse_revisions(get_all_revisions(property_name))
    for revision in all_revisions:
        revision['property'] = property_name
    output.extend(all_revisions)

Get P4655 revisions
Get P4656 revisions
Get P4657 revisions
Get P4658 revisions
Get P4659 revisions
Get P466 revisions
Get P4660 revisions
Get P4661 revisions
Get P4662 revisions
Get P4663 revisions
Get P4664 revisions
Get P4665 revisions
Get P4666 revisions
Get P4667 revisions
Get P4668 revisions
Get P4669 revisions
Get P467 revisions
Get P4670 revisions
Get P4671 revisions
Get P4672 revisions
Get P4673 revisions
Get P4674 revisions
Get P4675 revisions
Get P4676 revisions
Get P4677 revisions
Get P4678 revisions
Get P4679 revisions
Get P468 revisions
Get P4680 revisions
Get P4681 revisions
Get P4682 revisions
Get P4683 revisions
Get P4684 revisions
Get P4685 revisions
Get P4686 revisions
Get P4687 revisions
Get P4688 revisions
Get P4689 revisions
Get P469 revisions
Get P4690 revisions
Get P4691 revisions
Get P4692 revisions
Get P4693 revisions
Get P4694 revisions
Get P4695 revisions
Get P4696 revisions
Get P4697 revisions
Get P4698 revisions
Get P4699 revisions
Get P470 revisions
Get P

Get P5043 revisions
Get P5044 revisions
Get P5045 revisions
Get P5046 revisions
Get P5047 revisions
Get P5048 revisions
Get P5049 revisions
Get P505 revisions
Get P5050 revisions
Get P5051 revisions
Get P5052 revisions
Get P5053 revisions
Get P5054 revisions
Get P5055 revisions
Get P5056 revisions
Get P5057 revisions
Get P5058 revisions
Get P5059 revisions
Get P506 revisions
Get P5061 revisions
Get P5062 revisions
Get P5063 revisions
Get P5064 revisions
Get P5065 revisions
Get P5066 revisions
Get P5067 revisions
Get P5068 revisions
Get P5069 revisions
Get P507 revisions
Get P5070 revisions
Get P5071 revisions
Get P5072 revisions
Get P5073 revisions
Get P5075 revisions
Get P5076 revisions
Get P5077 revisions
Get P5078 revisions
Get P5079 revisions
Get P508 revisions
Get P5080 revisions
Get P5081 revisions
Get P5082 revisions
Get P5083 revisions
Get P5084 revisions
Get P5085 revisions
Get P5086 revisions
Get P5087 revisions
Get P5088 revisions
Get P509 revisions
Get P5090 revisions
Get P

Get P5438 revisions
Get P5439 revisions
Get P5440 revisions
Get P5441 revisions
Get P5442 revisions
Get P5443 revisions
Get P5444 revisions
Get P5445 revisions
Get P5446 revisions
Get P5447 revisions
Get P5448 revisions
Get P5449 revisions
Get P545 revisions
Get P5450 revisions
Get P5451 revisions
Get P5452 revisions
Get P5453 revisions
Get P5454 revisions
Get P5455 revisions
Get P5456 revisions
Get P5457 revisions
Get P5458 revisions
Get P5459 revisions
Get P546 revisions
Get P5460 revisions
Get P5461 revisions
Get P5462 revisions
Get P5463 revisions
Get P5464 revisions
Get P5465 revisions
Get P5466 revisions
Get P5467 revisions
Get P5468 revisions
Get P5469 revisions
Get P547 revisions
Get P5470 revisions
Get P5471 revisions
Get P5473 revisions
Get P5474 revisions
Get P5475 revisions
Get P5476 revisions
Get P5477 revisions
Get P5478 revisions
Get P5479 revisions
Get P548 revisions
Get P5480 revisions
Get P5481 revisions
Get P5482 revisions
Get P5483 revisions
Get P5485 revisions
Get 

Get P689 revisions
Get P690 revisions
Get P691 revisions
Get P692 revisions
Get P693 revisions
Get P694 revisions
Get P695 revisions
Get P696 revisions
Get P697 revisions
Get P698 revisions
Get P699 revisions
Get P700 revisions
Get P701 revisions
Get P702 revisions
Get P703 revisions
Get P704 revisions
Get P705 revisions
Get P706 revisions
Get P707 revisions
Get P708 revisions
Get P709 revisions
Get P710 revisions
Get P711 revisions
Get P712 revisions
Get P713 revisions
Get P714 revisions
Get P715 revisions
Get P716 revisions
Get P717 revisions
Get P718 revisions
Get P720 revisions
Get P721 revisions
Get P722 revisions
Get P723 revisions
Get P724 revisions
Get P725 revisions
Get P726 revisions
Get P727 revisions
Get P729 revisions
Get P730 revisions
Get P731 revisions
Get P732 revisions
Get P733 revisions
Get P734 revisions
Get P735 revisions
Get P736 revisions
Get P737 revisions
Get P739 revisions
Get P740 revisions
Get P741 revisions
Get P742 revisions
Get P744 revisions
Get P745 rev

### Save data into a CSV

In [68]:
# Remove the 'userhidden' and 'suppressed' keys from every collected row,
# when present.
for row in output:
    for flag in ('userhidden', 'suppressed'):
        row.pop(flag, None)
    

In [67]:
import csv
import os

# Header: every key that occurs in any row of `output`, in first-seen order.
# The previous version read the keys of all_revisions[0], i.e. of whichever
# property the collection loop handled last; that fails when that list is
# empty and makes DictWriter raise as soon as another row has an extra key.
keys = list(dict.fromkeys(key for row in output for key in row))

# Make sure the target directory exists before opening the file.
os.makedirs('data', exist_ok=True)

# newline='' is what the csv module expects for files it writes; an explicit
# encoding keeps the file independent of the platform default.
with open('data/all_revisions.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)