In [1]:
import requests, re, random
from SPARQLWrapper import SPARQLWrapper, JSON

### Data Gathering

In [2]:
def get_revisions(Property,rvcontinue=None):
    """
    Input:
    Property,rvcontinue
    
    Output:
    revisions, rvcontinue
    """
    URL = "https://www.wikidata.org/w/api.php"
    # defining a params dict for the parameters to be sent to the API
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "rvlimit":"500",
              "titles":"Property:{}".format(Property),
              "rvdir":"newer",
              "rvcontinue": rvcontinue}
    
    # sending get request and saving the response as response object
    r = requests.get(url = URL,params = PARAMS)

    # extracting data in json format
    data = r.json()
    
    # Get revisions
    pages = data['query']['pages']
    keys = pages.keys()
    revisions = [pages[k]['revisions'] for k in keys]
    rvcontinue = data['continue']['rvcontinue'] if 'continue' in data else None
    
    return revisions, rvcontinue

In [3]:
def get_all_revisions(Property,rvcontinue=None):
    
    all_revisions = []
        
    while True:
        revisions, rvcontinue = get_revisions(Property,rvcontinue)
        all_revisions += revisions[0]
        
        if rvcontinue is None:
            break

    return all_revisions

In [12]:
all_revisions = get_all_revisions('P54')

### Data parsing


|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

wbsetlabeldescriptionaliases
wbeditentity-create

In [5]:
tag_pattern = "(?<=\/\*)(.*)(?=\*\/)"
description_pattern = "(?<=\*\/)[^\]]+"
language_pattern = "(?<=\|)[^\]]+"

In [6]:
attributes_dict = {"wbsetlabel-":["label"],
                   "wbeditentity-create":["label"],
                   "wbsetdescription-":["description"],
                   "wbsetaliases-":["alias"],
                   "wbsetlabeldescriptionaliases":["label","description","alias"]}

actions_dict = {"-add":"add",
                "-update":"update",
                "-set":"update",
                "-add-remove":"update",
                "-remove":"remove"}

In [17]:
def parse_revisions(all_revisions):
    for revision in all_revisions.copy():
        print("test")
        print(revision)
        if(not comment in revision):
            pri
        comment = revision['comment']
        tag = re.findall(tag_pattern, comment)
        if(not tag):
            all_revisions.remove(revision)
            continue

        tag =tag[0]
        attributes = [v for k,v in attributes_dict.items() if k in tag]

        # If there is no attribute, we delete the element from the list
        if(not attributes):
            all_revisions.remove(revision)
            continue
        attributes = attributes[0]

        action = [v for k,v in actions_dict.items() if k in tag]
        if not action:
            if 'wbeditentity-create' in tag:
                action = ['add']
            elif 'wbsetlabeldescriptionaliases' in tag:
                action = ['uptdate']
        #description =re.findall(description_pattern, comment)[0].strip()
        language = re.findall(language_pattern,tag)[0].strip()

        revision['label'] = language if 'label' in attributes else None
        revision['description'] = language if 'description' in attributes else None
        revision['alias'] = language if 'alias' in attributes else None

        revision['action'] = action[0]
        #revision['description'] = description
        revision['language'] = language

        revision.pop('comment', None)
        revision.pop('parentid',None)
        revision.pop('user',None)
        if('anon' in revision):
            revision.pop('anon',None)
    
    return all_revisions

### Gather data from several properties

In [8]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT DISTINCT ?property
    WHERE
    {
      ?property rdf:type wikibase:Property.
    }
    ORDER by ?property""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

property_name_list = []
for result in results["results"]["bindings"]:
    property_name_list.append(re.findall('([^\/]*)$',result['property']['value'])[0])

In [9]:
random.seed(1000)
property_number = 500
sample_property_name_list = random.choices(property_name_list,k=property_number - 1)
sample_property_name_list += ['P856']

In [13]:
sample_property_name_list = list(set(sample_property_name_list))

In [11]:
output = []

for property_name in property_name_list:#sample_property_name_list
    print("Get {} revisions".format(property_name))
    all_revisions = get_all_revisions(property_name)
    all_revisions = parse_revisions(all_revisions)
    for r in all_revisions:
        r['property'] = property_name
    output += all_revisions

Get P10 revisions
Get P101 revisions
Get P102 revisions
Get P103 revisions
Get P105 revisions
Get P106 revisions
Get P108 revisions
Get P109 revisions
Get P110 revisions
Get P111 revisions
Get P112 revisions
Get P113 revisions
Get P114 revisions
Get P115 revisions
Get P117 revisions
Get P118 revisions
Get P119 revisions
Get P121 revisions
Get P122 revisions
Get P123 revisions
Get P126 revisions
Get P127 revisions
Get P128 revisions
Get P129 revisions
Get P131 revisions
Get P134 revisions
Get P135 revisions
Get P136 revisions
Get P137 revisions
Get P138 revisions
Get P14 revisions
Get P140 revisions
Get P141 revisions
Get P143 revisions
Get P144 revisions
Get P149 revisions
Get P15 revisions
Get P150 revisions
Get P154 revisions
Get P155 revisions
Get P156 revisions
Get P157 revisions
Get P158 revisions
Get P159 revisions
Get P16 revisions
Get P161 revisions
Get P162 revisions
Get P163 revisions
Get P166 revisions
Get P167 revisions
Get P169 revisions
Get P17 revisions
Get P170 revision

KeyError: 'comment'

### Save data into a CSV

In [16]:
import csv

keys = all_revisions[0].keys()

with open('all_revisions.csv', 'w+') as output_file: # wb if python <3.x 
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)

# Test

In [19]:
all_revisions = get_all_revisions('P54')
parse_revisions(all_revisions)

test
{'revid': 5801553, 'parentid': 0, 'user': 'Sven Manguard', 'timestamp': '2013-02-05T01:53:43Z', 'comment': '/* special-create-property:2|en */ member of sports team, team or club that a sportsperson currently represents'}
test
{'revid': 5801610, 'parentid': 5801553, 'user': 'Jdforrester', 'timestamp': '2013-02-05T01:55:29Z', 'comment': '/* wbsetdescription-set:1|en */ subject is or was a member of the sports team object'}
test
{'revid': 5801675, 'parentid': 5801610, 'user': 'Jdforrester', 'timestamp': '2013-02-05T01:57:16Z', 'comment': '/* wbsetdescription-set:1|en */ subject is currently a member of the sports team object'}
test
{'revid': 5802058, 'parentid': 5801675, 'user': 'Delusion23', 'timestamp': '2013-02-05T02:10:04Z', 'comment': '/* wbsetdescription-set:1|en */ team or club that a sportsperson currently represents'}
test
{'revid': 5811818, 'parentid': 5802058, 'user': 'Hawk-Eye', 'timestamp': '2013-02-05T07:07:40Z', 'comment': "/* wbsetlabel-set:1|fr */ Membre actuel d'un

KeyError: 'comment'