In [239]:
import csv
import datetime
import json
import random
import re
import time

import requests
from SPARQLWrapper import SPARQLWrapper, JSON

### Data Gathering

In [196]:
def get_revisions(Property,rvcontinue=None):
    """
    Fetch one batch of revisions of a Wikidata property page.

    Input:
    Property   -- property identifier; it is queried as the page title
                  "Property:<Property>".
    rvcontinue -- continuation token returned by a previous call, or None to
                  send the request without one.

    Output:
    revisions, rvcontinue
    revisions  -- list with one entry per page of the response; each entry is
                  that page's list of revision dicts, or an empty list when
                  the page carries no 'revisions' key.
    rvcontinue -- token for the next batch, or None when the response has no
                  'continue' section.

    Raises:
    requests.HTTPError when the API answers with an HTTP error status.
    KeyError when the JSON body has no 'query'/'pages' section.
    """
    URL = "https://www.wikidata.org/w/api.php"
    # defining a params dict for the parameters to be sent to the API
    PARAMS = {"action":"query",
              "prop":"revisions",
              "format":"json",
              "titles":"Property:{}".format(Property),
              "rvlimit":"500",
              "rvdir":"newer",
              "rvprop": "content|timestamp|comment"}
    # Only send the continuation token when there is one, instead of relying
    # on the HTTP client to drop a None value.
    if rvcontinue is not None:
        PARAMS["rvcontinue"] = rvcontinue

    # sending get request and saving the response as response object
    r = requests.get(url = URL,params = PARAMS)
    # Fail loudly on HTTP errors rather than trying to parse an error page.
    r.raise_for_status()

    # extracting data in json format
    data = r.json()

    # Get revisions; a page without a 'revisions' key contributes an empty list
    pages = data['query']['pages']
    revisions = [page.get('revisions', []) for page in pages.values()]
    next_token = data['continue']['rvcontinue'] if 'continue' in data else None

    return revisions, next_token

In [420]:
def get_all_revisions(Property,rvcontinue=None,continue_limit=10):
    """
    Collect the revisions of a property by following the API continuation.

    Input:
    - Property: property identifier forwarded to get_revisions
    - rvcontinue: continuation token to start from, or None
    - continue_limit: bound on the number of batches. The counter starts at 0
      and is compared with `>` after each request, so at most
      continue_limit + 2 batches are fetched.

    output :

    - all_revisions: flat list made of the first page's revisions of every
      fetched batch
    - limit_reached: True when fetching stopped because of continue_limit
      while a continuation token was still pending, False when the API
      reported no further batch

    """

    all_revisions, i = [], 0

    while True:
        revisions, rvcontinue = get_revisions(Property,rvcontinue)
        # Only the first page of the batch is used; an empty batch adds nothing
        # instead of raising IndexError.
        if revisions:
            all_revisions += revisions[0]

        if rvcontinue is None:
            return all_revisions, False

        if i > continue_limit:
            return all_revisions, True

        i += 1

In [428]:
# Try the download on a single property (P54) before crawling many of them.
all_revisions,limit_reached = get_all_revisions('P54')

### Data parsing

#### Method 1: using comment

|Attribute |Add| Update |Remove|
|---|---|---|---|
|label |wbsetlabel-add| wbsetlabel-set| wbsetlabel-remove|
|description |wbsetdescription-add| wbsetdescription-set| wbsetdescription-remove|
|alias |wbsetaliases-add |wbsetaliases-add-remove| wbsetaliases-remove|

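Two further tags do not follow the add/update/remove naming:

- `wbsetlabeldescriptionaliases`: label, description and alias are set in one edit (treated as an update).
- `wbeditentity-create`: treated as an `add` of the label.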

In [5]:
# Regexes for edit summaries of the form "/* <tag> */ <free text>".
# Raw strings keep the backslashes literal: the compiled patterns are unchanged,
# but Python no longer warns about invalid escape sequences such as "\/".

# Everything between "/*" and "*/".
tag_pattern = r"(?<=\/\*)(.*)(?=\*\/)"
# Text following "*/", up to the first "]".
description_pattern = r"(?<=\*\/)[^\]]+"
# Text following "|", up to the first "]"; applied to the tag to get the language.
language_pattern = r"(?<=\|)[^\]]+"

In [6]:
# Tag fragment -> attributes touched by the edit. Matching is done by
# substring ("key in tag").
attributes_dict = {"wbsetlabel-":["label"],
                   "wbeditentity-create":["label"],
                   "wbsetdescription-":["description"],
                   "wbsetaliases-":["alias"],
                   "wbsetlabeldescriptionaliases":["label","description","alias"]}

# Tag suffix -> action. Matching is also by substring and callers take the
# first hit, so "-add-remove" must be listed before "-add" and "-remove":
# otherwise "wbsetaliases-add-remove" would be classified as "add".
actions_dict = {"-add-remove":"update",
                "-add":"add",
                "-update":"update",
                "-set":"update",
                "-remove":"remove"}

In [22]:
def parse_revisions(all_revisions):
    """
    Turn revision comments into label/description/alias change records.

    Input:
    all_revisions -- list of revision dicts. It is filtered in place, and the
                     dicts that are kept are modified in place.

    Output:
    The same list object, holding only the revisions whose comment carries a
    tag listed in attributes_dict. Each kept revision gains the keys
    'label', 'description', 'alias' (the language when that attribute is
    touched, else None), 'action' and 'language', and loses 'comment',
    'parentid', 'user' and 'anon'.

    Revisions are dropped when they have no comment, no "/* ... */" tag, no
    known attribute, no recognisable action or no language in the tag.
    """
    kept = []

    for revision in all_revisions:

        if 'comment' not in revision:
            continue

        tags = re.findall(tag_pattern, revision['comment'])
        if not tags:
            continue
        tag = tags[0]

        # First attribute entry (in dict order) whose key occurs in the tag
        attributes = next((v for k, v in attributes_dict.items() if k in tag), None)
        if attributes is None:
            continue

        # Several suffixes can occur in one tag ("-add-remove" also contains
        # "-add" and "-remove"); the longest match is the specific one.
        matching_suffixes = [k for k in actions_dict if k in tag]
        if matching_suffixes:
            action = actions_dict[max(matching_suffixes, key=len)]
        elif 'wbeditentity-create' in tag:
            action = 'add'
        elif 'wbsetlabeldescriptionaliases' in tag:
            action = 'update'
        else:
            # Known attribute but unknown action: nothing reliable to record
            continue

        languages = re.findall(language_pattern, tag)
        if not languages:
            continue
        language = languages[0].strip()

        revision['label'] = language if 'label' in attributes else None
        revision['description'] = language if 'description' in attributes else None
        revision['alias'] = language if 'alias' in attributes else None

        revision['action'] = action
        revision['language'] = language

        for dropped_key in ('comment', 'parentid', 'user', 'anon'):
            revision.pop(dropped_key, None)

        kept.append(revision)

    # Keep the in-place contract: callers holding the list see the filtered result
    all_revisions[:] = kept
    return all_revisions

#### Method 2: using content

In [235]:
def parse_content(revisions):
    """
    Find the revision in which each label language first appears.

    Input:
    revisions -- iterable of revision dicts, in the order they should be
                 scanned; each is expected to hold the entity JSON as a
                 string under '*' and a 'timestamp'.

    Output:
    List of dicts {'timestamp', 'property', 'language'}: one per label
    language, taken from the first scanned revision whose 'labels' contain
    that language. 'property' is the 'id' field of the entity JSON.

    Revisions without a '*' entry or without labels are skipped. Every
    language that is new in a revision is recorded with that revision's
    timestamp (a revision may introduce several languages at once).
    """
    output, seen_languages = [], set()
    for revision in revisions:

        raw_content = revision.get('*')
        if raw_content is None:
            continue
        content = json.loads(raw_content)

        labels = content.get('labels')
        if not labels:
            continue

        for language in labels:
            if language in seen_languages:
                continue
            seen_languages.add(language)
            output.append({'timestamp':revision['timestamp'],
                           'property':content['id'],
                           'language':language})

    return output

### Gather data from several properties

In [218]:
# Ask the Wikidata query service for every property.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""SELECT DISTINCT ?property
    WHERE
    {
      ?property rdf:type wikibase:Property.
    }
    ORDER by ?property""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Keep only what follows the last "/" of each returned value (the whole value
# when it contains no "/").
property_name_list = [
    result['property']['value'].rsplit('/', 1)[-1]
    for result in results["results"]["bindings"]
]

In [227]:
# Seed the generator so the draw is repeatable for a given property_name_list.
random.seed(1000)
property_number = 50
# random.choices draws with replacement, so the sample may contain repeats;
# P856 is always appended as the last entry.
sample_property_name_list = [
    *random.choices(property_name_list, k=property_number - 1),
    'P856',
]

In [228]:
# Drop duplicates while keeping first-seen order. A set would also dedupe, but
# its iteration order for strings can differ from one Python process to the
# next, which would undo the effect of the fixed random seed.
sample_property_name_list = list(dict.fromkeys(sample_property_name_list))

In [440]:
def get_raw_data(properties):
    """
    Crawl the revision history of each property and save, per property, the
    first revision in which each label language shows up.

    Input:
    properties -- sized iterable of property identifiers, passed one by one
                  to get_all_revisions

    Output:
    None. Rows with the columns timestamp, property, language are written to
    'data/all_labels.csv'. The file is truncated at the start of the call and
    appended to after each property, so an interrupted run keeps what was
    already fetched.

    A property whose download or parsing raises is reported and skipped. The
    crawl stops at the first property for which get_all_revisions reports
    limit_reached, without writing that property's rows.
    """
    total_property = len(properties)
    step = 25

    keys = ['timestamp', 'property', 'language']

    # Start from a fresh file that holds only the header row.
    # newline='' lets the csv module control line endings itself.
    with open('data/all_labels.csv', 'w+', newline='') as output_file:
        csv.DictWriter(output_file, keys).writeheader()

    for i, property_name in enumerate(properties):

        # Sleep 300ms before each property's batch of requests
        time.sleep(0.3)

        try:
            all_revisions, limit_reached = get_all_revisions(property_name)
            rows = parse_content(all_revisions)
        except Exception as error:
            # Report and skip this property; falling through would write the
            # previous property's rows again (or fail on the first property).
            print("ERROR on {}: {!r}".format(property_name, error))
        else:
            if limit_reached:
                print("Limit reached on {}".format(property_name))
                break

            # Append this property's rows to the file
            with open('data/all_labels.csv', 'a+', newline='') as output_file:
                csv.DictWriter(output_file, keys).writerows(rows)

        # Log progress every `step` properties
        if i % step == 0:
            now = datetime.datetime.now()
            print("In progress:\t{0:0.2f}%\t{1}:{2}:{3}".format(100*(i/total_property),
                                                                       now.hour,
                                                                       now.minute,
                                                                       now.second))

In [None]:
# Full crawl over every property in property_name_list; results go to data/all_labels.csv.
get_raw_data(property_name_list)

In progress:	0.00%	11:2:45


In [396]:
# NOTE(review): no close() for this handle is visible in the notebook; a
# `with open(...)` block that also performs the read would release it.
f=open("data/test.csv", "r")

In [397]:
# Read the remaining contents of the open handle `f` into one string.
t = f.read()

In [375]:
import pandas as pd

In [401]:
# Peek at the first 100 characters of the text that was read.
t[:100]

'timestamp,contentformat,contentmodel,comment,*\n2016-05-26T09:29:14Z,application/json,wikibase-proper'

In [364]:
# NOTE(review): the preview of `t` shown in this notebook starts with a CSV
# header ('timestamp,contentformat,...'), which json.loads would reject;
# confirm what `t` held when this cell was run.
j = json.loads(t)

In [229]:
output = []

for property_name in property_name_list:#sample_property_name_list
    #print("Get {} revisions".format(property_name))
    # get_all_revisions returns (revisions, limit_reached); unpack it so that
    # parse_revisions receives the list and not the tuple.
    all_revisions, limit_reached = get_all_revisions(property_name)
    all_revisions = parse_revisions(all_revisions)
    for r in all_revisions:
        r['property'] = property_name
    output += all_revisions

KeyboardInterrupt: 

In [None]:
# Scratch check of the two-decimal float format used for progress output.
"{:0.2f}".format(0.1223)

In [217]:
# Scratch check of integer division, as used for `step = total_property//50`.
1234324//50

24686

In [246]:
# get_all_revisions returns (revisions, limit_reached); keep the list itself
# in all_revisions rather than the whole tuple.
all_revisions, limit_reached = get_all_revisions(property_name)

In [341]:
import json
# Save the current value of all_revisions as JSON ('w+' truncates an existing file).
with open('data/raw_data.json', 'w+') as outfile:
    json.dump(all_revisions, outfile)

In [342]:
# Show only the first entries; each revision embeds the full entity JSON, so
# displaying the whole list floods the notebook output.
all_revisions[:2]

[{'timestamp': '2017-12-21T20:54:16Z',
  'contentformat': 'application/json',
  'contentmodel': 'wikibase-property',
  'comment': '/* wbeditentity-create:2|en */ Eldoblaje original actor ID, identifier for an original actor in a Spanish dubbed film',
  '*': '{"type":"property","datatype":"external-id","id":"P4682","labels":{"en":{"language":"en","value":"Eldoblaje original actor ID"}},"descriptions":{"en":{"language":"en","value":"identifier for an original actor in a Spanish dubbed film"}},"aliases":[],"claims":[]}'},
 {'timestamp': '2017-12-21T20:54:28Z',
  'contentformat': 'application/json',
  'contentmodel': 'wikibase-property',
  'comment': '/* wbsetclaim-create:2||1 */ [[Property:P31]]: [[Q19595382]]',
  '*': '{"type":"property","datatype":"external-id","id":"P4682","labels":{"en":{"language":"en","value":"Eldoblaje original actor ID"}},"descriptions":{"en":{"language":"en","value":"identifier for an original actor in a Spanish dubbed film"}},"aliases":[],"claims":{"P31":[{"main

In [None]:
output,i = [],0
total_property = len(property_name_list)
step = total_property//50

for property_name in sample_property_name_list:#sample_property_name_list   property_name_list

    print("Get {} revisions".format(property_name))
    # get_all_revisions returns (revisions, limit_reached); unpack it so that
    # parse_content iterates over the revisions and not over the tuple.
    all_revisions, limit_reached = get_all_revisions(property_name)
    all_revisions = parse_content(all_revisions)

In [236]:
output,i = [],0
total_property = len(property_name_list)
step = total_property//50

for property_name in sample_property_name_list:#sample_property_name_list   property_name_list

    print("Get {} revisions".format(property_name))
    # get_all_revisions returns (revisions, limit_reached); unpack it so that
    # parse_content iterates over the revisions and not over the tuple.
    all_revisions, limit_reached = get_all_revisions(property_name)
    all_revisions = parse_content(all_revisions)
    for r in all_revisions:
        r['property'] = property_name
    output += all_revisions

    # progress
    #if  i%step == 0:
    #    print("{:0.2f}".format(i/total_property))
    #i +=1
    # end progress

Get P2860 revisions
Get P1802 revisions
Get P3581 revisions
Get P3575 revisions
Get P2923 revisions
Get P2662 revisions
Get P2536 revisions
Get P1455 revisions
Get P2688 revisions
Get P4452 revisions
Get P3193 revisions
Get P210 revisions
Get P1365 revisions
Get P856 revisions
Get P5369 revisions
Get P5575 revisions
Get P4609 revisions
Get P4224 revisions
Get P1899 revisions
Get P4477 revisions
Get P3478 revisions
Get P836 revisions
Get P2548 revisions
Get P25 revisions
Get P2203 revisions
Get P3430 revisions
Get P4414 revisions
Get P5006 revisions
Get P859 revisions
Get P2003 revisions
Get P527 revisions
Get P3874 revisions
Get P3480 revisions
Get P887 revisions
Get P2050 revisions
Get P1401 revisions
Get P5540 revisions
Get P3429 revisions
Get P447 revisions
Get P1056 revisions
Get P4992 revisions
Get P5162 revisions
Get P2531 revisions
Get P2183 revisions
Get P674 revisions
Get P5447 revisions
Get P3447 revisions
Get P3805 revisions
Get P1617 revisions
Get P4682 revisions


In [243]:
# Scratch check of the 0.3 s pause used between properties in get_raw_data.
time.sleep(0.3)

### Save data into a CSV

In [68]:
# Clean output
for el in output:
    if 'userhidden' in el:
        el.pop('userhidden',None)
    if 'suppressed' in el:
        el.pop('suppressed',None)
    

In [67]:
import csv

# Build the header from the rows that are actually written: every key found
# in `output`, in first-seen order. Taking it from all_revisions[0] fails on
# an empty list and rejects rows that carry a key the first revision lacks.
keys = list(dict.fromkeys(key for row in output for key in row))

# newline='' lets the csv module control line endings itself.
with open('data/all_revisions.csv', 'w+', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)