In [271]:
import pandas as pd
import numpy as np
import requests
import json
from tqdm import tqdm

## Get Revision Quality Scores.
To analyze the shift in quality of Wikipedia articles, we can query Wikipedia's internal tool ORES to get estimates of the quality of an article. For Turkish Wikipedia, ORES implements three scoring models, namely ```articlequality```, ```damaging``` and ```goodfaith```, all of which we retrieve in this code. We are interested in edits (revisions) to articles, so I restrict myself to namespace ```0```, and I only take the relevant study periods, meaning the pre-block, block and post-block periods defined before, to save time on the queries to the ORES API.

In [4]:
# Root directory of the Turkish Wikipedia data: the revision dump is read from
# it and the ORES score files are written to it.
DATA_PATH = '/dlabdata1/turkish_wiki'

In [39]:
# Load the revision events, restricted to the columns used further down.
revision_columns = [
    'event_entity',
    'event_type',
    'revision_id',
    'event_timestamp',
    'page_namespace',
    'revision_is_identity_revert',
    'event_user_id',
    'event_user_is_bot_by',
]
# Malformed lines are skipped with a warning instead of aborting the load.
# NOTE(review): `error_bad_lines` / `warn_bad_lines` are deprecated in recent
# pandas in favour of `on_bad_lines='warn'` — switch once the pandas version
# used for this notebook is confirmed.
revisions = pd.read_csv(
    f'{DATA_PATH}/revision_events.tsv.gz',
    sep="\t",
    usecols=revision_columns,
    error_bad_lines=False,
    warn_bad_lines=True,
    compression='gzip',
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# First and last day of the block, as timezone-aware (UTC) timestamps.
block_start = pd.to_datetime('2017-04-29', utc=True)
block_end = pd.to_datetime('2020-01-15', utc=True)
block_dates = [block_start, block_end]

# Length of the block in whole days.
blocked_days = (block_end - block_start).days

In [44]:
# Keep only newly created revisions that are in the main namespace (0),
# were made by non-bot users and are not identity reverts.
create_revision_mask = (revisions.event_entity == 'revision') & (revisions.event_type == 'create')
ns_mask = revisions.page_namespace == 0
# `!= True` (rather than `== False`) also keeps rows where the flag is missing.
norevert_mask = revisions.revision_is_identity_revert != True
# A revision counts as human-made when no bot marker is set for its user.
nobot_mask = revisions.event_user_is_bot_by.isna()

# `.copy()` detaches the filtered rows from the original frame, so the column
# assignment below works on an independent DataFrame instead of a slice
# (which would raise pandas' SettingWithCopyWarning).
revisions = revisions[create_revision_mask & ns_mask & nobot_mask & norevert_mask].copy()
# Parse timestamps as UTC; unparseable values become NaT (errors='coerce').
revisions['event_timestamp'] = pd.to_datetime(revisions['event_timestamp'], utc=True, errors='coerce')

In [47]:
# Keep only revisions made no earlier than `blocked_days` (991) days before the
# block started, i.e. a pre-block window as long as the block itself.
earliest_kept = block_dates[0] - pd.Timedelta(days=blocked_days)
revisions = revisions.loc[revisions['event_timestamp'] >= earliest_kept]

In [50]:
# The revision IDs are later joined with '|' into the query string, so they must
# be strings. They are cast to int first so that a float-typed column
# (presumably how pandas parsed it — verify) does not yield IDs like "123.0".
# `assign` returns a new frame instead of setting an attribute on a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
revisions = revisions.assign(revision_id=revisions['revision_id'].astype(int).astype(str))
revision_ids = revisions['revision_id'].values

In [244]:
# The API only accepts 50 revisions per call, so the IDs are sent in batches
# produced by this helper.
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice except possibly the last one has exactly ``n`` items; the
    slices have whatever type slicing ``lst`` produces.
    """
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

In [228]:
# Accumulators for the results of the three scoring models; each list collects
# one record (dict) per scored revision.
article_qualities, damaging, goodfaith = [], [], []

In [None]:
# Query ORES for every revision, 50 revision IDs per request, and collect the
# class probabilities of the three models in `article_qualities`, `damaging`
# and `goodfaith`.
# NOTE(review): ORES has reportedly been superseded by Lift Wing — confirm that
# this endpoint is still served before re-running.

def _probability(scores, model):
    """Return the class-probability dict of `model` inside `scores`, or None if it is absent."""
    entry = scores.get(model) if isinstance(scores, dict) else None
    return entry.get('probability') if isinstance(entry, dict) else None


n_requests = -(-len(revision_ids) // 50)  # ceil(len / 50): gives tqdm a total for the progress bar
failed_chunks = []  # indices of the chunks that could not be fetched, so they can be retried

for index, chunk in enumerate(tqdm(chunks(revision_ids, 50), total=n_requests)):

    try:
        # Query the API with 50 revision IDs
        query = '|'.join(chunk)
        response = requests.get(
            f'https://ores.wikimedia.org/scores/trwiki/?models=articlequality|damaging|goodfaith&revids={query}',
            timeout=60,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError):
        # Best effort: a network error, HTTP error or non-JSON body only skips
        # this chunk. Unlike a bare `except`, this lets KeyboardInterrupt and
        # programming errors surface.
        failed_chunks.append(index)
        continue

    if not isinstance(data, dict):
        # Unexpected payload shape; treat it like a failed request.
        failed_chunks.append(index)
        continue

    # Process the response and append to the lists. A revision lacking one
    # model's score no longer discards the remaining revisions of the chunk.
    for elem, scores in data.items():
        quality = _probability(scores, 'articlequality')
        damage = _probability(scores, 'damaging')
        goodf = _probability(scores, 'goodfaith')

        if quality is not None:
            article_qualities.append({
                "rev_id" : elem,
                "quality" : quality
            })

        if damage is not None:
            damaging.append({
                "rev_id" : elem,
                "damaging" : damage
            })

        if goodf is not None:
            goodfaith.append({
                "rev_id" : elem,
                "goodfaith" : goodf
            })

    # Save the results every 1000 iterations. A failure to write the checkpoint
    # is deliberately not swallowed: the collected lists stay in memory.
    if index % 1000 == 0:
        with open(f"{DATA_PATH}/article_quality.json", "w") as outfile:
            json.dump(article_qualities, outfile)
        with open(f"{DATA_PATH}/damaging.json", "w") as outfile:
            json.dump(damaging, outfile)
        with open(f"{DATA_PATH}/goodfaith.json", "w") as outfile:
            json.dump(goodfaith, outfile)

if failed_chunks:
    print(f"{len(failed_chunks)} of {n_requests} requests failed; their indices are in `failed_chunks`.")



0it [00:00, ?it/s][A
9513it [00:05, 1867.67it/s][A
9520it [00:30,  3.31s/it]  

In [243]:
# Final dump of the collected scores, one JSON file per model, so that
# everything gathered since the last periodic save is persisted as well.
final_outputs = [
    (f"{DATA_PATH}/article_quality.json", article_qualities),
    (f"{DATA_PATH}/damaging.json", damaging),
    (f"{DATA_PATH}/goodfaith.json", goodfaith),
]
for output_path, score_records in final_outputs:
    with open(output_path, "w") as outfile:
        json.dump(score_records, outfile)

In [255]:
# Turn each list of score records into a DataFrame (one row per scored revision).
df_article_quality, df_damaging, df_goodfaith = (
    pd.DataFrame(score_records)
    for score_records in (article_qualities, damaging, goodfaith)
)

In [274]:
# Expand the per-class probability dicts into one column per quality class.
# Building the frame directly from the list of dicts avoids creating one Series
# per row (`apply(pd.Series)`), and selecting the columns by name guarantees
# that every probability lands under its own label regardless of key order.
quality_classes = ['b', 'baslagıç', 'c', 'km', 'sm', 'taslak']
quality_probabilities = pd.DataFrame(
    df_article_quality['quality'].tolist(), index=df_article_quality.index
)
missing_quality_classes = set(quality_classes) - set(quality_probabilities.columns)
assert not missing_quality_classes, f"classes missing from the ORES scores: {missing_quality_classes}"
df_article_quality[quality_classes] = quality_probabilities[quality_classes]

In [275]:
# Expand the {'false': p, 'true': p} dicts into two probability columns.
# Building the frame from the list of dicts avoids the per-row Series creation
# of `apply(pd.Series)`, and selecting by name keeps 'false'/'true' correctly
# labelled regardless of key order.
damaging_classes = ['false', 'true']
damaging_probabilities = pd.DataFrame(
    df_damaging['damaging'].tolist(), index=df_damaging.index
)
missing_damaging_classes = set(damaging_classes) - set(damaging_probabilities.columns)
assert not missing_damaging_classes, f"classes missing from the ORES scores: {missing_damaging_classes}"
df_damaging[damaging_classes] = damaging_probabilities[damaging_classes]

In [276]:
# Expand the {'false': p, 'true': p} dicts into two probability columns.
# Building the frame from the list of dicts avoids the per-row Series creation
# of `apply(pd.Series)`, and selecting by name keeps 'false'/'true' correctly
# labelled regardless of key order.
goodfaith_classes = ['false', 'true']
goodfaith_probabilities = pd.DataFrame(
    df_goodfaith['goodfaith'].tolist(), index=df_goodfaith.index
)
missing_goodfaith_classes = set(goodfaith_classes) - set(goodfaith_probabilities.columns)
assert not missing_goodfaith_classes, f"classes missing from the ORES scores: {missing_goodfaith_classes}"
df_goodfaith[goodfaith_classes] = goodfaith_probabilities[goodfaith_classes]

In [277]:
# Display the good-faith scores together with the expanded 'false'/'true' columns.
df_goodfaith

Unnamed: 0,rev_id,goodfaith,false,true
0,14613355,"{'false': 0.7179449317240227, 'true': 0.282055...",0.717945,0.282055
1,14613357,"{'false': 0.7522729475492588, 'true': 0.247727...",0.752273,0.247727
2,14613359,"{'false': 0.8090542141849668, 'true': 0.190945...",0.809054,0.190946
3,14613360,"{'false': 0.010182771859936657, 'true': 0.9898...",0.010183,0.989817
4,14613363,"{'false': 0.015441106549229366, 'true': 0.9845...",0.015441,0.984559
...,...,...,...,...
5037300,24769173,"{'false': 0.03215497220448493, 'true': 0.96784...",0.032155,0.967845
5037301,24769174,"{'false': 0.011484112002788427, 'true': 0.9885...",0.011484,0.988516
5037302,24769175,"{'false': 0.020319365225920927, 'true': 0.9796...",0.020319,0.979681
5037303,24769176,"{'false': 0.17059470815317868, 'true': 0.82940...",0.170595,0.829405
