In [194]:
import mwclient
import json
import time

DIR = "wiki_history"
site = mwclient.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [195]:
revs = list(page.revisions())

In [196]:
revs[0]

OrderedDict([('revid', 1108886469),
             ('parentid', 1108063744),
             ('minor', ''),
             ('user', 'Keith D'),
             ('timestamp',
              time.struct_time(tm_year=2022, tm_mon=9, tm_mday=6, tm_hour=20, tm_min=6, tm_sec=58, tm_wday=1, tm_yday=249, tm_isdst=-1)),
             ('comment', 'BBC news -> BBC News')])

In [None]:
contents = list(page.revisions(prop="content"))

In [None]:
contents[0].keys()

In [None]:
comments = list(page.revisions(prop="comment"))

In [None]:
comments[0]

In [42]:
for (rev, content) in zip(revs, contents):
    if isinstance(rev['timestamp'], time.struct_time):
        rev['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", rev['timestamp'])
    with open(f"{DIR}/{rev['timestamp']}.json", "w+") as f:
        data = json.dumps({'info': rev, 'content': content, 'comment': comment}, indent=4)
        f.write(data)

In [169]:
import os

files = sorted([f for f in os.listdir(DIR) if f.endswith("json")])

In [170]:
files[0]

'2009-03-08T16:41:07.json'

In [171]:
d = json.load(open(f"{DIR}/{files[0]}"))

In [172]:
d

{'info': {'revid': 275832581,
  'parentid': 0,
  'user': 'Pratyeka',
  'timestamp': '2009-03-08T16:41:07',
  'comment': 'creation (stub)'},
 'content': {'contentformat': 'text/x-wiki',
  'contentmodel': 'wikitext',
  '*': "'''Bitcoin''' is an open source peer-to-peer electronic cash system developed by Satoshi Nakamoto that's completely [[P2P|decentralized\\\\, with no central server or trusted parties. Users hold the [[cryptography|crypto]] keys to their own money and transact directly with each other, with the help of the network to check for double-spending.\n\n==See also==\n*[[Anonymous internet banking]]\n*[[eCache]]\n*[[Pecunix]]\n*[[Ripple monetary system]]\n*[[Yodelbank]]\n\n==External links==\n*[http://www.bitcoin.org Official website]\n\n{{stub}}"}}

In [188]:
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [None]:
edits = {}
prev_content = ""

for fname in files:
    with open(f"{DIR}/{fname}") as f:
        data = json.load(f)
        
    if "*" not in data["content"]:
        continue
        
    date = data["info"]["timestamp"].split("T")[0]
    if date not in edits:
        edits[date] = dict(ratios=list(), differences=list(), editor_names=set(), sentiments=list())
    
    content = data["content"]["*"]
    
    difference = abs(len(content) - len(prev_content))
    ratio = difference / (len(prev_content) + 1)
    
    edits[date]["differences"].append(difference)
    edits[date]["ratios"].append(ratio)
    edits[date]["editor_names"].add(data["info"]["user"])
    
    comment = data["info"]["comment"]
    if comment:
        edits[date]["sentiments"].append(find_sentiment(comment))
    
    prev_content = content

In [156]:
from statistics import mean

for key in edits:
    edits[key]["edit_count"] = len(edits[key]["ratios"])
    edits[key]["editor_count"] = len(edits[key]["editor_names"])
    
    edits[key]["ratio"] = mean(edits[key]["ratios"])
    edits[key]["difference"] = mean(edits[key]["differences"])
    edits[key]["sentiment"] = mean(edits[key]["sentiment"])
    edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiment"] if s < 0]) / len(edits[key][sentiment])
    edits[key]["total_difference"] = sum(edits[key]["differences"])
    
    del edits[key]["ratios"]
    del edits[key]["differences"]
    del edits[key]["editor_names"]

In [157]:
import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [158]:
edits_df

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-03-08,4,2,132.262639,139.000000,556
2009-08-05,1,1,2.502693,1394.000000,1394
2009-08-06,2,1,0.004619,9.000000,18
2009-08-14,1,1,0.004656,9.000000,9
2009-10-13,2,2,0.034921,69.000000,138
...,...,...,...,...,...
2022-08-02,1,1,0.000018,4.000000,4
2022-08-14,1,1,0.001101,246.000000,246
2022-08-17,3,2,0.000971,216.666667,650
2022-08-23,2,2,0.000161,36.000000,72


In [159]:
edits_df.index = pd.to_datetime(edits_df.index)

In [160]:
from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [161]:
edits_df = edits_df.reindex(dates, fill_value=0)

In [162]:
edits_df

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-03-08,4,2,132.262639,139.0,556
2009-03-09,0,0,0.000000,0.0,0
2009-03-10,0,0,0.000000,0.0,0
2009-03-11,0,0,0.000000,0.0,0
2009-03-12,0,0,0.000000,0.0,0
...,...,...,...,...,...
2022-08-27,0,0,0.000000,0.0,0
2022-08-28,0,0,0.000000,0.0,0
2022-08-29,2,1,0.000532,119.0,238
2022-08-30,0,0,0.000000,0.0,0


In [163]:
rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [164]:
rolling_edits = rolling_edits.dropna()

In [165]:
rolling_edits

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-04-06,0.133333,0.066667,4.408755,4.633333,18.533333
2009-04-07,0.000000,0.000000,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...
2022-08-27,0.300000,0.266667,0.000076,16.922222,32.733333
2022-08-28,0.233333,0.200000,0.000075,16.755556,32.400000
2022-08-29,0.300000,0.233333,0.000093,20.722222,40.333333
2022-08-30,0.300000,0.233333,0.000093,20.722222,40.333333


In [166]:
rolling_edits.to_csv("wikipedia_edits.csv")