In [33]:
import mwclient
import json
import time

# Folder name used for the per-revision JSON files.
DIR = "wiki_history"

# Wiki host and article whose revision history is analysed.
WIKI_HOST = "en.wikipedia.org"
ARTICLE_TITLE = "Bitcoin"

site = mwclient.Site(WIKI_HOST)
page = site.pages[ARTICLE_TITLE]

In [8]:
# Materialise the revision iterator so it can be indexed and reused.
revs = [revision for revision in page.revisions()]

In [9]:
# Inspect the first revision record in the list.
revs[0]

OrderedDict([('revid', 1107421955),
             ('parentid', 1107344027),
             ('user', 'TheCurrencyGuy'),
             ('timestamp',
              time.struct_time(tm_year=2022, tm_mon=8, tm_mday=29, tm_hour=22, tm_min=55, tm_sec=24, tm_wday=0, tm_yday=241, tm_isdst=-1)),
             ('comment', '')])

In [25]:
# Second pass over the revisions, this time requesting the "content" property.
contents = [revision for revision in page.revisions(prop="content")]

In [31]:
# List the fields present on the first content record.
contents[0].keys()

odict_keys(['contentformat', 'contentmodel', '*'])

In [42]:
import os

# `revs` and `contents` were built by two separate calls and are paired purely
# by position. If their lengths differ, zip() would silently truncate and pair
# metadata with the wrong text, so fail loudly instead.
if len(revs) != len(contents):
    raise ValueError(
        f"revision metadata ({len(revs)}) and contents ({len(contents)}) "
        "differ in length; re-fetch both lists before saving"
    )

# open() does not create missing directories.
os.makedirs(DIR, exist_ok=True)

# Write one JSON file per revision, named after the revision timestamp.
# NOTE(review): revisions sharing the same second map to the same file name and
# overwrite each other, and the ":" in the name is not valid on Windows.
for rev, content in zip(revs, contents):
    # struct_time is not JSON-serialisable; store an ISO-like string instead.
    # The isinstance guard keeps the conversion safe when this cell is re-run.
    if isinstance(rev['timestamp'], time.struct_time):
        rev['timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S", rev['timestamp'])
    with open(f"{DIR}/{rev['timestamp']}.json", "w") as f:
        json.dump({'info': rev, 'content': content}, f, indent=4)

In [47]:
import os

# Saved revision files in name order. Match the full ".json" extension so a
# name that merely ends in the letters "json" is not picked up.
files = sorted(name for name in os.listdir(DIR) if name.endswith(".json"))

In [49]:
# First file name in sorted order.
files[0]

'2009-03-08T16:41:07.json'

In [53]:
# Load the first saved revision; the context manager closes the file handle,
# which a bare open() passed straight to json.load() never does explicitly.
with open(f"{DIR}/{files[0]}") as first_file:
    d = json.load(first_file)

In [54]:
# Display the parsed JSON record.
d

{'info': {'revid': 275832581,
  'parentid': 0,
  'user': 'Pratyeka',
  'timestamp': '2009-03-08T16:41:07',
  'comment': 'creation (stub)'},
 'content': {'contentformat': 'text/x-wiki',
  'contentmodel': 'wikitext',
  '*': "'''Bitcoin''' is an open source peer-to-peer electronic cash system developed by Satoshi Nakamoto that's completely [[P2P|decentralized\\\\, with no central server or trusted parties. Users hold the [[cryptography|crypto]] keys to their own money and transact directly with each other, with the help of the network to check for double-spending.\n\n==See also==\n*[[Anonymous internet banking]]\n*[[eCache]]\n*[[Pecunix]]\n*[[Ripple monetary system]]\n*[[Yodelbank]]\n\n==External links==\n*[http://www.bitcoin.org Official website]\n\n{{stub}}"}}

In [112]:
# Per-day edit statistics, keyed by the date part of the revision timestamp.
# Files are visited in the order given by `files`, and each revision's length
# is compared with that of the previously processed revision.
edits = {}
prev_content = ""

for fname in files:
    with open(f"{DIR}/{fname}") as f:
        data = json.load(f)

    body = data["content"]
    # Revisions stored without a "*" entry have no text to measure.
    if "*" not in body:
        continue

    info = data["info"]
    day = info["timestamp"].partition("T")[0]
    bucket = edits.setdefault(
        day, {"ratios": [], "differences": [], "editor_names": set()}
    )

    text = body["*"]

    # Size change in characters, and the same change relative to the previous
    # length; the +1 keeps the very first revision from dividing by zero.
    delta = len(text) - len(prev_content)
    relative = delta / (len(prev_content) + 1)

    bucket["differences"].append(delta)
    bucket["ratios"].append(relative)
    bucket["editor_names"].add(info["user"])

    prev_content = text

In [113]:
from statistics import mean

# Collapse each day's raw per-revision lists into summary statistics.
for key in edits:
    day = edits[key]

    # A day whose raw lists are gone has already been summarized; skipping it
    # lets this cell be re-run without raising KeyError.
    if "ratios" not in day:
        continue

    # Remove the raw lists first so only the summary fields remain on the day.
    ratios = day.pop("ratios")
    differences = day.pop("differences")
    editor_names = day.pop("editor_names")

    day["edit_count"] = len(ratios)
    day["editor_count"] = len(editor_names)

    day["ratio"] = mean(ratios)
    day["difference"] = mean(differences)
    day["total_difference"] = sum(differences)

In [133]:
import pandas as pd

# One row per date key, one column per summary field.
edits_df = pd.DataFrame.from_dict(data=edits, orient="index")

In [134]:
# Display the per-day summary table.
edits_df

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-03-08,4,2,1.322626e+02,139.000000,556
2009-08-05,1,1,2.502693e+00,1394.000000,1394
2009-08-06,2,1,-4.618939e-03,-9.000000,-18
2009-08-14,1,1,4.655975e-03,9.000000,9
2009-10-13,2,2,6.095466e-04,0.000000,0
...,...,...,...,...,...
2022-08-02,1,1,-1.790743e-05,-4.000000,-4
2022-08-14,1,1,-1.101327e-03,-246.000000,-246
2022-08-17,3,2,6.486102e-04,144.666667,434
2022-08-23,2,2,1.296392e-08,0.000000,0


In [135]:
# Replace the index with its datetime-parsed form.
edits_df.index = pd.to_datetime(edits_df.index)

In [136]:
from datetime import datetime

# Daily calendar running from the fixed start date up to the current day.
first_day = "2009-03-08"
last_day = datetime.today()
dates = pd.date_range(start=first_day, end=last_day)

In [137]:
# Align the table to the full calendar; dates without a row get 0 in every column.
edits_df = edits_df.reindex(index=dates, fill_value=0)

In [138]:
# Display the table again to inspect the reindexed result.
edits_df

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-03-08,4,2,132.262639,139.0,556
2009-03-09,0,0,0.000000,0.0,0
2009-03-10,0,0,0.000000,0.0,0
2009-03-11,0,0,0.000000,0.0,0
2009-03-12,0,0,0.000000,0.0,0
...,...,...,...,...,...
2022-08-27,0,0,0.000000,0.0,0
2022-08-28,0,0,0.000000,0.0,0
2022-08-29,2,1,0.000532,119.0,238
2022-08-30,0,0,0.000000,0.0,0


In [140]:
# Mean over a 7-row window; rows with fewer than 7 observations yield NaN.
rolling_edits = edits_df.rolling(window=7, min_periods=7).mean()

In [142]:
# Discard every row that contains at least one missing value.
rolling_edits = rolling_edits.dropna(axis=0, how="any")

In [143]:
# Display the rolling-average table.
rolling_edits

Unnamed: 0,edit_count,editor_count,ratio,difference,total_difference
2009-03-14,0.571429,0.285714,1.889466e+01,19.857143,79.428571
2009-03-15,0.000000,0.000000,0.000000e+00,0.000000,0.000000
2009-03-16,0.000000,0.000000,0.000000e+00,0.000000,0.000000
2009-03-17,0.000000,0.000000,0.000000e+00,0.000000,0.000000
2009-03-18,0.000000,0.000000,0.000000e+00,0.000000,0.000000
...,...,...,...,...,...
2022-08-27,0.285714,0.285714,1.851988e-09,0.000000,0.000000
2022-08-28,0.285714,0.285714,1.851988e-09,0.000000,0.000000
2022-08-29,0.571429,0.428571,7.602596e-05,17.000000,34.000000
2022-08-30,0.285714,0.142857,7.602411e-05,17.000000,34.000000


In [144]:
# Write the rolling-average table to a CSV file (relative path).
rolling_edits.to_csv(path_or_buf="wikipedia_edits.csv")