In [21]:
import mwclient
import time
import pandas as pd

from transformers import pipeline
from statistics import mean
from datetime import datetime


# Which wiki and which article the revision history is pulled from.
WIKI_HOST = "en.wikipedia.org"
ARTICLE_TITLE = "Bitcoin"

# mwclient handle for the wiki, and the page object for the article.
site = mwclient.Site(WIKI_HOST)
page = site.pages[ARTICLE_TITLE]

In [54]:
# Materialise every revision record of the page into a list so it can be
# indexed, sorted and sliced by the following cells.
revs = [revision for revision in page.revisions()]

In [55]:
# Display the first element of `revs` to see which fields a revision record
# carries (the output below shows revid, parentid, user, timestamp, comment).
revs[0]

OrderedDict([('revid', 1136456955),
             ('parentid', 1135999629),
             ('user', 'Blockchainus Maximus'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=1, tm_mday=30, tm_hour=11, tm_min=46, tm_sec=26, tm_wday=0, tm_yday=30, tm_isdst=-1)),
             ('comment',
              'bitcoin logo, it is clear from both the page title and infobox name that this is "bitcoin", a bit like having a picture of a dog with the word "dog" next to it :) feel free to revert.')])

In [56]:
def _by_timestamp(revision):
    """Sort key: the revision's ``timestamp`` field."""
    return revision["timestamp"]


# Reorder the revisions by ascending timestamp.
revs = sorted(revs, key=_by_timestamp)

In [61]:
# Position in the timestamp-sorted list of the first revision to keep;
# everything before it is discarded.
# NOTE(review): hard-coded position -- presumably chosen to line up with the
# analysis start date used further down; confirm before reusing.
# NOTE: not idempotent -- re-running this cell without re-running the sort
# cell slices the already-trimmed list a second time.
FIRST_KEPT_INDEX = 16343
revs = revs[FIRST_KEPT_INDEX:]

In [62]:
# Pin the checkpoint and revision that the un-parameterised call resolved to
# (as reported by the library's own log message), so scores stay reproducible
# if the library's default sentiment model ever changes.
SENTIMENT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
SENTIMENT_MODEL_REVISION = "af0f99b"

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_MODEL_REVISION,
)


def find_sentiment(text, max_chars=250):
    """Return a signed sentiment score for ``text``.

    Args:
        text: String to classify. Only its first ``max_chars`` characters are
            passed to the pipeline.
        max_chars: Truncation length applied to ``text`` (default 250).

    Returns:
        The pipeline's ``score`` for the text, negated when the predicted
        label is ``'NEGATIVE'`` and returned unchanged otherwise.
    """
    # The pipeline takes a batch and returns one result dict per input.
    sent = sentiment_pipeline([text[:max_chars]])[0]
    score = sent['score']
    if sent['label'] == 'NEGATIVE':
        score *= -1
    return score


No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceCla

In [63]:
# Per-day accumulator: "YYYY-MM-DD" -> {"sentiments": [...], "edit_count": n}.
edits = {}

for rev in revs:
    day = time.strftime("%Y-%m-%d", rev["timestamp"])

    # Create the day's bucket on first sight, otherwise reuse the existing one.
    day_stats = edits.setdefault(day, {"sentiments": [], "edit_count": 0})
    day_stats["edit_count"] += 1

    # A revision without a "comment" field is scored as an empty string.
    day_stats["sentiments"].append(find_sentiment(rev.get("comment", "")))

In [64]:
# Collapse each day's list of per-edit scores into two summary numbers:
#   "sentiment"     -- mean signed score of the day's edits
#   "neg_sentiment" -- fraction of the day's edits with a negative score
# The raw "sentiments" list is removed afterwards so every value in `edits`
# holds only scalars.
for day_stats in edits.values():
    # pop() with a default keeps this cell safe to re-run: a day that was
    # already aggregated no longer has a "sentiments" list and is skipped
    # instead of raising KeyError.
    sentiments = day_stats.pop("sentiments", None)
    if sentiments is None:
        continue

    if sentiments:
        day_stats["sentiment"] = mean(sentiments)
        day_stats["neg_sentiment"] = sum(s < 0 for s in sentiments) / len(sentiments)
    else:
        # No scores recorded for the day: fall back to neutral values.
        day_stats["sentiment"] = 0
        day_stats["neg_sentiment"] = 0

In [65]:
# Build a frame with one row per key of `edits` and one column per statistic.
edits_df = pd.DataFrame.from_dict(data=edits, orient="index")

In [66]:
# Parse the index labels into datetimes so the frame can be aligned to a
# calendar range.
parsed_index = pd.to_datetime(edits_df.index)
edits_df.index = parsed_index

In [67]:
# Daily calendar from the fixed start date up to now; reindexing onto it adds
# a zero-filled row for every day missing from the frame and drops any row
# whose date falls outside the range.
dates = pd.date_range(start='2020-12-03', end=datetime.today())
edits_df = edits_df.reindex(index=dates, fill_value=0)

In [68]:
# Smooth every column with a 30-row rolling mean.
rolling_edits = edits_df.rolling(window=30).mean()

In [69]:
# Discard every row that contains a missing value, then display the result.
rolling_edits = rolling_edits.dropna(axis=0, how="any")
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2021-01-01,0.866667,-0.179525,0.294444
2021-01-02,0.833333,-0.147559,0.261111
2021-01-03,0.833333,-0.147559,0.261111
2021-01-04,0.833333,-0.147559,0.261111
2021-01-05,0.866667,-0.150237,0.261111
...,...,...,...
2023-02-02,0.600000,-0.160175,0.245556
2023-02-03,0.600000,-0.160175,0.245556
2023-02-04,0.600000,-0.160175,0.245556
2023-02-05,0.600000,-0.160175,0.245556


In [70]:
# Write the smoothed daily statistics to a CSV file in the working directory.
rolling_edits.to_csv(path_or_buf='wikipedia_edits.csv')