In [1]:
pip install mwclient


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Imports mwclient and time modules.
# Initializes a connection to the English Wikipedia and accesses the 'Bitcoin' page.

import mwclient
import time

site = mwclient.Site('en.wikipedia.org')
page = site.pages['Bitcoin']

In [3]:
# Retrieving Page Revisions

revs = list(page.revisions())

In [4]:
# Accessing the Most Recent Revision

revs[0]

OrderedDict([('revid', 1185761286),
             ('parentid', 1185641913),
             ('user', 'A455bcd9'),
             ('timestamp',
              time.struct_time(tm_year=2023, tm_mon=11, tm_mday=18, tm_hour=21, tm_min=31, tm_sec=37, tm_wday=5, tm_yday=322, tm_isdst=-1)),
             ('comment', '0.1% => 0.2% based on a better RS')])

In [5]:
# Sorting Revisions by Timestamp

revs = sorted(revs, key=lambda rev: rev["timestamp"]) 

In [6]:
# Sorting Revisions by Timestamp

revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [7]:

pip install --upgrade transformers


Note: you may need to restart the kernel to use updated packages.


In [8]:
# Setting Up Sentiment Analysis

from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Function to Find Sentiment of Text

def find_sentiment(text):
    sent = sentiment_pipeline([text[:250]])[0]
    score = sent["score"]
    if sent["label"] == "NEGATIVE":
        score *= -1
    return score

  from tensorflow.tsl.python.lib.core import pywrap_ml_dtypes
  import cgi
  from urllib3.contrib.pyopenssl import orig_util_SSLContext as SSLContext
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [11]:
# Analyzing and Storing Edit Information

edits = {}

for rev in revs:        
    date = time.strftime("%Y-%m-%d", rev["timestamp"])
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)
    
    edits[date]["edit_count"] += 1
    
    comment = rev.get("comment", "")
    edits[date]["sentiments"].append(find_sentiment(comment))

In [12]:
# Calculating Average Sentiment and Negative Sentiment Proportion

from statistics import mean

for key in edits:
    if len(edits[key]["sentiments"]) > 0:
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0]) / len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0
    
    del edits[key]["sentiments"]

In [13]:
# Creating a DataFrame from Edit Data

import pandas as pd

edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [14]:
edits_df


Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227500,0.500000
...,...,...,...
2023-11-04,4,0.839732,0.000000
2023-11-05,1,-0.813071,1.000000
2023-11-08,1,-0.994873,1.000000
2023-11-17,3,0.203373,0.333333


In [15]:
# Converting Index to DateTime

edits_df.index = pd.to_datetime(edits_df.index)

In [16]:
# Creating a Range of Dates

from datetime import datetime

dates = pd.date_range(start="2009-03-08",end=datetime.today())

In [17]:
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2023-11-10', '2023-11-11', '2023-11-12', '2023-11-13',
               '2023-11-14', '2023-11-15', '2023-11-16', '2023-11-17',
               '2023-11-18', '2023-11-19'],
              dtype='datetime64[ns]', length=5370, freq='D')

In [18]:
# Reindexing the DataFrame

edits_df = edits_df.reindex(dates, fill_value=0)

In [19]:
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-03-09,0,0.000000,0.000000
2009-03-10,0,0.000000,0.000000
2009-03-11,0,0.000000,0.000000
2009-03-12,0,0.000000,0.000000
...,...,...,...
2023-11-15,0,0.000000,0.000000
2023-11-16,0,0.000000,0.000000
2023-11-17,3,0.203373,0.333333
2023-11-18,2,-0.996377,1.000000


In [20]:
# Calculating Rolling Averages

rolling_edits = edits_df.rolling(30, min_periods=30).mean()

In [21]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2023-11-15,0.466667,-0.031475,0.150000
2023-11-16,0.466667,-0.031475,0.150000
2023-11-17,0.566667,-0.024696,0.161111
2023-11-18,0.633333,-0.057909,0.194444


In [22]:
# Dropping NA Value 

rolling_edits = rolling_edits.dropna()

In [23]:
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2023-11-15,0.466667,-0.031475,0.150000
2023-11-16,0.466667,-0.031475,0.150000
2023-11-17,0.566667,-0.024696,0.161111
2023-11-18,0.633333,-0.057909,0.194444


In [24]:
# Exporting Data

rolling_edits.to_csv("wikipedia_edits.csv")