In [29]:
import mwclient
import time
import pickle 
from transformers import pipeline
from statistics import mean
import pandas as pd

# How to improve
Improve datascraping for sentiment analysis:
- look at news articles, tweets, google trends 

Find other correlated coin (e.g. etheruem or other crypto)

Add in economic indicators (e.g inflation, bond yield, debt level, FX, USD, AUD)

In [30]:
## Load data
# site = mwclient.Site("en.wikipedia.org")
# page = site.pages["Bitcoin"]
# revs = list(page.revisions())

# Save the revs data so we don't have to scrape the web
# Serialize and save to file
#with open('revs.pk1', 'wb') as file:
#    pickle.dump(revs, file

In [31]:
# Load revs data from file
with open('revs.pk1','rb') as file:
    revs = pickle.load(file)

revs[0]

OrderedDict([('revid', 1193028719),
             ('parentid', 1192928959),
             ('user', 'Ravenpuff'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=1, tm_mday=1, tm_hour=17, tm_min=51, tm_sec=34, tm_wday=0, tm_yday=1, tm_isdst=-1)),
             ('comment', '/* top */ use {{as of}} in infobox')])

In [32]:
revs = sorted(revs, key=lambda rev: rev["timestamp"])
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

In [37]:
sentiment_classifier = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

def find_sentiment(text): 
    '''Calculates the sentiment estimation of the text'''

    # Run sentiment pipeline neural newtork on the input text (up to 250 characters)
    sent = sentiment_classifier([text[:250]])[0]
    score = sent["score"]

    # ... if it is a negative sentiment, make score negative
    if sent["label"] == "NEGATIVE":
        score *= -1 
    
    return score

In [44]:
edits = {}

# Loop through every update and extract information
for i, rev in enumerate(revs):
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    # If this is the first time seeing the date, add a new dictionary
    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)

    # Sum the number of edits per day
    edits[date]["edit_count"] += 1

    # Calculate the sentiment of the edit and append to edits['sentiments'] list
    try:
        comment = rev["comment"]
        edits[date]["sentiments"].append(find_sentiment(comment))
    except KeyError:
        edits[date]["sentiments"] = []

    if i % 100 == 0:
        percent_complete = round(i/len(revs)*100,2)
        print(f'{percent_complete}% Complete', end='\r' flush=True)




0.0% Complete
0.56% Complete
1.12% Complete
1.69% Complete
2.25% Complete
2.81% Complete
3.37% Complete
3.94% Complete
4.5% Complete
5.06% Complete
5.62% Complete
6.19% Complete
6.75% Complete
7.31% Complete
7.87% Complete
8.44% Complete
9.0% Complete
9.56% Complete
10.12% Complete
10.69% Complete
11.25% Complete
11.81% Complete
12.37% Complete
12.94% Complete
13.5% Complete
14.06% Complete
14.62% Complete
15.19% Complete
15.75% Complete
16.31% Complete
16.87% Complete
17.44% Complete
18.0% Complete
18.56% Complete
19.12% Complete
19.69% Complete
20.25% Complete
20.81% Complete
21.37% Complete
21.94% Complete
22.5% Complete
23.06% Complete
23.62% Complete
24.19% Complete
24.75% Complete
25.31% Complete
25.87% Complete
26.44% Complete
27.0% Complete
27.56% Complete
28.12% Complete
28.69% Complete
29.25% Complete
29.81% Complete
30.37% Complete
30.94% Complete
31.5% Complete
32.06% Complete
32.62% Complete
33.19% Complete
33.75% Complete
34.31% Complete
34.87% Complete
35.44% Complete
36

KeyError: 'comment'

In [None]:
# Tidy up the edits dictionary    
for key in edits: 

    # Change the list of sentiment into an average rating and % negative votes, otherwise return 0
    if len(edits[key]["sentiments"]) > 0: 
        edits[key]["sentiment"] = mean(edits[key]["sentiments"])
        edits[key]["neg_sentiment"] = len([s for s in edits[key]["sentiments"] if s < 0])/len(edits[key]["sentiments"])
    else:
        edits[key]["sentiment"] = 0
        edits[key]["neg_sentiment"] = 0

    del edits[key]["sentiments"]

1

In [None]:
# Save the above so that we don't have to run it again (takes ~10 min)
with open('edits_dict.pk1', 'wb') as file:
    pickle.dump(edits, file)

In [None]:
# Load results into dataframe
edits_df = pd.Datarfame.from_dict(edits, orient="index")
edits_df.index = pd.to_datetime(edits_df.index)

# We need a dataframe that has no missing days
from datetime import datetime
dates = pd.date_range(start="2009-03-08", end=datetime.today())

# Merge complete date df with edits df
edits_df = edits_df.reindex(dates, fill_value=0)

# Create a rolling average for the past month
rolling_edits = edits_df.rolling(30).mean()
rolling_edits = rolling_edits.dropna()
rolling_edits.to_csv("wikipedia_edits.csv")
