# Using machine learning to predict the Bitcoin price

## Step 1
Downloading the edit history of the Wikipedia "Bitcoin" page

In [9]:
import mwclient
import time

# Which wiki to talk to and which article's history to analyse.
WIKI_HOST = "en.wikipedia.org"
PAGE_TITLE = "Bitcoin"

# Open a client for the wiki and look up the article by title.
site = mwclient.Site(WIKI_HOST)
page = site.pages[PAGE_TITLE]

In [10]:
# Pull the page's complete revision history into memory.
# This walks every revision of the article, so expect it to take a while.
revs = [revision for revision in page.revisions()]

In [11]:
# Peek at the first revision as returned; in the saved output this is the
# most recent edit (2024), i.e. the list arrives newest-first.
revs[0]

OrderedDict([('revid', 1225814205),
             ('parentid', 1225513035),
             ('minor', ''),
             ('user', 'Dogman15'),
             ('timestamp',
              time.struct_time(tm_year=2024, tm_mon=5, tm_mday=26, tm_hour=21, tm_min=38, tm_sec=53, tm_wday=6, tm_yday=147, tm_isdst=-1)),
             ('comment',
              "/* Use for payments */ finishing list with the word 'and'")])

In [12]:
# Put the revisions in chronological order (oldest edit first).
# Work on a fresh list so the sort does not touch the downloaded one.
revs = list(revs)
revs.sort(key=lambda revision: revision["timestamp"])

In [13]:
# After sorting, the first element should be the oldest revision; the saved
# output shows the page-creation edit (parentid 0, 'creation (stub)').
revs[0]

OrderedDict([('revid', 275832581),
             ('parentid', 0),
             ('user', 'Pratyeka'),
             ('timestamp',
              time.struct_time(tm_year=2009, tm_mon=3, tm_mday=8, tm_hour=16, tm_min=41, tm_sec=7, tm_wday=6, tm_yday=67, tm_isdst=-1)),
             ('comment', 'creation (stub)')])

## Step 2
Scoring the sentiment of each edit comment and aggregating the scores per day

In [14]:
from transformers import pipeline
# Pin the checkpoint and revision explicitly instead of relying on the
# library default. These are the exact values the default resolved to in the
# recorded run, so scores stay reproducible if the default ever changes and
# the "No model was supplied" warning goes away.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

def find_sentiment(text):
    """Return a signed sentiment score for ``text``.

    Only the first 250 characters are sent to the module-level
    ``sentiment_pipeline``. The pipeline's ``score`` is returned as-is,
    except that it is negated when the predicted label is ``"NEGATIVE"``,
    so negative text yields a negative number.
    """
    snippet = text[:250]
    prediction = sentiment_pipeline([snippet])[0]
    is_negative = prediction["label"] == "NEGATIVE"
    return -prediction["score"] if is_negative else prediction["score"]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [15]:
# Sanity check: clearly negative text should produce a score close to -1.
find_sentiment("I hate you.")

-0.9992952346801758

In [18]:
# Aggregate the revisions per calendar day: count every edit and collect one
# sentiment score for each edit that actually has a comment.
edits = {}

for rev in revs:
    # rev["timestamp"] is a time.struct_time; format it as the per-day key.
    date = time.strftime("%Y-%m-%d", rev["timestamp"])

    if date not in edits:
        edits[date] = dict(sentiments=list(), edit_count=0)

    day = edits[date]
    day["edit_count"] += 1

    # A revision may have no "comment" key at all, or an empty/blank summary.
    # Blank text has no sentiment to measure, and scoring it would add an
    # arbitrary model output to the day's average, so only real comments are
    # scored. Days without any scored comment are handled by the summary step.
    comment = rev.get("comment")
    if comment and comment.strip():
        day["sentiments"].append(find_sentiment(comment))

In [20]:
from statistics import mean

# Collapse each day's list of scores into two summary numbers:
#   sentiment     - mean signed score of the day's comments
#   neg_sentiment - fraction of the day's comments with a negative score
# Days with no scored comments get 0 for both.
for day_stats in edits.values():
    # The raw list is removed once summarised. Skipping days that no longer
    # have it lets this cell be re-run without raising KeyError.
    if "sentiments" not in day_stats:
        continue

    scores = day_stats.pop("sentiments")
    if scores:
        day_stats["sentiment"] = mean(scores)
        day_stats["neg_sentiment"] = sum(1 for s in scores if s < 0) / len(scores)
    else:
        day_stats["sentiment"] = 0
        day_stats["neg_sentiment"] = 0

In [21]:
# Inspect the per-day summary (edit_count, sentiment, neg_sentiment).
# NOTE(review): this echoes the entire dict; consider previewing only a few
# entries to keep the saved output small.
edits

{'2009-03-08': {'edit_count': 4,
  'sentiment': -0.5505250245332718,
  'neg_sentiment': 0.75},
 '2009-08-05': {'edit_count': 1,
  'sentiment': 0.7481210231781006,
  'neg_sentiment': 0.0},
 '2009-08-06': {'edit_count': 2,
  'sentiment': 0.995745837688446,
  'neg_sentiment': 0.0},
 '2009-08-14': {'edit_count': 1,
  'sentiment': 0.9300212860107422,
  'neg_sentiment': 0.0},
 '2009-10-13': {'edit_count': 2,
  'sentiment': -0.22749903798103333,
  'neg_sentiment': 0.5},
 '2009-11-18': {'edit_count': 1,
  'sentiment': 0.8839512467384338,
  'neg_sentiment': 0.0},
 '2009-12-08': {'edit_count': 1,
  'sentiment': -0.9869275689125061,
  'neg_sentiment': 1.0},
 '2009-12-17': {'edit_count': 1,
  'sentiment': -0.9975171089172363,
  'neg_sentiment': 1.0},
 '2010-02-23': {'edit_count': 1,
  'sentiment': -0.9994946718215942,
  'neg_sentiment': 1.0},
 '2010-03-18': {'edit_count': 1,
  'sentiment': 0.8758782148361206,
  'neg_sentiment': 0.0},
 '2010-04-13': {'edit_count': 4,
  'sentiment': 0.84435622394084

## Step 3 
Converting the sentiment data into a pandas DataFrame, filling in days without edits, and smoothing with a 30-day rolling mean

In [22]:
import pandas as pd

# One row per date key of `edits`; the inner dict keys (edit_count, sentiment,
# neg_sentiment) become the columns.
edits_df = pd.DataFrame.from_dict(edits, orient="index")

In [23]:
# Preview the frame; at this point only days that had at least one edit exist.
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.750000
2009-08-05,1,0.748121,0.000000
2009-08-06,2,0.995746,0.000000
2009-08-14,1,0.930021,0.000000
2009-10-13,2,-0.227499,0.500000
...,...,...,...
2024-05-16,1,-0.885971,1.000000
2024-05-21,4,-0.492858,0.750000
2024-05-22,9,-0.167402,0.555556
2024-05-24,1,-0.998527,1.000000


In [24]:
# The index still holds "YYYY-MM-DD" strings; convert it to datetimes so the
# frame can be aligned against a daily date range below.
edits_df.index = pd.to_datetime(edits_df.index)

In [25]:
from datetime import datetime

# Continuous daily calendar from the first day with an edit through today.
# The start is taken from the data rather than hard-coded, so the range stays
# correct if the revision history (or the page being analysed) changes.
dates = pd.date_range(start=edits_df.index.min(), end=datetime.today())

In [26]:
# Inspect the generated daily range (one entry per calendar day).
dates

DatetimeIndex(['2009-03-08', '2009-03-09', '2009-03-10', '2009-03-11',
               '2009-03-12', '2009-03-13', '2009-03-14', '2009-03-15',
               '2009-03-16', '2009-03-17',
               ...
               '2024-05-21', '2024-05-22', '2024-05-23', '2024-05-24',
               '2024-05-25', '2024-05-26', '2024-05-27', '2024-05-28',
               '2024-05-29', '2024-05-30'],
              dtype='datetime64[ns]', length=5563, freq='D')

In [27]:
# Expand to one row per calendar day. Days without any edit are filled with 0
# in every column (edit_count, sentiment and neg_sentiment alike).
edits_df = edits_df.reindex(dates, fill_value=0)

In [28]:
# Preview the reindexed frame; days without edits now show up as all-zero rows.
edits_df

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,4,-0.550525,0.75
2009-03-09,0,0.000000,0.00
2009-03-10,0,0.000000,0.00
2009-03-11,0,0.000000,0.00
2009-03-12,0,0.000000,0.00
...,...,...,...
2024-05-26,1,-0.995705,1.00
2024-05-27,0,0.000000,0.00
2024-05-28,0,0.000000,0.00
2024-05-29,0,0.000000,0.00


In [29]:
# Smooth the daily series with a trailing mean over a fixed number of rows
# (one row per day after reindexing).
ROLLING_WINDOW_DAYS = 30
rolling_edits = edits_df.rolling(window=ROLLING_WINDOW_DAYS).mean()

In [30]:
# Preview the smoothed series; the leading rows are NaN until a full 30-row
# window is available.
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-03-08,,,
2009-03-09,,,
2009-03-10,,,
2009-03-11,,,
2009-03-12,,,
...,...,...,...
2024-05-26,1.000000,-0.156696,0.243519
2024-05-27,0.900000,-0.145586,0.221296
2024-05-28,0.900000,-0.145586,0.221296
2024-05-29,0.900000,-0.145586,0.221296


In [31]:
# Drop the leading rows whose rolling window was incomplete (all-NaN rows).
rolling_edits = rolling_edits.dropna()

In [32]:
# Confirm the NaN rows are gone; the series now starts at the first day with a
# complete window.
rolling_edits

Unnamed: 0,edit_count,sentiment,neg_sentiment
2009-04-06,0.133333,-0.018351,0.025000
2009-04-07,0.000000,0.000000,0.000000
2009-04-08,0.000000,0.000000,0.000000
2009-04-09,0.000000,0.000000,0.000000
2009-04-10,0.000000,0.000000,0.000000
...,...,...,...
2024-05-26,1.000000,-0.156696,0.243519
2024-05-27,0.900000,-0.145586,0.221296
2024-05-28,0.900000,-0.145586,0.221296
2024-05-29,0.900000,-0.145586,0.221296


In [33]:
import os

# Persist the smoothed per-day features for the price-prediction step.
# to_csv does not create missing folders, so make sure "data/" exists first;
# otherwise the export fails at the very end of a long run.
os.makedirs("data", exist_ok=True)
rolling_edits.to_csv("data/wikipedia_edits.csv")

## Step 4
Downloading Bitcoin price data.
This step is continued in the `predictionlive` file.