In [26]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from langdetect import detect
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import nltk
from nltk.corpus import stopwords
from transformers import BertTokenizer, BertForSequenceClassification
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [27]:
# Import CoinDesk news data
coindesk_df = pd.read_csv("data/CoinDesk News.csv", encoding='unicode_escape') 
coindesk_df.drop("Unnamed: 0", axis=1, inplace=True)
coindesk_df["Date"] = pd.to_datetime(coindesk_df["Date"])
coindesk_df = coindesk_df.sort_values("Date")
coindesk_df = coindesk_df.reset_index().drop("index", axis=1)
coindesk_df.head()

  coindesk_df["Date"] = pd.to_datetime(coindesk_df["Date"])


Unnamed: 0,Title,Date,Content,Page
0,Exchange Tokens Hit New All-time Highs as Stoc...,2021-01-02,"Some retail equities traders, frustrated with ...",1312
1,'A Good Thing': Elon Musk Says He's a Supporte...,2021-01-02,Tesla and SpaceX CEO Elon Musk has clarified t...,1313
2,FinCEN's Wallet Rule Aims to Close Crypto-Cash...,2021-01-02,Financial institutions report large cash and c...,1312
3,US Man Pleads Guilty to Money Laundering Charg...,2021-01-02,A California man has entered into a plea agree...,1312
4,Mark Cuban Hails 'Store of Value Generation' T...,2021-01-02,"Mark Cuban, the billionaire owner of the Natio...",1312


# FinBert Sentiment

In [28]:
# Text pre-processing
nlp = spacy.load('en_core_web_sm')
stop = stopwords.words('english')

coindesk_df['Cleaned title'] = coindesk_df['Title'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
coindesk_df['Cleaned title'] = coindesk_df['Cleaned title'].str.replace('[^\w\s]','')
coindesk_df['Cleaned title'] = coindesk_df['Cleaned title'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

coindesk_df['Cleaned content'] = coindesk_df['Content'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))
coindesk_df['Cleaned content'] = coindesk_df['Cleaned content'].str.replace('[^\w\s]','')
coindesk_df['Cleaned content'] = coindesk_df['Cleaned content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

def space(comment):
    doc = nlp(comment)
    return " ".join([token.lemma_ for token in doc])

coindesk_df['Cleaned title'] = coindesk_df['Cleaned title'].apply(space)
coindesk_df['Cleaned content'] = coindesk_df['Cleaned content'].apply(space)
coindesk_df.head()

  coindesk_df['Cleaned title'] = coindesk_df['Cleaned title'].str.replace('[^\w\s]','')
  coindesk_df['Cleaned content'] = coindesk_df['Cleaned content'].str.replace('[^\w\s]','')


Unnamed: 0,Title,Date,Content,Page,Cleaned title,Cleaned content
0,Exchange Tokens Hit New All-time Highs as Stoc...,2021-01-02,"Some retail equities traders, frustrated with ...",1312,exchange tokens hit new alltime high stock tra...,retail equity trader frustrate recent restrict...
1,'A Good Thing': Elon Musk Says He's a Supporte...,2021-01-02,Tesla and SpaceX CEO Elon Musk has clarified t...,1313,good thing elon musk say he s supporter bitcoin,tesla spacex ceo elon musk clarify bitcoin sup...
2,FinCEN's Wallet Rule Aims to Close Crypto-Cash...,2021-01-02,Financial institutions report large cash and c...,1312,fincens wallet rule aim close cryptocash repor...,financial institution report large cash crypto...
3,US Man Pleads Guilty to Money Laundering Charg...,2021-01-02,A California man has entered into a plea agree...,1312,us man plead guilty money laundering charge in...,california man enter plea agreement we authori...
4,Mark Cuban Hails 'Store of Value Generation' T...,2021-01-02,"Mark Cuban, the billionaire owner of the Natio...",1312,mark cuban hail store value generation take wa...,mark cuban billionaire owner national basketba...


In [29]:
# Initialize FinBert
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

labels = {0:'neutral', 1:'positive',2:'negative'}

In [30]:
# Conduct FinBert sentiment analysis on news headlines
title_sent_val = list()
for headline in coindesk_df['Cleaned title'].to_list():
    try:
        inputs = tokenizer(headline, return_tensors="pt", padding=True)
        outputs = finbert(**inputs)[0]

        val = labels[np.argmax(outputs.detach().numpy())]   
        title_sent_val.append(val)
    except:
        title_sent_val.append(None)
        
coindesk_df['Title sentiment'] = title_sent_val

In [31]:
# Conduct FinBert sentiment analysis on news content
content_sent_val = list()
for content in coindesk_df['Cleaned content'].to_list():
    try:
        inputs = tokenizer(content[:512], return_tensors="pt", padding=True)
        outputs = finbert(**inputs)[0]

        val = labels[np.argmax(outputs.detach().numpy())]   
        content_sent_val.append(val)
    except:
        content_sent_val.append(None)
           
coindesk_df['Content sentiment'] = content_sent_val

In [32]:
# One-hot encoding of FinBert sentiment labels on news sentiment
coindesk_df['Pos title count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Title sentiment']=="positive"))
coindesk_df['Neu title count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Title sentiment']=="neutral"))
coindesk_df['Neg title count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Title sentiment']=="negative"))

coindesk_df['Pos content count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Content sentiment']=="positive"))
coindesk_df['Neu content count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Content sentiment']=="neutral"))
coindesk_df['Neg content count'] = list(map(lambda x: 1 if x else 0, coindesk_df['Content sentiment']=="negative"))

coindesk_df.head()

Unnamed: 0,Title,Date,Content,Page,Cleaned title,Cleaned content,Title sentiment,Content sentiment,Pos title count,Neu title count,Neg title count,Pos content count,Neu content count,Neg content count
0,Exchange Tokens Hit New All-time Highs as Stoc...,2021-01-02,"Some retail equities traders, frustrated with ...",1312,exchange tokens hit new alltime high stock tra...,retail equity trader frustrate recent restrict...,neutral,positive,0,1,0,1,0,0
1,'A Good Thing': Elon Musk Says He's a Supporte...,2021-01-02,Tesla and SpaceX CEO Elon Musk has clarified t...,1313,good thing elon musk say he s supporter bitcoin,tesla spacex ceo elon musk clarify bitcoin sup...,neutral,positive,0,1,0,1,0,0
2,FinCEN's Wallet Rule Aims to Close Crypto-Cash...,2021-01-02,Financial institutions report large cash and c...,1312,fincens wallet rule aim close cryptocash repor...,financial institution report large cash crypto...,neutral,neutral,0,1,0,0,1,0
3,US Man Pleads Guilty to Money Laundering Charg...,2021-01-02,A California man has entered into a plea agree...,1312,us man plead guilty money laundering charge in...,california man enter plea agreement we authori...,neutral,neutral,0,1,0,0,1,0
4,Mark Cuban Hails 'Store of Value Generation' T...,2021-01-02,"Mark Cuban, the billionaire owner of the Natio...",1312,mark cuban hail store value generation take wa...,mark cuban billionaire owner national basketba...,neutral,neutral,0,1,0,0,1,0


In [33]:
# Aggregate tweets daily
coindesk_summary_df = coindesk_df[~(coindesk_df['Content sentiment'].isna())].groupby("Date").sum()

total_title_sent = coindesk_summary_df["Pos title count"] + coindesk_summary_df["Neu title count"] + coindesk_summary_df["Neg title count"]
coindesk_summary_df["Title proportion pos"] = coindesk_summary_df["Pos title count"] / total_title_sent
coindesk_summary_df["Title proportion neu"] = coindesk_summary_df["Neu title count"] / total_title_sent
coindesk_summary_df["Title proportion neg"] = coindesk_summary_df["Neg title count"] / total_title_sent

total_content_sent = coindesk_summary_df["Pos content count"] + coindesk_summary_df["Neu content count"] + coindesk_summary_df["Neg content count"]
coindesk_summary_df["Content proportion pos"] = coindesk_summary_df["Pos content count"] / total_content_sent
coindesk_summary_df["Content proportion neu"] = coindesk_summary_df["Neu content count"] / total_content_sent
coindesk_summary_df["Content proportion neg"] = coindesk_summary_df["Neg content count"] / total_content_sent

coindesk_summary_df.head()

  coindesk_summary_df = coindesk_df[~(coindesk_df['Content sentiment'].isna())].groupby("Date").sum()


Unnamed: 0_level_0,Page,Pos title count,Neu title count,Neg title count,Pos content count,Neu content count,Neg content count,Title proportion pos,Title proportion neu,Title proportion neg,Content proportion pos,Content proportion neu,Content proportion neg
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-01-02,13122,0,10,0,3,7,0,0.0,1.0,0.0,0.3,0.7,0.0
2021-01-03,25132,2,15,3,4,13,3,0.1,0.75,0.15,0.2,0.65,0.15
2021-01-04,22700,2,15,2,2,16,1,0.105263,0.789474,0.105263,0.105263,0.842105,0.052632
2021-01-05,3408,0,2,1,0,2,1,0.0,0.666667,0.333333,0.0,0.666667,0.333333
2021-01-06,5380,1,4,0,0,5,0,0.2,0.8,0.0,0.0,1.0,0.0


In [35]:
coindesk_summary_df.to_csv("CoinDesk Sentiment.csv")