In [56]:
# Import libraries
import nltk
import numpy as np
import pandas as pd
import torch
from langdetect import detect
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
from PIL import Image
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
from transformers import BertTokenizer, BertForSequenceClassification
from wordcloud import WordCloud, STOPWORDS
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [57]:
# Import tweets data
# Load the raw tweet export, reduce each timestamp to its first 10 characters
# (the YYYY-MM-DD part shown in the output) and order the rows by that date so
# later cells can aggregate per day.
btc_tweets_df = pd.read_csv("data/Bitcoin Tweets.csv")
# Reverse the file order; drop=True discards the old index instead of
# materialising it as an "index" column that then has to be removed.
btc_tweets_df = btc_tweets_df[::-1].reset_index(drop=True)
btc_tweets_df['date'] = pd.to_datetime(btc_tweets_df['date'].astype(str).str[:10])
# Use a stable sort so rows sharing a date keep the reversed file order from
# the step above; the default sort algorithm does not guarantee that.
btc_tweets_df = btc_tweets_df.sort_values("date", kind="mergesort").reset_index(drop=True)
btc_tweets_df.head()

Unnamed: 0,date,id,content,username,followers count,verified user,retweet count,like count,quote count
0,2021-01-01,1.3448e+18,"The price of\nSpaghetti alla Carbonara\nis 37,...",BitcoinBellyB,742,False,0,0,0
1,2021-01-01,1.34506e+18,BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin to 31...,trader_blitz,7607,False,0,0,0
2,2021-01-01,1.34506e+18,@CryptoHour @danheld I programmed my alarm to ...,westonnelson,18885,False,0,5,0
3,2021-01-01,1.34506e+18,Everybody talking about #altseason ... but guy...,CryptoAsAWay,1356,False,0,2,1
4,2021-01-01,1.34506e+18,I'm inviting you to start investing in crypto ...,TimothyShreck,41,False,0,0,0


# VADER Sentiment

In [58]:
# Conduct VADER sentiment analysis on tweets
# The analyzer is created once and reused: building a new
# SentimentIntensityAnalyzer for every tweet repeats its set-up work on each
# iteration without changing any score.
vader_analyzer = SentimentIntensityAnalyzer()
vader_scores = [vader_analyzer.polarity_scores(t) for t in btc_tweets_df['content']]

# One list per component of the polarity_scores() result, in tweet order.
neg_scores = [score['neg'] for score in vader_scores]
neu_scores = [score['neu'] for score in vader_scores]
pos_scores = [score['pos'] for score in vader_scores]
comp_scores = [score['compound'] for score in vader_scores]

btc_tweets_df['VADER Neg'] = neg_scores
btc_tweets_df['VADER Neu'] = neu_scores
btc_tweets_df['VADER Pos'] = pos_scores
btc_tweets_df['VADER Comp'] = comp_scores

btc_tweets_df.head()

Unnamed: 0,date,id,content,username,followers count,verified user,retweet count,like count,quote count,VADER Neg,VADER Neu,VADER Pos,VADER Comp
0,2021-01-01,1.3448e+18,"The price of\nSpaghetti alla Carbonara\nis 37,...",BitcoinBellyB,742,False,0,0,0,0.0,1.0,0.0,0.0
1,2021-01-01,1.34506e+18,BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin to 31...,trader_blitz,7607,False,0,0,0,0.0,1.0,0.0,0.0
2,2021-01-01,1.34506e+18,@CryptoHour @danheld I programmed my alarm to ...,westonnelson,18885,False,0,5,0,0.074,0.864,0.062,-0.1027
3,2021-01-01,1.34506e+18,Everybody talking about #altseason ... but guy...,CryptoAsAWay,1356,False,0,2,1,0.0,1.0,0.0,0.0
4,2021-01-01,1.34506e+18,I'm inviting you to start investing in crypto ...,TimothyShreck,41,False,0,0,0,0.0,0.847,0.153,0.6808


In [59]:
# Score sentiments based on follower count, like count and retweet count
# Each tweet's compound score is scaled by
#     (followers + 1) + (likes + 1) * (retweets + 1)
# The arithmetic is done on whole columns instead of one .iloc row at a time.
# int64 is requested explicitly so the likes * retweets product cannot wrap
# around on platforms where the default integer type is 32-bit.
followers = btc_tweets_df["followers count"].astype("int64")
likes = btc_tweets_df["like count"].astype("int64")
retweets = btc_tweets_df["retweet count"].astype("int64")
engagement_weight = (followers + 1) + (likes + 1) * (retweets + 1)

# Kept as a plain list, as before, in case other cells read sentiment_scores.
sentiment_scores = (btc_tweets_df["VADER Comp"] * engagement_weight).tolist()
btc_tweets_df['VADER Score'] = sentiment_scores
btc_tweets_df.head()

Unnamed: 0,date,id,content,username,followers count,verified user,retweet count,like count,quote count,VADER Neg,VADER Neu,VADER Pos,VADER Comp,VADER Score
0,2021-01-01,1.3448e+18,"The price of\nSpaghetti alla Carbonara\nis 37,...",BitcoinBellyB,742,False,0,0,0,0.0,1.0,0.0,0.0,0.0
1,2021-01-01,1.34506e+18,BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin to 31...,trader_blitz,7607,False,0,0,0,0.0,1.0,0.0,0.0,0.0
2,2021-01-01,1.34506e+18,@CryptoHour @danheld I programmed my alarm to ...,westonnelson,18885,False,0,5,0,0.074,0.864,0.062,-0.1027,-1940.2084
3,2021-01-01,1.34506e+18,Everybody talking about #altseason ... but guy...,CryptoAsAWay,1356,False,0,2,1,0.0,1.0,0.0,0.0,0.0
4,2021-01-01,1.34506e+18,I'm inviting you to start investing in crypto ...,TimothyShreck,41,False,0,0,0,0.0,0.847,0.153,0.6808,29.2744


# FinBert Sentiment

In [60]:
# Initialize FinBert
# Tokenizer and 3-class sequence classifier are loaded from the same checkpoint.
FINBERT_CHECKPOINT = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert = BertForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT, num_labels=3)

# Maps the classifier's output index to a sentiment name.
# NOTE(review): presumably mirrors the checkpoint's label order - verify against the model card.
labels = dict(enumerate(('neutral', 'positive', 'negative')))

In [61]:
# Conduct FinBert sentiment analysis on tweets
# Each tweet is classified on its own; the predicted class index is translated
# to a sentiment name through `labels`.
sent_val = list()
# Inference only: no_grad() stops autograd from recording every forward pass,
# which is otherwise wasted memory and time here.
with torch.no_grad():
    for tweet in btc_tweets_df['content'].to_list():
        # Truncate over-long inputs instead of letting the model fail on them.
        # NOTE(review): 512 is the usual BERT-base input limit - confirm
        # against this checkpoint's config.
        inputs = tokenizer(tweet, return_tensors="pt", padding=True,
                           truncation=True, max_length=512)
        logits = finbert(**inputs)[0]

        # One sequence per call, so the argmax over all logits is the class index.
        sent_val.append(labels[int(logits.argmax())])

btc_tweets_df['FinBert Sentiment'] = sent_val

In [62]:
# One-hot encoding of FinBert sentiment labels on tweets
# For each sentiment name, add a 0/1 indicator column that is 1 where the
# tweet's FinBert label equals that name.
for indicator_col, sentiment_name in (
    ('FinBert Pos', "positive"),
    ('FinBert Neu', "neutral"),
    ('FinBert Neg', "negative"),
):
    btc_tweets_df[indicator_col] = (
        btc_tweets_df["FinBert Sentiment"].eq(sentiment_name).astype("int64")
    )

btc_tweets_df.head()

Unnamed: 0,date,id,content,username,followers count,verified user,retweet count,like count,quote count,VADER Neg,VADER Neu,VADER Pos,VADER Comp,VADER Score,FinBert Sentiment,FinBert Pos,FinBert Neu,FinBert Neg
0,2021-01-01,1.3448e+18,"The price of\nSpaghetti alla Carbonara\nis 37,...",BitcoinBellyB,742,False,0,0,0,0.0,1.0,0.0,0.0,0.0,neutral,0,1,0
1,2021-01-01,1.34506e+18,BTC/USD | $BTCUSD | $BTC $USD\n\nBitcoin to 31...,trader_blitz,7607,False,0,0,0,0.0,1.0,0.0,0.0,0.0,neutral,0,1,0
2,2021-01-01,1.34506e+18,@CryptoHour @danheld I programmed my alarm to ...,westonnelson,18885,False,0,5,0,0.074,0.864,0.062,-0.1027,-1940.2084,neutral,0,1,0
3,2021-01-01,1.34506e+18,Everybody talking about #altseason ... but guy...,CryptoAsAWay,1356,False,0,2,1,0.0,1.0,0.0,0.0,0.0,neutral,0,1,0
4,2021-01-01,1.34506e+18,I'm inviting you to start investing in crypto ...,TimothyShreck,41,False,0,0,0,0.0,0.847,0.153,0.6808,29.2744,neutral,0,1,0


In [63]:
# Aggregate tweets daily
# A single groupby object is shared by every per-day statistic below.
tweets_by_day = btc_tweets_df.groupby('date')

# Volume and engagement totals per day
daily_tweet_count = tweets_by_day['content'].count()
daily_like_count = tweets_by_day['like count'].sum()
daily_retweet_count = tweets_by_day['retweet count'].sum()

# Per-day means of the VADER components and of the weighted VADER score
avg_daily_vader_neg = tweets_by_day['VADER Neg'].mean()
avg_daily_vader_neu = tweets_by_day['VADER Neu'].mean()
avg_daily_vader_pos = tweets_by_day['VADER Pos'].mean()
avg_daily_vader_comp = tweets_by_day['VADER Comp'].mean()
avg_daily_vader_score = tweets_by_day['VADER Score'].mean()

# Summing the 0/1 indicator columns gives the number of tweets per FinBert label
daily_finbert_neg_count = tweets_by_day['FinBert Neg'].sum()
daily_finbert_neu_count = tweets_by_day['FinBert Neu'].sum()
daily_finbert_pos_count = tweets_by_day['FinBert Pos'].sum()

# Share of each FinBert label among that day's labelled tweets
daily_total_sent_count = daily_finbert_neg_count + daily_finbert_neu_count + daily_finbert_pos_count
daily_prop_finbert_neg = daily_finbert_neg_count.div(daily_total_sent_count)
daily_prop_finbert_neu = daily_finbert_neu_count.div(daily_total_sent_count)
daily_prop_finbert_pos = daily_finbert_pos_count.div(daily_total_sent_count)

# Column order here is the column order of the exported summary table.
summary_columns = {
    "Tweet count": daily_tweet_count,
    "Retweet count": daily_retweet_count,
    "Like count": daily_like_count,
    "Avg VADER Neg": avg_daily_vader_neg,
    "Avg VADER Neu": avg_daily_vader_neu,
    "Avg VADER Pos": avg_daily_vader_pos,
    "Avg VADER Comp": avg_daily_vader_comp,
    "Avg VADER Score": avg_daily_vader_score,
    "FinBert Neg count": daily_finbert_neg_count,
    "FinBert Neu count": daily_finbert_neu_count,
    "FinBert Pos count": daily_finbert_pos_count,
    "FinBert proportion Neg": daily_prop_finbert_neg,
    "FinBert proportion Neu": daily_prop_finbert_neu,
    "FinBert proportion Pos": daily_prop_finbert_pos,
}
btc_tweets_summary_df = pd.DataFrame(summary_columns)

btc_tweets_summary_df.head()

Unnamed: 0_level_0,Tweet count,Retweet count,Like count,Avg VADER Neg,Avg VADER Neu,Avg VADER Pos,Avg VADER Comp,Avg VADER Score,FinBert Neg count,FinBert Neu count,FinBert Pos count,FinBert proportion Neg,FinBert proportion Neu,FinBert proportion Pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-01-01,267,125,495,0.031184,0.883202,0.085622,0.214323,778.673629,8,243,16,0.029963,0.910112,0.059925
2021-01-02,501,715,5579,0.039138,0.890551,0.070329,0.146211,1894.730432,10,461,30,0.01996,0.92016,0.05988
2021-01-03,486,281,1454,0.034685,0.879,0.086323,0.208446,152.905407,6,442,38,0.012346,0.909465,0.078189
2021-01-04,365,154,1241,0.042101,0.880693,0.077214,0.17761,1048.403198,16,326,23,0.043836,0.893151,0.063014
2021-01-05,347,1032,2375,0.033476,0.876199,0.09036,0.25214,4049.782706,4,323,20,0.011527,0.930836,0.057637


In [64]:
# Export Tweets sentiment
# Writes the per-day summary table (date index included) to a CSV file in the
# current working directory.
sentiment_export_path = "Tweet Sentiments.csv"
btc_tweets_summary_df.to_csv(sentiment_export_path)