In [25]:
import warnings
warnings.filterwarnings("ignore")

In [26]:
import json
import pandas as pd
import numpy as np
#!pip install pandas_datareader to validate crypto pricing with specific date
import pandas_datareader as web
import datetime as dt
from datetime import date
import seaborn as sns
from numerize import numerize
import torch #torch first before matplotlib, otherwise the library will crash the environment. 
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

#NLP libraries
import re
# Import nltk modules and download dataset
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop = set(stopwords.words('english'))

# TextBlob sentiment (for comparison) and scikit-learn evaluation metrics
from textblob import TextBlob
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [27]:
#extract data libraries
import requests
import dateutil.parser
import unicodedata
import time
from searchtweets import load_credentials

In [28]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

print(os.getcwd())

C:\Users\eikde\source\repos\UM_Crypto_Sentiment_Prediction


In [29]:
from pprint import pprint
from transformers import AutoModelForSequenceClassification

from finbert import *
import finbert.utils as tools
from finbert.finbert import predict

%load_ext autoreload
%autoreload 2

project_dir = Path.cwd()
print(project_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
C:\Users\eikde\source\repos\UM_Crypto_Sentiment_Prediction


In [30]:
# Shared colour palette (hex codes) for the charts in this notebook.
colors = dict(
    red='#ff207c',
    grey='#42535b',
    blue='#207cff',
    orange='#ffa320',
    green='#00ec8b',
)

# Keyword presets for tick and title styling; both reuse the grey palette entry.
config_ticks = dict(size=14, color=colors['grey'], labelcolor=colors['grey'])
config_title = dict(size=18, color=colors['grey'], ha='left', va='baseline')

In [31]:
#plot chart function
def get_charts(data, title, figsize=(15, 10), bar_width=15):
    """
    Draw a two-panel figure: closing price on top, traded volume below.

    Parameters
    ----------
        data : Table-like object providing 'Date', 'Close' and 'Volume' columns.
        title : Text used as the figure's super-title.
        figsize : (width, height) of the figure in inches. Defaults to (15, 10).
        bar_width : Width of each volume bar, in x-axis data units. Defaults to 15.

    Returns
    -------
        None. The figure is created through pyplot and left for the notebook to render.
    """
    # Size this figure only. The previous plt.rc('figure', ...) call changed the
    # default size of every figure created afterwards in the kernel.
    fig, (plot_price, plot_vol) = plt.subplots(
        2, 1,
        figsize=figsize,
        gridspec_kw={'height_ratios': [3, 1]},
    )
    fig.tight_layout(pad=3)
    fig.suptitle(title, fontsize=16)

    dates = data['Date']
    close = data['Close']
    vol = data['Volume']

    # Upper panel: closing price line.
    plot_price.plot(dates, close, color=colors['blue'],
                    linewidth=2, label='Price')
    plot_price.set_ylabel('Price (in USD)', fontsize=14)
    plot_price.set_xlabel('Date', fontsize=14)

    # Lower panel: volume bars. Labels are set on the axes object explicitly
    # rather than through pyplot's implicit "current axes".
    plot_vol.bar(dates, vol, width=bar_width, color='darkgrey')
    plot_vol.set_xlabel('Date', fontsize=14)
    plot_vol.set_ylabel('Volume (in millions)', fontsize=14)

In [32]:
# Check whether a CUDA device is available to PyTorch. The predict() calls in this
# notebook are made with use_gpu=True and gpu_name='cuda:0'.
torch.cuda.is_available()

True

# Preprocessing Text

Additional filters can be added to the `preprocess_word` function if needed, so the same function can be reused to remove any unnecessary text.

In [33]:
# Helper that preprocesses a raw message into clean text for analysis
def preprocess_word(message):
    """
    Normalize a raw message into cleaned text for sentiment analysis.

    The input is converted with ``str()`` and processed in this order:
        - lowercase
        - remove URLs
        - rewrite a percent sign that directly follows a number as the word
          "percent" (e.g. "7%" becomes "7 percent")
        - remove ticker symbols (words starting with "$")
        - remove usernames (words starting with "@")
        - replace every character other than letters, digits and "." with a space
        - tokenize with NLTK and drop English stop words
        - join the remaining tokens back into one string

    Parameters
    ----------
        message : The text message to be preprocessed. Non-string values are
                  converted with ``str()`` first.

    Returns
    -------
        filtered : str
            The cleaned, stop-word-free text as a single string.
    """
    # str() accepts any object, so no exception handling is needed here. The
    # previous bare ``except`` printed ``text`` even when it was never assigned.
    text = str(message).lower()

    # Replace URLs with a space. This runs before the percent rewrite so that
    # percent-encoded URLs (e.g. ".../p1%2Fq") are removed as a whole instead
    # of being split and leaving fragments behind.
    text = re.sub(r'https?://[a-zA-Z0-9@:%._/+~#=?&;-]*', ' ', text)

    # Spell out a percent sign that follows a number: "50.52%" -> "50.52 percent".
    # A "%" that does not follow a digit is left for the character filter below.
    text = re.sub(r'(\d+(?:\.\d+)?)%', r'\1 percent', text)

    # Replace ticker symbols with a space. A ticker is any word starting with "$".
    # Note: dollar amounts such as "$60k" match the same pattern and are removed too.
    text = re.sub(r'\$[a-zA-Z0-9]*', ' ', text)

    # Replace usernames with a space. A username is any word starting with "@".
    text = re.sub(r'@[a-zA-Z0-9]*', ' ', text)

    # Replace everything that is not a letter, a digit or "." with a space.
    text = re.sub(r'[^a-zA-Z0-9.]', ' ', text)

    # Drop English stop words, then rebuild a single string from the tokens.
    kept_tokens = [token for token in word_tokenize(text) if token not in stop]
    return TreebankWordDetokenizer().detokenize(kept_tokens)

In [34]:
# Check whether the function is working. 
test_message = 'RT @google Our annual looked% at the year 50.52% in Google blogging (and beyond) http://t.co/sptHOAh8 $GOOG 33%'
print(preprocess_word(test_message))


rt annual looked year 50.52 percent google blogging beyond 33 percent


With the `predict` function, given a piece of text, we split it into a list of sentences and then predict sentiment for each sentence. The output is written into a dataframe. Predictions are represented in three different columns: 

1) `logit`: probabilities for each class

2) `prediction`: predicted label

3) `sentiment_score`: sentiment score calculated as: probability of positive - probability of negative

Below we analyze a paragraph taken out of [this](https://www.economist.com/finance-and-economics/2019/01/03/a-profit-warning-from-apple-jolts-markets) article from The Economist. For comparison purposes, we also put the sentiments predicted with TextBlob.
> Later that day Apple said it was revising down its earnings expectations in the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours trading and the decline was extended to more than 10% when the market opened. The dollar fell by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. Yields on government bonds fell as investors fled to the traditional haven in a market storm.

In [35]:
# Load the sentiment classifier stored under Models/classifier_model/finbert-sentiment
# inside the project directory. The model is configured with three output labels.
cl_path = project_dir/'Models'/'classifier_model'/'finbert-sentiment'
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)

In [36]:
#Test statement
text = "Later that day Apple said it was revising down its earnings expectations in \
the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. \
The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours \
trading and the decline was extended to more than 10% when the market opened. The dollar fell \
by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering \
some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. \
Yields on government bonds fell as investors fled to the traditional haven in a market storm."

In [37]:
result = predict(preprocess_word(text),model, use_gpu=True, gpu_name='cuda:0',batch_size=100)

07/11/2022 18:57:32 - INFO - root -   Using device: cuda:0 
07/11/2022 18:57:32 - INFO - finbert.utils -   *** Example ***
07/11/2022 18:57:32 - INFO - finbert.utils -   guid: 0
07/11/2022 18:57:32 - INFO - finbert.utils -   tokens: [CLS] later day apple said rev ##ising earnings expectations fourth quarter 2018 largely lower sales signs economic weakness china . [SEP]
07/11/2022 18:57:32 - INFO - finbert.utils -   input_ids: 101 2101 2154 6207 2056 7065 9355 16565 10908 2959 4284 2760 4321 2896 4341 5751 3171 11251 2859 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:57:32 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:57:32 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [38]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,later day apple said revising earnings expecta...,"[0.23293637, 0.73992413, 0.027139485]",negative,-0.506988
1,news rapidly infected financial markets .,"[0.053369947, 0.9249154, 0.02171469]",negative,-0.871545
2,apple share price fell around 7 percent hours ...,"[0.015234754, 0.9667646, 0.018000556]",negative,-0.95153
3,dollar fell 3.7 percent yen matter minutes ann...,"[0.10804207, 0.8827521, 0.009205873]",negative,-0.77471
4,asian stockmarkets closed january 3rd european...,"[0.016140783, 0.96877366, 0.015085568]",negative,-0.952633
5,yields government bonds fell investors fled tr...,"[0.06963585, 0.91269547, 0.017668657]",negative,-0.84306


In [39]:
# Mean of the per-sentence sentiment scores, formatted to two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is -0.82.


In [40]:
text = "STEPN is cutting its services for players in mainland China, and the move has had a major\
        impact on the app’s tokens. The move-to-earn lifestyle app announced Thursday that it would\
        cut access to users playing from mainland China to abide by local regulations."
# Preprocess once and reuse the cleaned text for both the prediction and the printout.
cleaned_text = preprocess_word(text)
result = predict(cleaned_text, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
print(cleaned_text)


07/11/2022 18:57:34 - INFO - root -   Using device: cuda:0 
07/11/2022 18:57:34 - INFO - finbert.utils -   *** Example ***
07/11/2022 18:57:34 - INFO - finbert.utils -   guid: 0
07/11/2022 18:57:34 - INFO - finbert.utils -   tokens: [CLS] step ##n cutting services players mainland china move major impact app token ##s . [SEP]
07/11/2022 18:57:34 - INFO - finbert.utils -   input_ids: 101 3357 2078 6276 2578 2867 8240 2859 2693 2350 4254 10439 19204 2015 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:57:34 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:57:34 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:57:34 - INFO - finbert.utils -   label: No

stepn cutting services players mainland china move major impact app tokens . move earn lifestyle app announced thursday would cut access users playing mainland china abide local regulations.


In [41]:
result


Unnamed: 0,sentence,logit,prediction,sentiment_score
0,stepn cutting services players mainland china ...,"[0.21925202, 0.5232107, 0.2575373]",negative,-0.303959
1,move earn lifestyle app announced thursday wou...,"[0.033123314, 0.5074638, 0.4594128]",negative,-0.47434


In [18]:
# Mean of the per-sentence sentiment scores, formatted to two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is -0.39.


In [19]:
text = "Good News For Markets As Fed’s Preferred Inflation Metric May Have Peaked. \
        Today’s PCE inflation number was relatively good news for markets suggesting inflation \
        could trend lower over the summer. The annual price change for April 2022 was 6.3%. \
        That’s down from 6.6% in March, as prices for goods rose at a slower pace than previously \
        and price increases for services rose at a broadly similar rate to recent months.\
        Stripping out food and energy, annual inflation fell back to 4.9%, a rate of growth\
        we last saw in December 2021. It’s too early to be sure, but inflation may be trending lower.\
        Of course, inflation is still well above the Federal Reserve’s 2% target,\
        but the direction is potentially a positive."
# Preprocess once and reuse the cleaned text for both the prediction and the printout.
cleaned_text = preprocess_word(text)
result = predict(cleaned_text, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
print(cleaned_text)

07/11/2022 17:52:56 - INFO - root -   Using device: cuda:0 
07/11/2022 17:52:56 - INFO - finbert.utils -   *** Example ***
07/11/2022 17:52:56 - INFO - finbert.utils -   guid: 0
07/11/2022 17:52:56 - INFO - finbert.utils -   tokens: [CLS] good news markets fed preferred inflation metric may peaked . [SEP]
07/11/2022 17:52:56 - INFO - finbert.utils -   input_ids: 101 2204 2739 6089 7349 6871 14200 12046 2089 6601 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 17:52:56 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 17:52:56 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 17:52:56 - INFO - finbert.utils -   label: None (id = 9090)
07/11/2022 17:52:5

good news markets fed preferred inflation metric may peaked . today pce inflation number relatively good news markets suggesting inflation could trend lower summer . annual price change april 2022 6.3 percent . 6.6 percent march prices goods rose slower pace previously price increases services rose broadly similar rate recent months . stripping food energy annual inflation fell back 4.9 percent rate growth last saw december 2021. early sure inflation may trending lower . course inflation still well federal reserve 2 percent target direction potentially positive.


In [20]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,good news markets fed preferred inflation metr...,"[0.42937544, 0.54434097, 0.026283588]",negative,-0.114966
1,today pce inflation number relatively good new...,"[0.8329399, 0.13839239, 0.028667703]",positive,0.694548
2,annual price change april 2022 6.3 percent .,"[0.041362982, 0.014596945, 0.94404006]",neutral,0.026766
3,6.6 percent march prices goods rose slower pac...,"[0.93361944, 0.03438829, 0.031992253]",positive,0.899231
4,stripping food energy annual inflation fell ba...,"[0.050289456, 0.9391308, 0.010579836]",negative,-0.888841
5,course inflation still well federal reserve 2 ...,"[0.6708865, 0.26270625, 0.06640722]",positive,0.40818


In [21]:
# Mean of the per-sentence sentiment scores, formatted to two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is 0.17.


# Generate Forbes finance news sentiment

In [11]:
# Read the Forbes articles and expose the sheet's "label" column as "category".
news = (
    pd.read_excel(r"Data/Forbes/Forbes_News_Jun2022.xlsx")
    .rename(columns={"label": "category"})
)

In [18]:
# Analysis window handed to filter_news_by_date; its comparisons keep rows equal to
# either endpoint (>= start and <= end).
start = dt.datetime(2021,10,1)
end = dt.datetime(2022,6,30)
def filter_news_by_date(start, end, data):
    """
    Return the rows of ``data`` dated between ``start`` and ``end``, oldest first.

    Parameters
    ----------
        start : Lower bound of the window (inclusive).
        end : Upper bound of the window (inclusive).
        data : DataFrame with a 'date' column that ``pd.to_datetime`` can parse.
               The frame passed in is left unchanged.

    Returns
    -------
        A new DataFrame holding only the rows inside the window, sorted by date,
        with 'date' as the first column and a fresh 0..n-1 index.
    """
    # assign() builds a new frame, so the caller's 'date' column keeps its
    # original values. The previous version overwrote it in place.
    dated = data.assign(date=pd.to_datetime(data['date']))
    in_window = dated[(dated['date'] >= start) & (dated['date'] <= end)]
    # Sorting through the index moves 'date' to the front of the columns.
    return in_window.set_index('date').sort_index().reset_index()

In [13]:
# Fold the 'Metaverse' label into 'Cryptocurrencies'; every other category is kept as is.
is_metaverse = news['category'] == 'Metaverse'
news['category'] = np.where(is_metaverse, 'Cryptocurrencies', news['category'])

# Combine headline and description into a single text column.
news['content'] = news['header'] + ". " + news['desc']

# Keep only the articles inside the analysis window and display them.
news_target_date = filter_news_by_date(start, end, news)
news_target_date

Unnamed: 0,date,Column1,category,header,desc,author,content
0,2021-10-06,7330,Finance,India’s Wealth Creation: Rapid Recovery Puts E...,GDP grew at a record pace in the fiscal first ...,Rainer Michael Preiss,India’s Wealth Creation: Rapid Recovery Puts E...
1,2021-10-06,7342,Stock Market,McConnell Offers Democrats An Emergency Debt L...,"For weeks, the top Senate Republican has firml...",Jonathan Ponciano,McConnell Offers Democrats An Emergency Debt L...
2,2021-10-06,7341,Finance,How Making Public Long-Term Care Insurance (So...,When Washington State legislators approved a p...,Howard Gleckman,How Making Public Long-Term Care Insurance (So...
3,2021-10-06,7340,Stock Market,Blackstone Mortgage Trust Passes Through 8% Yi...,"In trading on Wednesday, shares of Blackstone ...",Dividend Channel,Blackstone Mortgage Trust Passes Through 8% Yi...
4,2021-10-06,7339,Finance,Why Traders Should Dig For Goldman Sachs Stock...,The shares of Goldman Sachs (GS) are up 0.4% a...,Schaeffer's Investment Research,Why Traders Should Dig For Goldman Sachs Stock...
...,...,...,...,...,...,...,...
9117,2022-06-30,7392,Stock Market,3 Key Student Loan Forgiveness Opportunities M...,Several new student loan forgiveness initiativ...,Adam S. Minsky,3 Key Student Loan Forgiveness Opportunities M...
9118,2022-06-30,7391,Cryptocurrencies,Supplementary Review: Progress Software (PRGS)...,Fiscal 2022 Q2 adjusted revenue (includes acqu...,Taesik Yoon,Supplementary Review: Progress Software (PRGS)...
9119,2022-06-30,7389,Stock Market,"Portfolio Change: One Stock To Buy, June 29",Investors’ fears that consumer demand will slo...,Taesik Yoon,"Portfolio Change: One Stock To Buy, June 29. I..."
9120,2022-06-30,7384,Finance,"Private Equity, Crypto Allowed In 401ks—Lotter...",Now that 401(k) investors are now free to jeop...,Edward Siedle,"Private Equity, Crypto Allowed In 401ks—Lotter..."


In [None]:
# Score every Forbes article with the classifier and save one row per article.
finance_news = news_target_date.drop(['Column1','author'], axis=1)

fieldnames = ['date','category','content', 'sentiment_score']

# Collect plain tuples and build the DataFrame once at the end. Concatenating a
# one-row frame on every iteration copies the whole result each time (quadratic).
records = []
for _, row in finance_news.iterrows():
    # Headline and description are cleaned separately and joined with a space.
    content = preprocess_word(row['header']) + ' ' + preprocess_word(row['desc'])

    predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
    # Article score = mean of the per-sentence sentiment scores.
    sentiment_score = predict_content.sentiment_score.mean()

    # Row values are read directly so that no loop variable named `date`
    # overwrites the `date` class imported from datetime.
    records.append((row['date'], row['category'], content, sentiment_score))

sentiment_result = pd.DataFrame(records, columns=fieldnames)
sentiment_result.to_csv(r"Data/text_data/{0}_sentiment.csv".format('Forbes_News'), sep="\t")

# Generate CoinDesk news sentiment

In [16]:
# Read the CoinDesk articles. The sheet is used as loaded: no columns are dropped or renamed here.
news = pd.read_excel(r"Data/CoinDesk/coindesk_news.xlsx")

In [19]:
news_target_date = filter_news_by_date(start, end, news)
news_target_date

Unnamed: 0,date,index,header,desc
0,2021-10-29,1121,Curve Finance’s CRV Quietly Becomes a Top-Perf...,CRV was up 1.87% in the past 24 hours and near...
1,2021-10-29,1118,"Play-to-Earn Squid Token Rockets 35,000% in 3 ...",The token buys entry to an online game inspire...
2,2021-10-29,1120,Bitcoin Back Over $60K as El Salvador Buys 420...,"After a swoon in recent days, the largest cryp..."
3,2021-10-29,1119,Market Wrap: Bitcoin Heads to $61K Ahead of Op...,Analysts expect short-term choppiness ahead of...
4,2021-10-30,1113,Ether alcanza un récord de $4400 mientras shib...,La quema de monedas se refiere al proceso de r...
...,...,...,...,...
1117,2022-06-30,4,Genesis Faces ‘Hundreds of Millions’ in Losses...,The DCG-owned trading colossus is said to have...
1118,2022-06-30,3,Messari Research: DCG’s Barry Silbert Wins Fro...,Messari’s Ryan Selkis says Grayscale's product...
1119,2022-06-30,2,First Mover Asia: How Traders Are Shorting Tet...,Hedge funds are increasingly betting against U...
1120,2022-06-30,1,Bitcoin Drops to Nearly $19K as Fed Renews Inf...,Central bank leaders warned Wednesday that inf...


In [None]:
# Score every CoinDesk article with the classifier and save one row per article.
# assign() adds the combined text on a new frame, leaving news_target_date untouched.
coindesk_news = news_target_date.assign(
    content=news_target_date["header"] + ". " + news_target_date["desc"]
).drop(['index','header','desc'], axis=1)

fieldnames = ['date','content', 'sentiment_score']

# Collect plain tuples and build the DataFrame once at the end. Concatenating a
# one-row frame on every iteration copies the whole result each time (quadratic).
records = []
for _, row in coindesk_news.iterrows():
    content = preprocess_word(row['content'])

    predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
    # Article score = mean of the per-sentence sentiment scores.
    sentiment_score = predict_content.sentiment_score.mean()

    # Row values are read directly so that no loop variable named `date`
    # overwrites the `date` class imported from datetime.
    records.append((row['date'], content, sentiment_score))

sentiment_result = pd.DataFrame(records, columns=fieldnames)
sentiment_result.to_csv(r"Data/text_data/{0}_sentiment.csv".format('CoinDesk_News'), sep="\t")

07/11/2022 18:49:43 - INFO - root -   Using device: cuda:0 
07/11/2022 18:49:43 - INFO - finbert.utils -   *** Example ***
07/11/2022 18:49:43 - INFO - finbert.utils -   guid: 0
07/11/2022 18:49:43 - INFO - finbert.utils -   tokens: [CLS] curve finance cr ##v quietly becomes top performing def ##i token despite me ##me token cr ##az ##e . [SEP]
07/11/2022 18:49:43 - INFO - finbert.utils -   input_ids: 101 7774 5446 13675 2615 5168 4150 2327 4488 13366 2072 19204 2750 2033 4168 19204 13675 10936 2063 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:49:43 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:49:43 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
07/11/2022 18:49:4

# Generate community discussion sentiment

In [42]:
def generate_sentiment(data, symbol):
    """
    Score every message of a community-discussion export and save the result.

    Each ``Content`` value is cleaned with ``preprocess_word`` and passed to
    ``predict``; the mean of the returned ``sentiment_score`` values is stored
    as that message's score. The rows are written, tab separated, to
    ``Data/text_data/<symbol>_sentiment.csv``.

    Parameters
    ----------
        data : DataFrame with at least 'Content' and 'Date' columns. It is not modified.
        symbol : Label stored in the 'symbol' column, shown in the progress bar
                 title and used in the output file name.

    Returns
    -------
        None. The scores are only written to disk.
    """
    fieldnames = ['symbol','date', 'content', 'sentiment_score']

    # Derive the cleaned text and the calendar date as separate Series, so the
    # caller's frame gets no extra column and keeps its original 'Date' values.
    processed = data["Content"].apply(preprocess_word)
    dates = data["Date"].apply(lambda x: pd.to_datetime(x, unit="ns", utc=True).floor('D').date())

    total = len(data)
    progress_title = "{0}: Predicting text sentiment analysis".format(symbol)

    # Collect plain tuples and build the DataFrame once at the end. Concatenating
    # a one-row frame per message copies the whole result each time (quadratic).
    records = []
    for done, (day, content) in enumerate(zip(dates, processed), start=1):
        predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
        sentiment_score = predict_content.sentiment_score.mean()
        records.append((symbol, day, content, sentiment_score))
        # Report the number of finished rows so the bar ends at 100 %.
        percent_complete(done, total, title=progress_title)

    sentiment_result = pd.DataFrame(records, columns=fieldnames)
    sentiment_result.to_csv(r"Data/text_data/{0}_sentiment.csv".format(symbol), sep="\t")

In [43]:
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    """
    Draw a single-line text progress bar on standard output.

    The line starts with a carriage return, so repeated calls overwrite the
    previous bar instead of printing a new line each time.

    Parameters
    ----------
        step : Number of steps finished so far.
        total_steps : Number of steps in total. Zero or a negative value is
                      treated as "nothing to do" and shown as 100 %.
        bar_width : Width of the bar in characters.
        title : Optional text printed before the bar, followed by ": ".
        print_perc : When True, append the percentage after the bar.

    Returns
    -------
        None. The bar is written to ``sys.stdout`` and flushed.
    """
    import sys

    # UTF-8 left blocks: 1, 1/8, 1/4, 3/8, 1/2, 5/8, 3/4, 7/8
    # (index 7 reuses the full-block glyph)
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "█"]

    if total_steps > 0:
        perc = 100 * float(step) / float(total_steps)
        # Clamp so an out-of-range step can neither overflow the bar nor
        # produce a negative fill.
        perc = min(max(perc, 0.0), 100.0)
    else:
        # Avoid dividing by zero when there is no work at all.
        perc = 100.0

    # Each character cell is divided into 8 ticks to allow partial blocks.
    max_ticks = bar_width * 8
    num_ticks = int(round(perc / 100 * max_ticks))
    full_ticks, part_ticks = divmod(num_ticks, 8)

    bar = utf_8s[0] * full_ticks        # Full blocks
    if part_ticks > 0:
        bar += utf_8s[part_ticks]       # One partial block, if any
    # Pad the remainder of the bar with the fill character.
    bar += "▒" * (int(bar_width) - len(bar))

    disp = ""
    if len(title) > 0:
        disp = title + ": "             # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"                # Color Green
    disp += bar                         # Progress bar to progress display
    disp += "\x1b[0m"                   # Color Reset
    if print_perc:
        # If requested, append percentage complete to progress display
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()

# Process Discord Data to Generate Sentiment Score

In [None]:
MANA_General = pd.read_csv(r"Data/discord/Decentraland_General_2021-10-01_2022-06-30.csv")
generate_sentiment(MANA_General,'MANA_General')

In [None]:
BTC_General = pd.read_csv(r"Data/discord/Bitcoin_General_2021-10-01_2022-06-30.csv")
generate_sentiment(BTC_General,'BTC_General')

In [None]:
ETH_General = pd.read_csv(r"Data/discord/Ethereum_General_2021-10-01_2022-06-30.csv")
generate_sentiment(ETH_General,'ETH_General')

In [None]:
BNB_General = pd.read_csv(r"Data/discord/Binance_General_2021-10-01_2022-06-30.csv")
generate_sentiment(BNB_General,'BNB_General')

In [None]:
ENJ_General = pd.read_csv(r"Data/discord/Enjin_General_2021-10-01_2022-06-30.csv")
generate_sentiment(ENJ_General,'ENJ_General')

In [None]:
# NOTE(review): unlike the other Discord exports read in this notebook, this file name has
# no underscore between "General" and the start date — confirm it matches the file on disk.
RACA_General = pd.read_csv(r"Data/discord/RadioCaca_General2021-10-01_2022-06-30.csv")
generate_sentiment(RACA_General,'RACA_General')