In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import json
import pandas as pd
import numpy as np
#!pip install pandas_datareader to validate crypto pricing with specific date
import pandas_datareader as web
import datetime as dt
from datetime import date
import seaborn as sns
from numerize import numerize
import torch #torch first before matplotlib, otherwise the library will crash the environment. 
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

#NLP libraries
import re
# Import nltk modules and download dataset
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop = set(stopwords.words('english'))

#TextBlob (sentiment baseline for comparison) and scikit-learn evaluation metrics
from textblob import TextBlob
from sklearn.metrics import classification_report


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#extract data libraries
import requests
import dateutil.parser
import unicodedata
import time
from searchtweets import load_credentials

In [4]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

print(os.getcwd())

C:\Users\eikde\source\repos\exploration\01_METAVERSE


In [5]:
from pprint import pprint
from transformers import AutoModelForSequenceClassification

from finbert import *
import finbert.utils as tools
from finbert.finbert import predict

%load_ext autoreload
%autoreload 2

project_dir = Path.cwd()
print(project_dir)

C:\Users\eikde\source\repos\exploration\01_METAVERSE


In [6]:
# Shared colour palette (hex codes) for the charts in this notebook.
colors = {
    'red': '#ff207c',
    'grey': '#42535b',
    'blue': '#207cff',
    'orange': '#ffa320',
    'green': '#00ec8b',
}
# Tick styling: 14pt, grey ticks and tick labels.
config_ticks = dict(size=14, color=colors['grey'], labelcolor=colors['grey'])
# Title styling: 18pt grey text, left-aligned on the baseline.
config_title = dict(size=18, color=colors['grey'], ha='left', va='baseline')

In [7]:
#plot chart function
def get_charts(data, title):
    """Draw a two-panel figure: closing price on top, volume bars below.

    Parameters
    ----------
        data : Table-like object indexable by 'Date', 'Close' and 'Volume'.
        title : Text used as the figure's super-title.
    """
    # Note: this changes the default figure size for every later figure too.
    plt.rc('figure', figsize=(15, 10))
    fig, (price_ax, volume_ax) = plt.subplots(
        2, 1, gridspec_kw={'height_ratios': [3, 1]}
    )
    fig.tight_layout(pad=3)
    fig.suptitle(title, fontsize=16)

    dates, closes, volumes = data['Date'], data['Close'], data['Volume']

    # Upper panel (3/4 of the height): price line.
    price_ax.plot(dates, closes, color=colors['blue'], linewidth=2, label='Price')
    price_ax.set_ylabel('Price (in USD)', fontsize=14)
    price_ax.set_xlabel('Date', fontsize=14)

    # Lower panel: volume bars. This is the last subplot created, i.e. the
    # "current" axes that plt.xlabel / plt.ylabel would target.
    volume_ax.bar(dates, volumes, width=15, color='darkgrey')
    volume_ax.set_xlabel('Date', fontsize=14)
    volume_ax.set_ylabel('Volume (in millions)', fontsize=14)

In [8]:
#check whether a CUDA GPU is visible to torch; the predict calls below are run with use_gpu=True.
torch.cuda.is_available()

True

# Preprocessing Text

Additional filters, if needed, can be added to the `preprocess_word` function so that it can be reused to remove unnecessary text.

In [9]:
#a function that preprocesses a message into clean text for analysis
def preprocess_word(message):
    """
    Clean a raw message and return it as one stop-word-free string.

    Operations, in order:
        - convert to str and lowercase
        - replace URLs with a space
        - rewrite numeric percentages, e.g. "3.7%" -> "3.7 percent"
        - replace ticker symbols ($...) with a space
        - replace usernames (@...) with a space
        - replace every character except letters, digits and "." with a space
        - tokenize with word_tokenize and drop English stop words
        - detokenize the remaining tokens back into a single string

    Parameters
    ----------
        message : The text message to be preprocessed. Non-string input is
                  converted with str() first.

    Returns
    -------
        filtered : The preprocessed text as a single string.
    """
    text = str(message).lower()

    # URLs are removed first so that percent-encoded bytes inside a link
    # (e.g. ".../page1%20") are not rewritten into stray "percent" words.
    text = re.sub(r'https?://[a-zA-Z0-9@:%._/+~#=?&;-]*', ' ', text)

    # Only a "%" directly after a number becomes the word "percent"; any
    # other "%" is stripped by the catch-all substitution below.
    text = re.sub(r'(\d+(?:\.\d+)?)%', r'\1 percent', text)

    # Ticker symbols: "$" followed by letters/digits.
    text = re.sub(r'\$[a-zA-Z0-9]*', ' ', text)

    # Usernames: "@" followed by letters/digits.
    text = re.sub(r'@[a-zA-Z0-9]*', ' ', text)

    # Periods are kept so decimals ("50.52") and sentence ends survive.
    text = re.sub(r'[^a-zA-Z0-9.]', ' ', text)

    # Remove stop words, then rebuild a plain string from the kept tokens.
    word_tokens = word_tokenize(text)
    kept_tokens = [w for w in word_tokens if w not in stop]
    filtered = TreebankWordDetokenizer().detokenize(kept_tokens)

    return filtered

In [10]:
# Check whether the function is working. 
test_message = 'RT @google Our annual looked% at the year 50.52% in Google blogging (and beyond) http://t.co/sptHOAh8 $GOOG 33%'
print(preprocess_word(test_message))


rt annual looked year 50.52 percent google blogging beyond 33 percent


With the `predict` function, given a piece of text, we split it into a list of sentences and then predict sentiment for each sentence. The output is written into a dataframe. Predictions are represented in three different columns: 

1) `logit`: probabilities for each class

2) `prediction`: predicted label

3) `sentiment_score`: sentiment score calculated as: probability of positive - probability of negative

Below we analyze a paragraph taken out of [this](https://www.economist.com/finance-and-economics/2019/01/03/a-profit-warning-from-apple-jolts-markets) article from The Economist. For comparison purposes, we also put the sentiments predicted with TextBlob.
> Later that day Apple said it was revising down its earnings expectations in the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours trading and the decline was extended to more than 10% when the market opened. The dollar fell by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. Yields on government bonds fell as investors fled to the traditional haven in a market storm.

In [11]:
# Load the saved FinBERT sentiment classifier (3 output labels) from the project folder.
cl_path = project_dir / 'Models' / 'classifier_model' / 'finbert-sentiment'
model = AutoModelForSequenceClassification.from_pretrained(
    cl_path,
    cache_dir=None,
    num_labels=3,
)

In [12]:
#Test statement
text = "Later that day Apple said it was revising down its earnings expectations in \
the fourth quarter of 2018, largely because of lower sales and signs of economic weakness in China. \
The news rapidly infected financial markets. Apple’s share price fell by around 7% in after-hours \
trading and the decline was extended to more than 10% when the market opened. The dollar fell \
by 3.7% against the yen in a matter of minutes after the announcement, before rapidly recovering \
some ground. Asian stockmarkets closed down on January 3rd and European ones opened lower. \
Yields on government bonds fell as investors fled to the traditional haven in a market storm."

In [13]:
result = predict(preprocess_word(text),model, use_gpu=True, gpu_name='cuda:0',batch_size=100)

05/28/2022 13:12:18 - INFO - root -   Using device: cuda:0 
05/28/2022 13:12:18 - INFO - finbert.utils -   *** Example ***
05/28/2022 13:12:18 - INFO - finbert.utils -   guid: 0
05/28/2022 13:12:18 - INFO - finbert.utils -   tokens: [CLS] later day apple said rev ##ising earnings expectations fourth quarter 2018 largely lower sales signs economic weakness china . [SEP]
05/28/2022 13:12:18 - INFO - finbert.utils -   input_ids: 101 2101 2154 6207 2056 7065 9355 16565 10908 2959 4284 2760 4321 2896 4341 5751 3171 11251 2859 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:12:18 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:12:18 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [14]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,later day apple said revising earnings expecta...,"[0.016367333, 0.9784824, 0.0051503545]",negative,-0.962115
1,news rapidly infected financial markets .,"[0.0031599307, 0.97179496, 0.025045084]",negative,-0.968635
2,apple share price fell around 7 percent hours ...,"[0.0018037576, 0.9909236, 0.007272607]",negative,-0.98912
3,dollar fell 3.7 percent yen matter minutes ann...,"[0.023911143, 0.9672537, 0.008835201]",negative,-0.943343
4,asian stockmarkets closed january 3rd european...,"[0.0020976358, 0.99082804, 0.007074352]",negative,-0.98873
5,yields government bonds fell investors fled tr...,"[0.011761616, 0.96269923, 0.025539124]",negative,-0.950938


In [15]:
# Mean of the per-sentence sentiment scores, shown with two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is -0.97.


In [18]:
text = "STEPN is cutting its services for players in mainland China, and the move has had a major\
        impact on the app’s tokens. The move-to-earn lifestyle app announced Thursday that it would\
        cut access to users playing from mainland China to abide by local regulations."
# Clean the text once, score it, then show exactly what was scored.
cleaned_text = preprocess_word(text)
result = predict(cleaned_text, model, use_gpu=True, gpu_name='cuda:0', batch_size=100)
print(cleaned_text)


05/28/2022 13:13:54 - INFO - root -   Using device: cuda:0 
05/28/2022 13:13:54 - INFO - finbert.utils -   *** Example ***
05/28/2022 13:13:54 - INFO - finbert.utils -   guid: 0
05/28/2022 13:13:54 - INFO - finbert.utils -   tokens: [CLS] step ##n cutting services players mainland china move major impact app token ##s . [SEP]
05/28/2022 13:13:54 - INFO - finbert.utils -   input_ids: 101 3357 2078 6276 2578 2867 8240 2859 2693 2350 4254 10439 19204 2015 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:13:54 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:13:54 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:13:54 - INFO - finbert.utils -   label: No

stepn cutting services players mainland china move major impact app tokens . move earn lifestyle app announced thursday would cut access users playing mainland china abide local regulations.
                                            sentence  \
0  stepn cutting services players mainland china ...   
1  move earn lifestyle app announced thursday wou...   

                                   logit prediction sentiment_score  
0   [0.07829892, 0.34680462, 0.57489645]    neutral       -0.268506  
1  [0.006916296, 0.84532917, 0.14775454]   negative       -0.838413  
Average sentiment is -0.55.


In [19]:
result


Unnamed: 0,sentence,logit,prediction,sentiment_score
0,stepn cutting services players mainland china ...,"[0.07829892, 0.34680462, 0.57489645]",neutral,-0.268506
1,move earn lifestyle app announced thursday wou...,"[0.006916296, 0.84532917, 0.14775454]",negative,-0.838413


In [20]:
# Mean of the per-sentence sentiment scores, shown with two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is -0.55.


In [22]:
text = "Good News For Markets As Fed’s Preferred Inflation Metric May Have Peaked. \
        Today’s PCE inflation number was relatively good news for markets suggesting inflation \
        could trend lower over the summer. The annual price change for April 2022 was 6.3%. \
        That’s down from 6.6% in March, as prices for goods rose at a slower pace than previously \
        and price increases for services rose at a broadly similar rate to recent months.\
        Stripping out food and energy, annual inflation fell back to 4.9%, a rate of growth\
        we last saw in December 2021. It’s too early to be sure, but inflation may be trending lower.\
        Of course, inflation is still well above the Federal Reserve’s 2% target,\
        but the direction is potentially a positive."
# Clean the text once, score it, then show exactly what was scored.
cleaned_text = preprocess_word(text)
result = predict(cleaned_text, model, use_gpu=True, gpu_name='cuda:0', batch_size=100)
print(cleaned_text)

05/28/2022 13:21:55 - INFO - root -   Using device: cuda:0 
05/28/2022 13:21:55 - INFO - finbert.utils -   *** Example ***
05/28/2022 13:21:55 - INFO - finbert.utils -   guid: 0
05/28/2022 13:21:55 - INFO - finbert.utils -   tokens: [CLS] good news markets fed preferred inflation metric may peaked . [SEP]
05/28/2022 13:21:55 - INFO - finbert.utils -   input_ids: 101 2204 2739 6089 7349 6871 14200 12046 2089 6601 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:21:55 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:21:55 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/28/2022 13:21:55 - INFO - finbert.utils -   label: None (id = 9090)
05/28/2022 13:21:5

good news markets fed preferred inflation metric may peaked . today pce inflation number relatively good news markets suggesting inflation could trend lower summer . annual price change april 2022 6.3 percent . 6.6 percent march prices goods rose slower pace previously price increases services rose broadly similar rate recent months . stripping food energy annual inflation fell back 4.9 percent rate growth last saw december 2021. early sure inflation may trending lower . course inflation still well federal reserve 2 percent target direction potentially positive.


In [23]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,good news markets fed preferred inflation metr...,"[0.36811048, 0.61065453, 0.02123492]",negative,-0.242544
1,today pce inflation number relatively good new...,"[0.96358657, 0.028451834, 0.007961505]",positive,0.935135
2,annual price change april 2022 6.3 percent .,"[0.024601964, 0.014657142, 0.96074086]",neutral,0.009945
3,6.6 percent march prices goods rose slower pac...,"[0.9722979, 0.01217896, 0.01552313]",positive,0.960119
4,stripping food energy annual inflation fell ba...,"[0.0101604, 0.9754249, 0.01441473]",negative,-0.965264
5,course inflation still well federal reserve 2 ...,"[0.95280415, 0.016268332, 0.030927548]",positive,0.936536


In [24]:
# Mean of the per-sentence sentiment scores, shown with two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is 0.27.


# Generate finance news sentiment

In [None]:
# Load the Forbes news spreadsheet and expose its "label" column as "category".
# Disabled step kept for reference:
#   news = news.drop(['Column1','sentiment_score'], axis=1)
news = pd.read_excel(r"Data/forbes_news/forbes_news.xlsx").rename(
    columns={"label": "category"}
)

In [None]:
start = dt.datetime(2021,10,1)
end = dt.datetime(2022,4,30)
def filter_news_by_date(start, end, data):
    """Return the rows of ``data`` dated within [start, end], oldest first.

    Parameters
    ----------
        start : Earliest date to keep (inclusive).
        end : Latest date to keep (inclusive).
        data : DataFrame with a 'date' column that pd.to_datetime can parse.
               It is not modified; the parsed dates live only in the result.

    Returns
    -------
        A new DataFrame with a fresh 0..n-1 index, sorted by date, in which
        'date' is the first column and holds parsed datetimes.
    """
    # Parse into a separate Series instead of overwriting the caller's column.
    dates = pd.to_datetime(data['date'])
    in_window = (dates >= start) & (dates <= end)
    selected = data.loc[in_window].assign(date=dates[in_window])
    # Sorting through the index (rather than sort_values) is what moves
    # 'date' to the first column of the returned frame.
    return selected.set_index('date').sort_index().reset_index()

In [None]:
# Fold the 'Metaverse' category into 'Cryptocurrencies'; other categories are kept as they are.
news['category'] = np.where(news.category == 'Metaverse', 'Cryptocurrencies', news.category)
# Combined article text: header, then ". ", then description.
news['content'] = news["header"] + ". " + news["desc"]
# Keep only the articles dated between `start` and `end`.
news_target_date = filter_news_by_date(start, end, news)
news_target_date

In [None]:
finance_news = news_target_date.drop(['number','author'], axis=1)

fieldnames = ['date','category','content', 'sentiment_score']

# Collect one record per article and build the DataFrame once at the end;
# concatenating inside the loop re-copies every earlier row on each pass.
# Loop variables deliberately avoid the name `date`, which would overwrite
# the `date` class imported from datetime for the rest of the session.
records = []
for _, row in finance_news.iterrows():
    # Header and description are cleaned separately, then joined with a space.
    content = preprocess_word(row['header']) + ' ' + preprocess_word(row['desc'])

    predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0', batch_size=100)
    # Article score = mean of the per-sentence sentiment scores.
    sentiment_score = predict_content.sentiment_score.mean()

    records.append((row['date'], row['category'], content, sentiment_score))

sentiment_result = pd.DataFrame(records, columns=fieldnames)
# Tab-separated output, despite the .csv extension.
sentiment_result.to_csv(r"Data/text_data/{0}_sentiment.csv".format('News'), sep="\t")

In [None]:
def generate_sentiment(data, symbol):
    """Score every message in ``data`` and save the results as a tab-separated file.

    Each row's "Content" is cleaned with ``preprocess_word`` and scored with
    ``predict``; the row's score is the mean ``sentiment_score`` of the frame
    that ``predict`` returns. Results are written to
    ``Data/text_data/<symbol>_sentiment.csv``.

    Parameters
    ----------
        data : DataFrame with at least "Content" and "Date" columns. It is
               not modified.
        symbol : Label stored in the "symbol" column and used in the output
                 file name and the progress-bar title.

    Returns
    -------
        None. The result is only written to disk.
    """
    # Derive new Series instead of writing columns back into the caller's frame.
    contents = data["Content"].apply(preprocess_word)
    # Truncate each timestamp to its UTC calendar day.
    days = data["Date"].apply(lambda x: pd.to_datetime(x, unit="ns", utc=True).floor('D').date())

    fieldnames = ['symbol','date', 'content', 'sentiment_score']
    total = len(data)
    title = "{0}: Predicting text sentiment analysis".format(symbol)

    # Collect plain tuples and build the DataFrame once; concatenating inside
    # the loop re-copies every earlier row on each iteration.
    records = []
    for step, (day, content) in enumerate(zip(days, contents), start=1):
        predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0', batch_size=100)
        sentiment_score = predict_content.sentiment_score.mean()
        records.append((symbol, day, content, sentiment_score))
        # Report after the row is done, counting from 1, so the bar ends at 100 %.
        percent_complete(step, total, title=title)

    sentiment_result = pd.DataFrame(records, columns=fieldnames)
    sentiment_result.to_csv(r"Data/text_data/{0}_sentiment.csv".format(symbol), sep="\t")

In [None]:
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    """Redraw a one-line text progress bar on stdout.

    The line starts with a carriage return, so successive calls overwrite
    the same terminal line.

    Parameters
    ----------
        step : Number of steps completed so far.
        total_steps : Number of steps in the whole job. Zero or a negative
            value is shown as 100 % (nothing left to do) instead of raising
            ZeroDivisionError.
        bar_width : Width of the bar in character cells.
        title : Optional text printed before the bar, followed by ": ".
        print_perc : When True, append the percentage after the bar.

    Returns
    -------
        None. The bar is written to sys.stdout and flushed.
    """
    import sys

    # Index 0 is the full block; indexes 1-7 are the 1/8 ... 7/8 left blocks.
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]

    total = float(total_steps)
    perc = 100 * float(step) / total if total > 0 else 100.0
    # Clamp so an out-of-range step can neither overflow the bar nor produce
    # a negative tick count (this also covers the "100.04 %" rounding case).
    perc = min(max(perc, 0.0), 100.0)

    max_ticks = bar_width * 8       # The bar resolves to eighths of a cell
    num_ticks = int(round(perc / 100 * max_ticks))
    # Whole cells, plus the size of the trailing partial cell (array index)
    full_ticks, part_ticks = divmod(num_ticks, 8)

    bar = utf_8s[0] * full_ticks    # Full blocks
    # If part_ticks is zero there is no partial block, else append its char
    if part_ticks > 0:
        bar += utf_8s[part_ticks]

    # Pad the rest of the bar with the fill character
    bar += "▒" * int(bar_width - len(bar))

    disp = ""
    if len(title) > 0:
        disp = title + ": "         # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"            # Color Green
    disp += bar                     # Progress bar to progress display
    disp += "\x1b[0m"               # Color Reset
    if print_perc:
        # If requested, append percentage complete to progress display
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()

# Process Discord Data to Generate Sentiment Score

In [None]:
MANA_Announcement = pd.read_csv(r"Data/discord/Decentraland - announcements (2021-10-01 to 2022-04-30).csv")
generate_sentiment(MANA_Announcement,'MANA_Announcement')

In [None]:
MANA_General = pd.read_csv(r"Data/discord/Decentraland - general (2021-10-01 to 2022-04-30).csv")
generate_sentiment(MANA_General,'MANA_General')

In [None]:
SAND_Announcement = pd.read_csv(r"Data/discord/The Sandbox - announcements (2021-10-01 to 2022-04-30).csv")
generate_sentiment(SAND_Announcement,'SAND_Announcement')

In [None]:
SAND_General = pd.read_csv(r"Data/discord/The Sandbox - general (2021-10-01 to 2022-04-30).csv")
generate_sentiment(SAND_General,'SAND_General')

In [None]:
THETA_Announcement = pd.read_csv(r"Data/discord/Theta Network - announcements (2021-10-01 to 2022-04-30).csv")
generate_sentiment(THETA_Announcement,'THETA_Announcement')

In [None]:
THETA_General = pd.read_csv(r"Data/discord/Theta Network - general (2021-10-01 to 2022-04-30).csv")
generate_sentiment(THETA_General,'THETA_General')

In [None]:
STX_Announcement = pd.read_csv(r"Data/discord/Stacks - announcements (2021-10-01 to 2022-04-30).csv")
generate_sentiment(STX_Announcement,'STX_Announcement')

In [None]:
STX_General = pd.read_csv(r"Data/discord/Stacks - general (2021-10-01 to 2022-04-30).csv")
generate_sentiment(STX_General,'STX_General')

In [None]:
AXS_Announcement = pd.read_csv(r"Data/discord/Axie Infinity - announcements (2021-10-01 to 2022-04-30).csv")
generate_sentiment(AXS_Announcement,'AXS_Announcement')

In [None]:
# The Axie Infinity general-channel export is split over two files; stack them into one frame.
AXS_Discussion_1 = pd.read_csv(r"Data/discord/Axie Infinity - general (2021-10-21 to 2021-12-11).csv")
AXS_Discussion_2 = pd.read_csv(r"Data/discord/Axie Infinity - general (2021-12-12 to 2022-04-25).csv")
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True already yields a fresh 0..n-1 index, so the former
# reset_index()/set_index("index") round trip reduces to naming the index.
AXS_Discussion = pd.concat([AXS_Discussion_1, AXS_Discussion_2], ignore_index=True)
AXS_Discussion = AXS_Discussion.rename_axis("index")

generate_sentiment(AXS_Discussion,'AXS_General')