# Import Libraries

In [1]:
import json
import pandas as pd
import numpy as np
#!pip install pandas_datareader to validate crypto pricing with specific date
import pandas_datareader as web
import datetime as dt
from datetime import date
import seaborn as sns
from numerize import numerize
import torch #torch first before matplotlib, otherwise the library will crash the environment. 
import matplotlib.patches as patches
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

#NLP libraries
import re
# Import nltk modules and download dataset
import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop = set(stopwords.words('english'))

#import FinBert library
from textblob import TextBlob
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eikde\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#extract data libraries
import requests
import dateutil.parser
import unicodedata
import time
from searchtweets import load_credentials

In [3]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

print(os.getcwd())

C:\Users\eikde\source\repos\exploration\STEPN


In [4]:
from pprint import pprint
from transformers import AutoModelForSequenceClassification

from finbert import *
import finbert.utils as tools
from finbert.finbert import predict

%load_ext autoreload
%autoreload 2

project_dir = Path.cwd()
print(project_dir)

C:\Users\eikde\source\repos\exploration\STEPN


In [5]:
# Check whether torch can see a CUDA GPU; the predict() calls below request use_gpu=True on 'cuda:0'.
torch.cuda.is_available()

True

# Preprocessing Text

Additional filters, if needed, can be added to the `preprocess_word` function. Keeping them in one place lets us reuse the function wherever unnecessary text has to be removed.

In [6]:
# Helper that cleans a raw message into plain text suitable for sentiment analysis
def preprocess_word(message):
    """
    Clean a raw message into a plain-text string ready for sentiment scoring.

    The steps are applied in this order:
        - cast to ``str`` and lowercase
        - replace URLs with a space
        - expand a ``%`` that directly follows a number into `` percent``
        - replace ticker symbols (``$`` followed by letters/digits) with a space
        - replace usernames (``@`` followed by letters/digits) with a space
        - replace every character that is not a letter, a digit or ``.`` with a space
        - tokenize with NLTK and drop English stop words
        - join the remaining tokens back into a single string

    Parameters
    ----------
        message : The text message to be preprocessed. Non-string values are
            converted with ``str()`` first, so a missing value comes through
            as literal text such as "none" or "nan".

    Returns
    -------
        filtered : str
            The cleaned text as one detokenized string (not a list of tokens).
    """
    # Lowercase the message.
    text = str(message).lower()

    # Replace URLs with a space. This runs before the percent expansion so that
    # a URL containing "<digit>%" (e.g. an escaped "%20") is removed as a whole
    # instead of being cut in two by an inserted " percent".
    text = re.sub(r'https?:\/\/[a-zA-Z0-9@:%._\/+~#=?&;-]*', ' ', text)

    # Spell out "%" only when it follows a number: "80%" -> "80 percent".
    text = re.sub(r'(\d+(?:\.\d+)?)%', r'\1 percent', text)

    # Replace ticker symbols with a space. The ticker symbols are any stock symbol that starts with $.
    # NOTE(review): the pattern also matches digits, so a dollar amount such as
    # "$30" is removed as well -- confirm that this is intended.
    text = re.sub(r'\$[a-zA-Z0-9]*', ' ', text)

    # Replace usernames with a space. The usernames are any word that starts with @.
    text = re.sub(r'\@[a-zA-Z0-9]*', ' ', text)

    # Replace everything that is not a letter, a digit or a period with a space.
    text = re.sub(r'[^a-z0-9.0-9A-Z.]', ' ', text)

    # Remove stop words, then rebuild a single string from the kept tokens.
    word_tokens = word_tokenize(text)
    kept = [w for w in word_tokens if w not in stop]
    return TreebankWordDetokenizer().detokenize(kept)

# Get predictions

With the `predict` function, given a piece of text, we split it into a list of sentences and then predict sentiment for each sentence. The output is written into a dataframe. Predictions are represented in three different columns: 

1) `logit`: probabilities for each class

2) `prediction`: predicted label

3) `sentiment_score`: sentiment score calculated as: probability of positive - probability of negative


In [7]:
# Load the saved FinBERT sentiment classifier from the project's local Models directory.
cl_path = project_dir/'Models'/'classifier_model'/'finbert-sentiment'
# num_labels=3 matches the three-entry `logit` vector shown in the prediction output below.
model = AutoModelForSequenceClassification.from_pretrained(cl_path, cache_dir=None, num_labels=3)

In [8]:
#Test statement
text = "animoca leads million funding round hong kong nft platform amid crypto craze ucollex \
        latest funding round comes sales digital collectibles gaining ground city seen increasing \
        number projects launched past year."

In [9]:
result = predict(preprocess_word(text), model,use_gpu=True, gpu_name='cuda:0',batch_size=100)

05/09/2022 22:51:34 - INFO - root -   Using device: cuda:0 
05/09/2022 22:51:34 - INFO - finbert.utils -   *** Example ***
05/09/2022 22:51:34 - INFO - finbert.utils -   guid: 0
05/09/2022 22:51:34 - INFO - finbert.utils -   tokens: [CLS] an ##imo ##ca leads million funding round hong kong n ##ft platform amid crypt ##o cr ##az ##e uc ##oll ##ex latest funding round comes sales digital collect ##ible ##s gaining ground city seen increasing number projects launched past year . [SEP]
05/09/2022 22:51:34 - INFO - finbert.utils -   input_ids: 101 2019 16339 3540 5260 2454 4804 2461 4291 4290 1050 6199 4132 13463 19888 2080 13675 10936 2063 15384 14511 10288 6745 4804 2461 3310 4341 3617 8145 7028 2015 8550 2598 2103 2464 4852 2193 3934 3390 2627 2095 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/09/2022 22:51:34 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0

In [10]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,animoca leads million funding round hong kong ...,"[0.9674181, 0.0042448044, 0.028337164]",positive,0.963173


In [11]:
#Test statement
text = "With crypto tax evasion growing around the world, a key global body is looking\
        to standardize reporting requirements. However, despite good intentions they could be \
        onerous for the industry to comply with."
result = predict(preprocess_word(text), model,use_gpu=True, gpu_name='cuda:0',batch_size=100)

05/05/2022 20:47:36 - INFO - root -   Using device: cuda:0 
05/05/2022 20:47:36 - INFO - finbert.utils -   *** Example ***
05/05/2022 20:47:36 - INFO - finbert.utils -   guid: 0
05/05/2022 20:47:36 - INFO - finbert.utils -   tokens: [CLS] crypt ##o tax eva ##sion growing around world key global body looking standard ##ize reporting requirements . [SEP]
05/05/2022 20:47:36 - INFO - finbert.utils -   input_ids: 101 19888 2080 4171 9345 10992 3652 2105 2088 3145 3795 2303 2559 3115 4697 7316 5918 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/05/2022 20:47:36 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/05/2022 20:47:36 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
05/05/2022 20:47:36 

In [12]:
result

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,crypto tax evasion growing around world key gl...,"[0.2166585, 0.0037592866, 0.7795822]",neutral,0.212899
1,however despite good intentions could onerous ...,"[0.690478, 0.016513273, 0.29300866]",positive,0.673965


In [13]:
# Mean of the per-sentence sentiment scores, shown with two decimals.
print(f'Average sentiment is {result.sentiment_score.mean():.2f}.')

Average sentiment is 0.44.


# Generate finance news sentiment

In [44]:
# Load the Forbes news export and expose its "label" column under the name "category".
news = pd.read_excel(r"Data/forbes_news_new.xlsx").rename(columns={"label": "category"})

In [45]:
# Date window for the news articles; filter_news_by_date keeps rows with start <= date <= end.
# Both bounds are midnight timestamps, so an article stamped later in the day on the
# end date would fall outside the window -- NOTE(review): the data appears to be date-only; confirm.
start = dt.datetime(2022,4,25)
end = dt.datetime(2022,5,9)
def filter_news_by_date(start, end, data):
    """
    Return the rows of ``data`` whose date lies in ``[start, end]``, oldest first.

    The caller's frame is left untouched: the ``date`` column is parsed on a
    copy, so re-running the calling cell always starts from the same input.

    Parameters
    ----------
        start : Lower bound of the window (inclusive).
        end : Upper bound of the window (inclusive).
        data : DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
        A new DataFrame sorted by date, with ``date`` as the first column and
        a fresh 0..n-1 index.
    """
    # assign() returns a new frame, so the parsed dates never leak into `data`.
    dated = data.assign(date=pd.to_datetime(data['date']))
    in_window = (dated['date'] >= start) & (dated['date'] <= end)
    # set_index/sort_index/reset_index moves `date` to the front and renumbers the rows.
    return dated[in_window].set_index('date').sort_index().reset_index()

In [48]:
# Fold 'Metaverse' articles into the 'Cryptocurrencies' category; other categories are kept as they are.
news['category'] = np.where(news.category == 'Metaverse', 'Cryptocurrencies', news.category)
# Text to be scored: headline and description joined with ". " so they read as separate sentences.
# NOTE(review): presumably both columns are always filled; a missing header or desc would
# propagate as a missing `content` value -- confirm against the data.
news['content'] = news["header"] + ". " + news["desc"]
# Keep only the articles inside the [start, end] window defined above, sorted by date.
news_target_date = filter_news_by_date(start, end, news)
news_target_date

Unnamed: 0,date,number,category,header,desc,author,content
0,2022-04-26,410,Finance,Singapore’s GIC Buying Stake In London’s Paddi...,Singaporean sovereign wealth fund GIC has agre...,Jonathan Burgos,Singapore’s GIC Buying Stake In London’s Paddi...
1,2022-04-26,384,Cryptocurrencies,What You Should Know Before Investing In Fidel...,This morning crypto advocates and the crypto c...,Steven Ehrlich,What You Should Know Before Investing In Fidel...
2,2022-04-26,383,Finance,Student Loan Forgiveness: 4 People Likely To O...,Here are 4 people who are likely to oppose mas...,Zack Friedman,Student Loan Forgiveness: 4 People Likely To O...
3,2022-04-26,382,Finance,Home Buying Is Becoming ‘Unaffordable For Most...,Monthly mortgage payments are up nearly $500 s...,Sergei Klebnikov,Home Buying Is Becoming ‘Unaffordable For Most...
4,2022-04-26,381,Finance,Examining The Ukrainian Tax Implications Of Ru...,Valeria Tarasenko of Dentons Kyiv discusses th...,Tax Notes Staff,Examining The Ukrainian Tax Implications Of Ru...
...,...,...,...,...,...,...,...
406,2022-05-09,12,Cryptocurrencies,Analyst Believes Bitcoin Could Soon Fall To $3...,"Carter Braxton Worth, a technical analyst, bel...",Chuck Jones,Analyst Believes Bitcoin Could Soon Fall To $3...
407,2022-05-09,13,Stock Market,The Best Mid-Cap Dividend Stocks For 2022,Mid-cap dividend stocks are the best bargain o...,Brett Owens,The Best Mid-Cap Dividend Stocks For 2022. Mid...
408,2022-05-09,14,Finance,How To Optimize The Energy We Put Into Financi...,Studies suggest that up to 80% of financial pl...,Tim Maurer,How To Optimize The Energy We Put Into Financi...
409,2022-05-09,7,Finance,Ask Larry: Will The 2022 COLA Apply To Benefit...,Today's Social Security column addresses quest...,Laurence Kotlikoff,Ask Larry: Will The 2022 COLA Apply To Benefit...


In [None]:
# Score every article in the date window and save one row per article.
fieldnames = ['date','category','content', 'sentiment_score']

# Rows are collected in a list and turned into a DataFrame once; concatenating
# a one-row frame per iteration copies the whole result on every step.
news_records = []
for _, row in news_target_date.iterrows():
    content = preprocess_word(row['content'])

    predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)

    # Row fields are read directly instead of being bound to top-level names:
    # a global called `date` would shadow `from datetime import date` from the imports cell.
    # The article score is the mean of its per-sentence sentiment scores.
    news_records.append((row['date'], row['category'], content, predict_content.sentiment_score.mean()))

sentiment_result = pd.DataFrame(news_records, columns=fieldnames)
sentiment_result.to_csv(r"data/{0}_sentiment.csv".format('New_News'), sep="\t")

# Last expression of the cell, so the preview is actually displayed.
sentiment_result.head()

# Generate announcement sentiment

In [12]:
def generate_sentiment(data, symbol):
    """
    Score every message in ``data`` with the loaded model and save the result.

    Each message is cleaned with ``preprocess_word`` and passed to ``predict``;
    the message score is the mean of its per-sentence sentiment scores. The
    result is written to ``data/<symbol>_sentiment.csv`` (tab-separated).

    The caller's frame is not modified.

    Parameters
    ----------
        data : DataFrame with a ``Content`` column (message text) and a
            ``Date`` column that ``pd.to_datetime`` can parse.
        symbol : Label stored in the ``symbol`` column and used in the output
            file name and the progress-bar title.

    Returns
    -------
        DataFrame with the columns ``symbol``, ``date``, ``content`` and
        ``sentiment_score``, one row per input message.
    """
    # Derive the two needed columns as separate Series so `data` stays untouched.
    contents = data["Content"].apply(preprocess_word)
    # Reduce each timestamp to its UTC calendar day.
    dates = data["Date"].apply(lambda x: pd.to_datetime(x, unit="ns", utc=True).floor('D').date())

    fieldnames = ['symbol','date', 'content', 'sentiment_score']
    total = len(data)
    title = "{0}: Predicting text sentiment analysis".format(symbol)

    # Collect plain tuples and build the frame once; concatenating a one-row
    # frame per iteration copies the whole result on every step.
    records = []
    for step, (day, content) in enumerate(zip(dates, contents), start=1):
        predict_content = predict(content, model, use_gpu=True, gpu_name='cuda:0',batch_size=100)
        records.append((symbol, day, content, predict_content.sentiment_score.mean()))
        # Report after the row is done, counting from 1, so the last row shows 100 %.
        percent_complete(step, total, title=title)

    sentiment_result = pd.DataFrame(records, columns=fieldnames)
    sentiment_result.to_csv(r"data/{0}_sentiment.csv".format(symbol), sep="\t")
    return sentiment_result

In [13]:
def percent_complete(step, total_steps, bar_width=60, title="", print_perc=True):
    """
    Draw a one-line text progress bar on stdout, overwriting the current line.

    Progress is ``step / total_steps``, clamped to the range 0..100 % so the
    bar never grows past ``bar_width`` cells. A ``total_steps`` of zero or
    less is shown as complete instead of raising ZeroDivisionError.

    Parameters
    ----------
        step : Number of steps finished so far.
        total_steps : Number of steps in the whole job.
        bar_width : Width of the bar in character cells.
        title : Optional text printed before the bar, followed by ": ".
        print_perc : When True, append the percentage after the bar.

    Returns
    -------
        None. The bar is written to ``sys.stdout`` with a leading carriage
        return and no trailing newline.
    """
    import sys

    # Glyphs indexed by eighths of a cell: index 0 is a full cell and
    # 1..7 are 1/8, 1/4, 3/8, 1/2, 5/8, 3/4, 7/8.
    utf_8s = ["█", "▏", "▎", "▍", "▌", "▋", "▊", "▉"]

    total = float(total_steps)
    if total > 0:
        # Clamp so an overshooting or negative step cannot distort the bar.
        perc = min(max(100 * float(step) / total, 0.0), 100.0)
    else:
        perc = 100.0                # Nothing to do counts as complete

    max_ticks = bar_width * 8       # Resolution: eight ticks per cell
    num_ticks = int(round(perc / 100 * max_ticks))
    full_ticks = num_ticks // 8     # Number of full blocks
    part_ticks = num_ticks % 8      # Size of partial block (array index)

    bar = utf_8s[0] * full_ticks    # Add full blocks into Progress Bar

    # If part_ticks is zero, then no partial block, else append part char
    if part_ticks > 0:
        bar += utf_8s[part_ticks]

    # Pad Progress Bar with fill character
    bar += "▒" * int((max_ticks/8 - float(num_ticks)/8.0))

    disp = ""
    if len(title) > 0:
        disp = title + ": "         # Optional title to progress display

    # Print progress bar in green: https://stackoverflow.com/a/21786287/6929343
    disp += "\x1b[0;32m"            # Color Green
    disp += bar                     # Progress bar to progress display
    disp += "\x1b[0m"               # Color Reset
    if print_perc:
        # If requested, append percentage complete to progress display
        disp += " {:6.2f}".format(perc) + " %"

    # Output to terminal repetitively over the same line using '\r'.
    sys.stdout.write("\r" + disp)
    sys.stdout.flush()

In [None]:
StepN = pd.read_csv(r"data/STEPN_new_Announcement.csv")
generate_sentiment(StepN,'new_STEPN_Announcement')

# Generate general sentiment

In [None]:
StepN = pd.read_csv(r"data/STEPN_new_general.csv")
generate_sentiment(StepN,'new_STEPN_General')