In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import nltk

In [None]:
from nltk.corpus import reuters

In [None]:
print(reuters.fileids()[:10])

In [None]:
len(reuters.categories())

In [None]:
print(reuters.categories())

In [None]:
## categories overlap on the Reuters corpus
print(reuters.categories('training/9865'))

In [None]:
raw1 = reuters.raw('test/14826')
print(raw1)

In [None]:
## substrings that are simply deleted ("'s" is listed before "'" so the
## possessive is removed as a whole and no stray "s" is left behind)
chars1 = [',', '"', ':', ')', '(', '>', '<', ';', "'s", "'", '&']
## separators that are replaced by a single space; '\n' comes first so that a
## period at the end of a line is seen as '. ' by the next replacement
chars2 = ['\n', '. '] #['-'] ## Hong-kong vs far-reaching ?
def clean_text(x):
    """
    Strip punctuation from a raw text and turn separators into spaces.

    Every substring listed in ``chars1`` is removed, then every substring
    listed in ``chars2`` is replaced by one space. Line breaks are treated as
    separators (not deleted), so words on consecutive lines are never fused.
    Runs of spaces are NOT collapsed here.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; case is left unchanged.
    """
    # str.replace is a no-op when the substring is absent, so no membership test is needed
    for char in chars1:
        x = x.replace(char, '')
    for char in chars2:
        x = x.replace(char, ' ')
    return x
clean1 = clean_text(raw1)
print(clean1)

In [None]:
clean1 = clean1.lower()
print(clean1)

In [None]:
nltk.download('words')

In [None]:
from nltk.corpus import words
## check if word exists
"would" in words.words()

In [None]:
nltk.download('omw-1.4')

In [None]:
nltk.download("wordnet") ## another way

from nltk.corpus import wordnet

wordnet.synsets("world")

In [None]:
import re

In [None]:
clean1 = re.sub(' +', ' ', clean1)
print(clean1)

In [None]:
## tokenization
## split() without an argument splits on any run of whitespace and never
## returns the empty strings that split(" ") produces for leading/trailing spaces
words1 = clean1.split()
print(words1)

In [None]:
article1 = nltk.Text(words1)
## how many words?
len(article1)

In [None]:
article1.count("trade")

In [None]:
article1.common_contexts(["japan", "u.s"])

In [None]:
article1.dispersion_plot(["japan", "australia", 'u.s'])

In [None]:
article1.concordance("japan")

In [None]:
nltk.download('stopwords')
from nltk.corpus import stopwords
cachedStopWords = stopwords.words('english')
print(cachedStopWords)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from matplotlib import rcParams


In [None]:

## word cloud limited to 25 words, no custom stop-word list supplied
## NOTE(review): per the wordcloud docs, stopwords=None falls back to the
## built-in STOPWORDS set, so common words are still filtered out — confirm
wc1 = WordCloud(stopwords=None, max_words=25, background_color="white").generate(clean1)

## set the figure size before the image is drawn, then hide the axes
rcParams["figure.figsize"] = (10,5)
plt.imshow(wc1)
plt.axis("off")
plt.show()

In [None]:

## word cloud with the same settings, built with generate_from_text() instead of generate()
## NOTE(review): the wordcloud docs describe generate() as an alias of
## generate_from_text(), so this cell looks redundant — confirm
wc1 = WordCloud(stopwords=None, max_words=25, background_color="white").generate_from_text(clean1)

## set the figure size before the image is drawn, then hide the axes
rcParams["figure.figsize"] = (10,5)
plt.imshow(wc1)
plt.axis("off")
plt.show()

In [None]:
## add some stopwords to the current list
## the membership guard keeps this cell idempotent: re-running it does not
## append the same words to cachedStopWords again
for extra_word in ("said", "might"):
    if extra_word not in cachedStopWords:
        cachedStopWords.append(extra_word)
## word cloud using the NLTK English stop words plus the extra words above
wc2 = WordCloud(stopwords=cachedStopWords, max_words=25, background_color="white").generate_from_text(clean1)


rcParams["figure.figsize"] = (10,5)
plt.imshow(wc2)
plt.axis("off")
plt.show()

In [None]:
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
tokens = article1[0:10]

print(tokens)

print([porter.stem(t) for t in tokens])
print([lancaster.stem(t) for t in tokens])

In [None]:
nltk.download('wordnet') ## wordnet is a rich dictionary with definitions and synonyms
## import the lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
## no "pos" argument here, so the lemmatizer's default part of speech applies
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))

# a denotes adjective in "pos"
print("better :", lemmatizer.lemmatize("better", pos ="a"))
# v denotes verb in "pos"
print("is :", lemmatizer.lemmatize("is", pos ="v"))

In [None]:
from nltk.corpus import wordnet as wn
## "synonym set", a collection of synonymous words (or "lemmas")
wn.synsets('motorcar')

In [None]:
wn.synsets('car') ## ambiguous word

In [None]:
i = '1'
print(wn.synset('car.n.0'+i).definition())
print(wn.synset('car.n.0'+i).lemma_names())

In [None]:
## frequency distributions
fdist = nltk.FreqDist(article1)
print(fdist)

In [None]:
## for most common words
fdist.most_common(10)

In [None]:
## words count
fdist.plot(10, cumulative=True)

In [None]:
## frequency of word lengths
## NOTE: this rebinds fdist, which previously held the word-frequency
## distribution; from here on fdist counts token lengths, not tokens
fdist = nltk.FreqDist(len(w) for w in article1)
fdist.most_common()

In [None]:
fdist.tabulate()

In [None]:
from nltk import word_tokenize
import re

def tokenize(text, min_length=3):
    """
    A tokenizer typically used for classification.

    Steps: lower-case the tokens returned by ``word_tokenize``, drop those
    found in the notebook-level ``cachedStopWords`` list, stem the remaining
    ones with the notebook-level ``porter`` stemmer, and finally keep the
    stems that start with an ASCII letter and have at least ``min_length``
    characters.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    min_length : int, default 3
        Minimum length (after stemming) for a token to be kept.

    Returns
    -------
    list of str
        The filtered, stemmed tokens in their original order.
    """
    # set membership is O(1); the stop-word list is walked once per call
    # instead of once per token
    stop_words = set(cachedStopWords)
    words = [word.lower() for word in word_tokenize(text)]
    stems = [porter.stem(word) for word in words if word not in stop_words]
    # match() anchors at the start only, so a stem qualifies as soon as its
    # first character is an ASCII letter
    starts_with_letter = re.compile('[a-zA-Z]+').match
    return [stem for stem in stems if starts_with_letter(stem) and len(stem) >= min_length]

In [None]:
text = 'Stock futures soared Monday morning after Moderna (MRNA) became the latest m'
tokenize(text)

In [None]:
## NOTE(review): presumably the tokenizer data needed by word_tokenize inside tokenize() — confirm
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer

## bag-of-words counts, using the custom tokenize() function as tokenizer
vectorizer = CountVectorizer(tokenizer=tokenize)
## we take 10 documents from the Reuters dataset
docs = [reuters.raw(doc_id) for doc_id in reuters.fileids()[:10]]
## learn the vocabulary from these documents and build their count matrix
docs_bow = vectorizer.fit_transform(docs)

In [None]:
print(docs_bow.shape)
docs_bow.toarray()[1,]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer2 = TfidfVectorizer(tokenizer=tokenize)
docs_bow2 = vectorizer2.fit_transform(docs)

In [None]:
print(docs_bow2.shape)
np.round(docs_bow2.toarray()[1,], 2)

In [None]:
from textblob import TextBlob
blob = TextBlob(raw1)
print(blob.polarity)

In [None]:
def loadSentimentDict(filename):
    """
    Read a comma-separated file and return the first field of every line.

    Each line is split on ",", and the first field is stripped of surrounding
    whitespace and lower-cased. Lines are returned in file order; a blank line
    yields an empty string.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read (opened with the platform's default
        encoding).

    Returns
    -------
    list of str
        One entry per line of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # the context manager closes the handle even if reading fails
    with open(filename, "r") as myfile:
        return [line.split(",")[0].strip().lower() for line in myfile]

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2021.csv')

In [None]:
data

In [None]:
data['Positive'].unique()

In [None]:
from nltk import sent_tokenize ## Notice: this is a sentence tokenizer
text = raw1
sentences = sent_tokenize(text)
print(len(sentences))
##
sentences[:2]

In [None]:
print(tokenize(sentences[0]))

In [None]:
## TF-IDF matrix with one row per sentence
## NOTE: fit_transform refits vectorizer2 on the sentences, replacing the
## vocabulary it learned from docs earlier
sent_tfidf = vectorizer2.fit_transform(sentences).toarray()
print(sent_tfidf.shape)
np.round(sent_tfidf, 2)

In [None]:
def score_sentences(x):
    """
    Score each row of a weight matrix by its mean positive weight.

    For every row the score is the sum of the row divided by the number of
    strictly positive entries in it. A row without any positive entry gets a
    score of 0.0 instead of a division by zero, which would yield NaN and
    propagate into any statistic computed from the scores.

    Parameters
    ----------
    x : numpy.ndarray
        2-D dense array, one row per sentence.

    Returns
    -------
    dict
        Maps the row index to its score.
    """
    sentenceValue = {}
    ## loop over the rows
    for i in range(x.shape[0]):
        row = x[i, :]
        count_words_in_sentence = np.sum(row > 0)
        if count_words_in_sentence > 0:
            sentenceValue[i] = np.sum(row) / count_words_in_sentence
        else:
            # nothing to average: no positive weight in this row
            sentenceValue[i] = 0.0
    return sentenceValue
sent_scores = score_sentences(sent_tfidf)

In [None]:
qtl = 0.9
sent_threshold = np.quantile(list(sent_scores.values()), qtl)
print(sent_threshold)
def generate_summary(sentences, scores, threshold):
    """
    Build a summary from the sentences whose score reaches the threshold.

    ``scores`` is looked up by position 0 .. len(scores)-1 and the sentence at
    the same position is kept when its score is >= ``threshold``. Every kept
    sentence is prefixed with " \\n\\n " and the pieces are concatenated in
    order.

    Returns
    -------
    tuple
        (summary text, number of sentences kept)
    """
    kept = [sentences[idx] for idx in range(len(scores)) if scores[idx] >= threshold]
    text = "".join(" \n\n " + sentence for sentence in kept)
    return text, len(kept)
summary, count = generate_summary(sentences, sent_scores, sent_threshold)
print(summary)

In [None]:
from flair.models import TextClassifier
from flair.data import Sentence

# load tagger
classifier = TextClassifier.load('sentiment')

In [None]:
# make example sentence
sentence = Sentence("Finnish steel maker Rautaruukki Oyj ( Ruukki ) said on July 7 , 2008 that it won a 9.0 mln euro ( $ 14.1 mln ) contract to supply and install steel superstructures for Partihallsforbindelsen bridge project in Gothenburg , western Sweden.")

# call predict
classifier.predict(sentence)

# check prediction
print(sentence)

In [1]:
from utils import *
import os
import numpy as np

path_finbert_df = os.path.join('data', 'sentiment_analysis', 'df_finbert_predictions.pkl')

In [None]:
# Load pickle file with analysis results of FINBERT
# NOTE(review): load_pkl is not defined in this notebook, presumably it comes from
# the star import of utils; if it unpickles the file, only load files from a
# trusted source, since unpickling can execute arbitrary code — confirm
df = load_pkl(path_finbert_df)
# show the first two rows only
df.head(2)

In [None]:
# Save as csv file
# path_finbert_df_csv = os.path.join('data', 'sentiment_analysis', 'df_finbert_predictions.csv')
# df.to_csv(path_finbert_df_csv)

In [None]:
mask = df.finbert_positive.isna()

In [None]:
mask.to_frame().value_counts()

In [9]:
pd.read_csv('data/df_final.csv')

Unnamed: 0.1,Unnamed: 0,comp_cik,year,vwretd,yret,basis_points_delta,finbert_positive,finbert_negative,finbert_neutral
0,0,1567101,2016,0.002929,0.010643,0.007714,0.061879,0.124403,0.813717
1,1,1567101,2016,0.002929,0.010643,0.007714,0.025089,0.107892,0.867019
2,2,1567138,2016,0.002929,0.005900,0.002971,0.110543,0.076809,0.812648
3,3,1567138,2016,0.002929,0.005900,0.002971,0.135020,0.126333,0.738647
4,4,1570384,2016,0.018772,0.030982,0.012210,0.066827,0.103726,0.829447
...,...,...,...,...,...,...,...,...,...
1094,1094,1710607,2018,-0.089890,-0.038793,0.051097,,,
1095,1095,1710607,2018,-0.089890,-0.038793,0.051097,,,
1096,1096,1722478,2018,0.005365,-0.004034,-0.009399,,,
1097,1097,1727074,2018,-0.089890,-0.067275,0.022615,,,
