In [None]:
import pandas as pd
import numpy as np
import regex as re
import string

from pandas_profiling import ProfileReport

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import WordNetLemmatizer, pos_tag, word_tokenize, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Install a patched mpld3 build directly from the "display_fix" ref of a GitHub fork.
# NOTE(review): mpld3 is not imported by any cell visible in this notebook — confirm it is still needed.
!python -m pip install "git+https://github.com/javadba/mpld3@display_fix"


# <font color="chillipepper">**Data importation**</font>

The objective of this project is to form a financial strategy based on the headlines of the news concerning 7 stocks, which we have retrieved thanks to the API of Thomson Reuters.

We want to do sentiment analysis on the news headlines, i.e. classify the news headlines according to whether they are positive, negative or neutral for the stock price. 

1) To do so, we first propose to train our NLP models on another database of financial news headlines, this one already labeled. This database comes from Kaggle.

2) Second, we will use a pre-trained model for sentiment analysis in finance, finBERT, to overcome the difficulty of unlabeled data.

Regarding the stocks, we chose to focus on the S&P 500 and retrieved as much news as possible about these companies.

In [None]:
# Download the unlabeled news headlines (read below as News_SP500.csv)

! wget https://raw.githubusercontent.com/emincalyaka/NLP-financial-news/main/News_SP500.csv

# Download the pre-labeled sentences (read below as FinancialPhraseBank.csv)

! wget https://github.com/emincalyaka/NLP-financial-news/raw/main/labelled%20data/FinancialPhraseBank.csv 

# Download the price data (read below as SPY500_Prices.csv)

! wget https://raw.githubusercontent.com/emincalyaka/NLP-financial-news/main/SPY500_Prices.csv

In [None]:
# Read the headlines indexed by ("Unnamed: 0", "Ticker"), keep only the "text"
# column, sort by index and expose the result as a one-column DataFrame.
news_text = pd.read_csv("News_SP500.csv", index_col=["Unnamed: 0", "Ticker"])["text"]
df_news = news_text.sort_index().to_frame()
df_news

In [None]:
# Load the pre-labeled sentences; `names` assigns the column names "label" and "text".
# NOTE(review): supplying `names` presumably means the CSV has no header row — confirm.
data_label = pd.read_csv("FinancialPhraseBank.csv", names = ['label','text'],encoding='ISO-8859-1')
data_label

In [None]:
# Load the price data indexed by its "date" column, sorted by index.
# NOTE(review): `price` is not referenced by any later cell visible here — confirm where it is used.
price = pd.read_csv("SPY500_Prices.csv", index_col = "date").sort_index()
price

In [None]:
# Distinct values of the second index level (position 1) of df_news
ticker = list(set(df_news.index.get_level_values(1)))


# <font color="chillipepper">**Descriptive Statistics**</font>


### Content length 


In [None]:
# Repartition of the content lengths (characters)
%matplotlib inline
df_news.text.drop_duplicates().apply(len).hist(bins=40)

The headlines are very short compared with the texts we usually work on in the practical sessions (TP).

### Lemmatization with WordnetLemmatizer, Spacy or PorterStemmer


In [None]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is called by clean_string's "Spacy" branch.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): Lemmatizer and Lookups are not referenced in the visible cells, and
# `spacy.lemmatizer` may not exist in recent spaCy releases — confirm these imports are needed.
from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Lookups

# Fetch the NLTK stop-word corpus that clean_string reads.
nltk.download("stopwords")

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, read from disk only once."""
    return frozenset(nltk.corpus.stopwords.words("english"))


def clean_string(text, stem="None"):
    """Normalize a raw sentence into a space-separated string of cleaned tokens.

    Steps, in order: lower-case, replace line breaks by spaces, delete
    punctuation, drop English stop words, drop every token that contains a
    digit, strip any remaining non-alphanumeric characters, then optionally
    stem or lemmatize.

    Parameters
    ----------
    text : str
        The raw sentence.
    stem : str, default "None"
        "Stem"  -> nltk PorterStemmer,
        "Lem"   -> nltk WordNetLemmatizer,
        "Spacy" -> lemmas from the module-level spaCy pipeline `nlp`,
        any other value -> tokens are returned as they are.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    # Lower-case, and turn line breaks into spaces: deleting them outright
    # would glue the last word of a line to the first word of the next one.
    text = re.sub(r'\n', ' ', text.lower())

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove stop words (set membership; the list is loaded once, not per call)
    stop_words = _english_stopwords()
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove numbers: any token containing a digit is blanked out entirely
    tokens = [re.sub(r'\w*\d\w*', '', w) for w in tokens]

    # Remove anything else that is not an ASCII letter or digit, then drop empties
    tokens = [re.sub('[^A-Za-z0-9]+', '', w) for w in tokens]
    tokens = [w for w in tokens if w != ""]

    # Stem or lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(w) for w in tokens]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        tokens = [lem.lemmatize(w) for w in tokens]
    elif stem == 'Spacy':
        # spaCy re-tokenizes the joined string; keep the lemma of each token
        tokens = [tok.lemma_ for tok in nlp(' '.join(tokens))]

    return ' '.join(tokens)

In [None]:
# Add a spaCy-lemmatized version of every headline next to the raw text
df_news["text_clean"] = df_news["text"].apply(clean_string, stem="Spacy")
df_news["text_clean"]

### Most common words


In [None]:
# Flatten the cleaned headlines into a single list of tokens (empty strings dropped)
allwords = [word for s in df_news["text_clean"] for word in s.split(" ") if word != ""]
mostcommon = FreqDist(allwords).most_common(100)

# Size each word by its actual count. Feeding str(mostcommon) to generate()
# would discard the counts, so the cloud would not reflect word frequency.
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(dict(mostcommon))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Most Common Words', fontsize=100)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Bar chart of the 25 most frequent tokens of the cleaned headlines
mostcommon_small = FreqDist(allwords).most_common(25)
x, y = zip(*mostcommon_small)

plt.figure(figsize=(50, 30))
ax = plt.gca()
ax.margins(0.02)
ax.bar(x, y)
ax.set_xlabel('Words', fontsize=50)
ax.set_ylabel('Frequency of Words', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
ax.set_title('Frequency of 25 Most Common Words', fontsize=60)
plt.show()

We can see that the most frequent words do not necessarily express a feeling, in other words they are probably not very polarized. Nevertheless we have a good frequency of buy and sell words which are often associated with buy/sell signals

### Bigram/Trigram

In [None]:
# Count every bigram and trigram of the cleaned headlines
c_vec = CountVectorizer(ngram_range=(2, 3), stop_words="english")
ngrams = c_vec.fit_transform(df_news["text_clean"])
# Column totals: one count per n-gram
count_values = ngrams.sum(axis=0)
# Mapping n-gram -> column position
vocab = c_vec.vocabulary_

# (count, n-gram) pairs, largest count first
ngram_counts = [(count_values[0, col], term) for term, col in vocab.items()]
ngram_counts.sort(reverse=True)
df_ngram = pd.DataFrame(ngram_counts).rename(columns={0: 'frequency', 1: 'bigram/trigram'})
df_ngram.head(10)

The bigrams and trigrams show nothing interesting for the most frequent of them

### To try to see if the training on a pre-labeled base is relevant, we perform the previous statistics on the pre-labeled base


In [None]:
# Clean the labeled sentences with the same spaCy lemmatization as the headlines
data_label["text_clean"] = data_label["text"].apply(clean_string, stem="Spacy")
data_label["text_clean"]

In [None]:
# Repartition of the content lengths (characters)
%matplotlib inline
data_label["text_clean"].drop_duplicates().apply(len).hist(bins=40)

In [None]:
# Flatten the cleaned labeled sentences into a single list of tokens (empty strings dropped)
allwords = [word for s in data_label["text_clean"] for word in s.split(" ") if word != ""]
mostcommon = FreqDist(allwords).most_common(100)

# Size each word by its actual count. Feeding str(mostcommon) to generate()
# would discard the counts, so the cloud would not reflect word frequency.
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate_from_frequencies(dict(mostcommon))
fig = plt.figure(figsize=(30,10), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.title('Top 100 Most Common Words', fontsize=100)
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Bar chart of the 25 most frequent tokens of the labeled sentences
mostcommon_small = FreqDist(allwords).most_common(25)
x, y = zip(*mostcommon_small)

plt.figure(figsize=(50, 30))
ax = plt.gca()
ax.margins(0.02)
ax.bar(x, y)
ax.set_xlabel('Words', fontsize=50)
ax.set_ylabel('Frequency of Words', fontsize=50)
plt.yticks(fontsize=40)
plt.xticks(rotation=60, fontsize=40)
ax.set_title('Frequency of 25 Most Common Words', fontsize=60)
plt.show()

In [None]:
# Count every bigram and trigram of the cleaned labeled sentences
c_vec = CountVectorizer(ngram_range=(2, 3), stop_words="english")
ngrams = c_vec.fit_transform(data_label["text_clean"])
# Column totals: one count per n-gram
count_values = ngrams.sum(axis=0)
# Mapping n-gram -> column position
vocab = c_vec.vocabulary_

# (count, n-gram) pairs, largest count first
ngram_counts = [(count_values[0, col], term) for term, col in vocab.items()]
ngram_counts.sort(reverse=True)
df_ngram = pd.DataFrame(ngram_counts).rename(columns={0: 'frequency', 1: 'bigram/trigram'})
df_ngram.head(10)

We already notice a better quality of the data:

1.   The size of the sentences is more reasonable
2.   The most frequent words are more meaningful than those of the headlines we retrieved

This is quite problematic, since we want to train a model on one dataset in order to classify the other.










# <font color="chillipepper">**Baseline : Naive Bayes Model on prelabeled dataset**</font>


### We use previously lemmatized dataset ``` data_label ```



### Multinomial Naive Bayes model with Count Vectorizer

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split of the cleaned labeled sentences. The fixed
# random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data_label["text_clean"],
    data_label["label"],
    train_size=0.7,
    stratify=data_label["label"],
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Binary unigram features (a word is either present or absent), English stop words removed
c_vec = CountVectorizer(ngram_range=(1, 1), stop_words="english", binary=True)
X_train_count = c_vec.fit_transform(X_train).toarray()

# Multinomial Naive Bayes with alpha = 1
NB1 = MultinomialNB(alpha=1).fit(X_train_count, y_train)
NB1

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

X_test_count = c_vec.transform(X_test).toarray()
y_pred = NB1.predict(X_test_count)

# Take the row names from the classifier itself and pass the same list as
# `labels` and `target_names`, so each row of the report is guaranteed to
# carry the name of the class it measures. A hand-written list paired with
# np.unique(y_pred) (alphabetical order) mislabels the rows, and breaks
# when one class is never predicted.
names = [str(c) for c in NB1.classes_]
print(classification_report(np.asarray(y_test).ravel(), np.asarray(y_pred).ravel(), labels=names, target_names=names))


### Multinomial Naive Bayes model with Tf-idf Vectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB

# TF-IDF weighted unigram features (binary term counts), English stop words removed
tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), stop_words="english", binary=True)
X_train_tfidf = tfidf_vec.fit_transform(X_train).toarray()

# Multinomial Naive Bayes with alpha = 1
NB2 = MultinomialNB(alpha=1).fit(X_train_tfidf, y_train)
NB2

In [None]:
X_test_tfidf = tfidf_vec.transform(X_test).toarray()
# Evaluate NB2, the model fitted on the TF-IDF features. Predicting with NB1
# here would score the count-based model on features it was not trained on.
y_pred = NB2.predict(X_test_tfidf)

# Row names come from the classifier and are used for both `labels` and
# `target_names`, so each row carries the name of the class it measures.
names = [str(c) for c in NB2.classes_]
print(classification_report(np.asarray(y_test).ravel(), np.asarray(y_pred).ravel(), labels=names, target_names=names))



# <font color="chillipepper">**Training of labeled dataset : Naive Bayes within SVM**</font>

https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf

There is a ready-made module for naive bayes within svm : `nbsvm` but we won't use it here

In [None]:
# Binary presence features over unigrams and bigrams
vectorizer = CountVectorizer(ngram_range=(1, 2), binary=True)

# Fit the vocabulary on the training sentences, then reuse it for the test sentences
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

vocab = vectorizer.get_feature_names_out()

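We calculate the log-count ratio $r = \log\dfrac{p / \lVert p \rVert_1}{q / \lVert q \rVert_1}$, where $p$ and $q$ are the smoothed feature counts of the positive and negative training sentences.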

In [None]:
# Select the training rows of each polarity by position. X_train_vec was built
# from X_train, which comes from the same split as y_train, so a positional
# mask over y_train lines up with its rows. (Taking `.index` of a comparison
# result returns every row, not only the rows where the comparison is True.)
y_train_values = y_train.to_numpy()
X_train_vec_pos = X_train_vec[np.flatnonzero(y_train_values == "positive"), :]
X_train_vec_neg = X_train_vec[np.flatnonzero(y_train_values == "negative"), :]

In [None]:
# Log-count ratio with add-one smoothing:
#   R = log( (p / ||p||_1) / (q / ||q||_1) )
# with p = 1 + per-feature counts in the positive rows and q likewise for the
# negative rows. Summing the smoothed vector gives total count + number of features.
# Each ratio is parenthesised explicitly: a bare chain a / b / c / d is
# evaluated left to right, i.e. as a / (b * c * d).
pos_counts = np.asarray(X_train_vec_pos.sum(axis=0)).ravel() + 1
neg_counts = np.asarray(X_train_vec_neg.sum(axis=0)).ravel() + 1
R = np.log((pos_counts / pos_counts.sum()) / (neg_counts / neg_counts.sum()))
R

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Scale every n-gram indicator by its log-count ratio, then fit a linear SVM on the scaled features
x_nb = X_train_vec.multiply(R)
nbsvm = LinearSVC().fit(x_nb, y_train) #Naive Bayes is an input feature

y_pred = nbsvm.predict(X_test_vec.multiply(R))

# Row names come from the classifier and are used for both `labels` and
# `target_names`, so each row of the report carries the name of the class it
# measures (a hand-written list paired with np.unique(y_pred) mislabels them).
names = [str(c) for c in nbsvm.classes_]
print(classification_report(np.asarray(y_test).ravel(), np.asarray(y_pred).ravel(), labels=names, target_names=names))

We will now focus on the application of these models to our unlabeled data


# <font color="chillipepper">**Pretrained FinBERT model**</font>



https://github.com/yya518/FinBERT

In [None]:
# Install the transformers package, which provides the BertTokenizer and
# BertForSequenceClassification classes imported in the next cell.
pip install transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np

# Pretrained three-class FinBERT checkpoint and its tokenizer
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# A few example sentences to try the model on
sentences = ["there is a shortage of capital, and we need extra financing", 
             "growth is strong and we have plenty of liquidity", 
             "there are doubts about our finances", 
             "profits are flat"]

# Encode the whole batch at once (padded to a common length) and keep the first model output
inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = finbert(**inputs)[0]

# Print, for each sentence, the label whose score is the largest
labels = {0:'neutral', 1:'positive',2:'negative'}
scores = outputs.detach().numpy()
for sent, row in zip(sentences, scores):
    print(sent, '----', labels[np.argmax(row)])


# <font color="chillipepper">**Results**</font>



In [None]:
# Display the headlines frame (raw "text" plus the "text_clean" column added earlier)
df_news

In [None]:
# Classify the cleaned headlines with NB1, using the vocabulary fitted for it.
# The sparse matrix is passed as is: MultinomialNB accepts sparse input, and
# densifying the whole headline matrix with .toarray() only costs memory.
# NOTE: `c_vec` is reused by several cells; this relies on the NB1 cell being the last to assign it.
sentiment_nb1 = NB1.predict(c_vec.transform(df_news["text_clean"]))

In [None]:
# Classify the cleaned headlines with NB2 on TF-IDF features. The sparse
# matrix is passed as is: MultinomialNB accepts sparse input, and densifying
# the whole headline matrix with .toarray() only costs memory.
sentiment_nb2 = NB2.predict(tfidf_vec.transform(df_news["text_clean"]))

In [None]:
# Scale the headline n-gram indicators by R, as was done for training, then classify with the SVM
news_nb_features = vectorizer.transform(df_news["text_clean"]).multiply(R)
sentiment_nbsvm = nbsvm.predict(news_nb_features)

In [None]:
labels = {0:'neutral', 1:'positive',2:'negative'}
def label_bert_ouput(x):
  """Return the label whose position in `x` holds the largest score.

  Ties go to the earliest position; an empty `x` falls back to position 0.
  """
  best = max(range(len(x)), key=lambda i: x[i], default=0)
  return labels[best]

In [None]:
### Very long : ~ 1 hour

from tqdm import tqdm
sentences = df_news["text"].tolist()
sentiment_finbert = []
# Classify each headline in turn. The tokenizer must receive the current
# sentence: encoding sentences[0] on every pass gives every row the label of
# the first headline.
for sentence in tqdm(sentences):
  encoded = tokenizer(sentence, return_tensors="pt", padding=True)
  # First model output, first (and only) row of the batch
  scores = finbert(**encoded)[0].detach().numpy()[0]
  sentiment_finbert.append(label_bert_ouput(scores))

Histogram of classification

In [None]:
# Number of headlines assigned to each class by NB1 (count-vectorizer Naive Bayes)
pd.Series(sentiment_nb1).value_counts()

In [None]:
# Number of headlines assigned to each class by NB2 (TF-IDF Naive Bayes)
pd.Series(sentiment_nb2).value_counts()

In [None]:
# Number of headlines assigned to each class by the Naive Bayes + linear SVM model
pd.Series(sentiment_nbsvm).value_counts()

In [None]:
# Number of headlines assigned to each class by FinBERT
pd.Series(sentiment_finbert).value_counts()

# We clearly have too few polarized headlines to capture anything. This is particularly visible with the FinBERT model, which does not classify any of our news headlines as positive or negative.