In [16]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
import re
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

Feature Engineering:

Truth is, this dataset is relatively small. Thus we try to create numerical features from NLP which may be simpler, interpretable, and data-efficient. We may be missing out on some subtleties, but training deep NLP on limited data is likely to overfit. We start with simple feature additions: 

In [17]:
data=pd.read_csv('headlinesNLPdata.csv')

In [18]:
#Adding headline length & word count
data['headline_length']=data['Headlines'].apply(len)
data['word_count']=data['Headlines'].apply(lambda x: len(x.split()))
data.head()

Unnamed: 0.1,Unnamed: 0,Headlines,ticker,date,return,direction,headline_length,word_count
0,13956,Virgin Galactic and Upwork among industrial g...,DSS,2020-07-09 01:01:00,-0.08875,0,78,11
1,13955,"DNJR leads financial gainers, BYFC and BSBK a...",BTBT,2020-07-09 01:07:00,-0.243243,0,58,9
2,13951,Comscore renews agreement for ondemand essent...,SCOR,2020-07-09 01:20:00,0.066434,1,65,8
3,13950,Notable earnings before Friday's open,GBX,2020-07-09 01:21:00,0.155157,1,39,5
4,13949,Aramark's self-guided convenience store recei...,ARMK,2020-07-09 01:21:00,0.040924,1,78,9


Adding more features here: headline length and word count. 

In [19]:
#Adding date based features
data['date']=pd.to_datetime(data['date'],errors='coerce')
data['day_of_week']=data['date'].dt.dayofweek #Monday=0, Sunday=6
day_dummies=pd.get_dummies(data['day_of_week'],prefix='day').astype(int) #need to convert to dummy variables for sake of NLP
data=pd.concat([data,day_dummies],axis=1)
data=data.drop('day_of_week',axis=1)
data.head()

Unnamed: 0.1,Unnamed: 0,Headlines,ticker,date,return,direction,headline_length,word_count,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,13956,Virgin Galactic and Upwork among industrial g...,DSS,2020-07-09 01:01:00,-0.08875,0,78,11,0,0,0,1,0,0,0
1,13955,"DNJR leads financial gainers, BYFC and BSBK a...",BTBT,2020-07-09 01:07:00,-0.243243,0,58,9,0,0,0,1,0,0,0
2,13951,Comscore renews agreement for ondemand essent...,SCOR,2020-07-09 01:20:00,0.066434,1,65,8,0,0,0,1,0,0,0
3,13950,Notable earnings before Friday's open,GBX,2020-07-09 01:21:00,0.155157,1,39,5,0,0,0,1,0,0,0
4,13949,Aramark's self-guided convenience store recei...,ARMK,2020-07-09 01:21:00,0.040924,1,78,9,0,0,0,1,0,0,0


Again, adding another feature: day of the week, which may impact the movement of the stock. We'll note the conversion to dummy variables, considering that Monday = 0 and Sunday = 6 carries no meaning numerically (True/False -> 1/0).

NLP stuff:

In [20]:
analyzer=SentimentIntensityAnalyzer()
data[['neg_sentiment', 'neu_sentiment', 'pos_sentiment', 'compound_sentiment']] = data['Headlines'].apply(lambda x: pd.Series(analyzer.polarity_scores(x)))
data.head()

Unnamed: 0.1,Unnamed: 0,Headlines,ticker,date,return,direction,headline_length,word_count,day_0,day_1,day_2,day_3,day_4,day_5,day_6,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment
0,13956,Virgin Galactic and Upwork among industrial g...,DSS,2020-07-09 01:01:00,-0.08875,0,78,11,0,0,0,1,0,0,0,0.254,0.746,0.0,-0.5267
1,13955,"DNJR leads financial gainers, BYFC and BSBK a...",BTBT,2020-07-09 01:07:00,-0.243243,0,58,9,0,0,0,1,0,0,0,0.298,0.702,0.0,-0.5267
2,13951,Comscore renews agreement for ondemand essent...,SCOR,2020-07-09 01:20:00,0.066434,1,65,8,0,0,0,1,0,0,0,0.0,0.686,0.314,0.4939
3,13950,Notable earnings before Friday's open,GBX,2020-07-09 01:21:00,0.155157,1,39,5,0,0,0,1,0,0,0,0.0,1.0,0.0,0.0
4,13949,Aramark's self-guided convenience store recei...,ARMK,2020-07-09 01:21:00,0.040924,1,78,9,0,0,0,1,0,0,0,0.0,0.69,0.31,0.5574


The 'compound sentiment' here from VADER summarizes the overall sentiment (positive sentiment is positive numerically, negative sentiment is negative numerically, 0 is neutral). We also include the negative, neutral, and positive sentiment scores for additional features. 

In [21]:
#now doing the same thing but with "FinBERT", which is more financially geared sentiment scorer for text
tokenizer=AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone") #loading FinBERT model and tokenizer
model=AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
finbert_pipeline=pipeline("sentiment-analysis",model=model,tokenizer=tokenizer,return_all_scores=True) #pipeline for sentiment analysis
#function to get FinBERT probabilities for one headline
def finbert_probs(text): 
    result=finbert_pipeline(text)[0]
    scores={f"finbert_{r['label'].lower()}": r['score'] for r in result}
    return pd.Series(scores)

data[['finbert_neg','finbert_neu','finbert_pos']]=data['Headlines'].apply(finbert_probs) #apply to all data

data.head()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Unnamed: 0.1,Unnamed: 0,Headlines,ticker,date,return,direction,headline_length,word_count,day_0,day_1,...,day_4,day_5,day_6,neg_sentiment,neu_sentiment,pos_sentiment,compound_sentiment,finbert_neg,finbert_neu,finbert_pos
0,13956,Virgin Galactic and Upwork among industrial g...,DSS,2020-07-09 01:01:00,-0.08875,0,78,11,0,0,...,0,0,0,0.254,0.746,0.0,-0.5267,0.9989373,0.000611,0.0004511559
1,13955,"DNJR leads financial gainers, BYFC and BSBK a...",BTBT,2020-07-09 01:07:00,-0.243243,0,58,9,0,0,...,0,0,0,0.298,0.702,0.0,-0.5267,0.9992293,0.000104,0.0006671073
2,13951,Comscore renews agreement for ondemand essent...,SCOR,2020-07-09 01:20:00,0.066434,1,65,8,0,0,...,0,0,0,0.0,0.686,0.314,0.4939,0.9998584,0.00012,2.139914e-05
3,13950,Notable earnings before Friday's open,GBX,2020-07-09 01:21:00,0.155157,1,39,5,0,0,...,0,0,0,0.0,1.0,0.0,0.0,0.9999824,4e-06,1.326873e-05
4,13949,Aramark's self-guided convenience store recei...,ARMK,2020-07-09 01:21:00,0.040924,1,78,9,0,0,...,0,0,0,0.0,0.69,0.31,0.5574,2.842021e-08,1.0,9.325699e-09


Similar to VADER step, but VADER is much more general purpose whereas FinBERT is more geared towards finance. It produces three numeric features here (per headline) that capture the full sentiment distribution (positive, neutral, or negative with the total of the scores summing to 1). 

In [22]:
#now doing some TFIDF stuff - extracting numerical features from headline text by converting important words into TFIDF scores
tfidf=TfidfVectorizer(max_features=100,stop_words='english') #limiting to the top 100 words, and not including stop words which might dominate in terms of frequency
X_tfidf=tfidf.fit_transform(data['Headlines']) #convert each headline into a vector of TFIDF scores corresponding to top 100 words
tfidf_df=pd.DataFrame(X_tfidf.toarray(),columns=tfidf.get_feature_names_out(),index=data.index)
data=pd.concat([data,tfidf_df],axis=1)

#display these new columns, putting most important words at the front
tfidf_cols=tfidf_df.columns
avg_tfidf=tfidf_df[tfidf_cols].mean().sort_values(ascending=False)
top_words=avg_tfidf.index.tolist()
display_cols=['Headlines'] + top_words[:10]
data[display_cols].head(10)

Unnamed: 0,Headlines,beats,revenue,eps,q2,dividend,misses,declares,results,reports,new
0,Virgin Galactic and Upwork among industrial g...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"DNJR leads financial gainers, BYFC and BSBK a...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Comscore renews agreement for ondemand essent...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Notable earnings before Friday's open,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aramark's self-guided convenience store recei...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Keurig Dr Pepper tapped to pour out a beat-an...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,AMC Networks +5% on reported sale evaluation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Intact Financial announces catastrophe Q2 los...,0.0,0.0,0.0,0.329633,0.0,0.0,0.0,0.0,0.0,0.0
8,Assurant declares $0.63 dividend,0.0,0.0,0.0,0.0,0.686487,0.0,0.727142,0.0,0.0,0.0
9,NICE upgraded to Buy on pandemic tailwinds,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now we have 100 new columns with TF-IDF word scores for each headline. As expected, lots of 0s across the board, but this is completely normal considering that are our dataset has lots of different words. Reminder that TF-IDF = Term frequency (how often word appears normalized by length) x Inverse Document frequency (how "rare" a word is). 

In [23]:
#Minor cleaning
print(data.columns)
data=data.drop(labels=['Unnamed: 0','Headlines', 'ticker','date'], axis='columns')
data.head()

Index(['Unnamed: 0', 'Headlines', 'ticker', 'date', 'return', 'direction',
       'headline_length', 'word_count', 'day_0', 'day_1',
       ...
       'strong', 'study', 'systems', 'target', 'tech', 'technologies',
       'therapeutics', 'trust', 'vaccine', 'year'],
      dtype='object', length=122)


Unnamed: 0,return,direction,headline_length,word_count,day_0,day_1,day_2,day_3,day_4,day_5,...,strong,study,systems,target,tech,technologies,therapeutics,trust,vaccine,year
0,-0.08875,0,78,11,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.243243,0,58,9,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.066434,1,65,8,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.155157,1,39,5,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.040924,1,78,9,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Checking if we have any remaining categorical data features:

In [24]:
#Checking if we have categorical data
print(data.dtypes)
print(data.select_dtypes(include=['object','category']).head())

return             float64
direction            int64
headline_length      int64
word_count           int64
day_0                int64
                    ...   
technologies       float64
therapeutics       float64
trust              float64
vaccine            float64
year               float64
Length: 118, dtype: object
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [25]:
#save data
data.to_csv('headlinesNLPdata.csv')