# Sentiment Analysis for Predicting Stock Market Movements using News Headlines

# Data Preparation and Preproccesing

### Library Import

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tqdm
import yfinance as yf
from tqdm import tqdm

In [2]:
#Text cleaning
import contractions
import re
import string

In [3]:
#Text pre-procesing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#PoS Tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Data import

In [4]:
df = pd.read_csv('dataset/Combined_News_DJIA.csv', encoding = "ISO-8859-1", parse_dates=["Date"])

In [5]:
#Get the name of the headline columns
cols = []
for i in range(1,26):
    col = ("Top{}".format(i))
    cols.append(col)

### Extracting Stock data DJIA

In [6]:
# Set the start and end date
start_date = '2008-07-15'
end_date = '2016-07-02'
tkr_djia ='^DJI'

In [7]:
# Get the data
DJIA = yf.download(tkr_djia, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [8]:
df_djia=pd.DataFrame(DJIA)
df_djia = df_djia.reset_index()
df_djia = df_djia.sort_values(by=['Date'], ascending=False,ignore_index=True)

In [9]:
df_djia['Date'] = df_djia['Date'].dt.date
df_djia['Date'] = pd.to_datetime(df_djia['Date'])

In [10]:
df_djia['Next_1_Adj_Close'] = df_djia['Adj Close'].shift(-1)
df_djia['Next_2_Adj_Close'] = df_djia['Adj Close'].shift(-2)
df_djia['Next_3_Adj_Close'] = df_djia['Adj Close'].shift(-3)
df_djia['Next_4_Adj_Close'] = df_djia['Adj Close'].shift(-4)
df_djia['Next_5_Adj_Close'] = df_djia['Adj Close'].shift(-5)

In [11]:
df_djia['Label_1day'] = np.where(df_djia['Next_1_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_2day'] = np.where(df_djia['Next_2_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_3day'] = np.where(df_djia['Next_3_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_4day'] = np.where(df_djia['Next_4_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_5day'] = np.where(df_djia['Next_5_Adj_Close'] >= df_djia['Adj Close'], 1, 0)

In [12]:
df_djia.to_csv('dataset/upload_DJIA_table.csv',sep=',', encoding='utf-8',index=False) 

### Data cleaning

In [13]:
df.isna().sum()

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    1
Top24    3
Top25    3
dtype: int64

In [14]:
df = df.replace(np.nan, 'no news')

In [15]:
# Create a new column that contains all the headlines from Top1 to Top25
df["news"] = df.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1)

In [16]:
def txt_cleaning(text):
    # Remove the HTML tags    
    text = re.sub('b\"|b\'|\\\\|\\\"', '', text)
    # Remove non ASCII
    text = text.encode("ascii", errors="ignore").decode()
    # Remove any punctuation
    text = text.translate(text.maketrans('', '', string.punctuation))
    # Remove any extra whitespace    
    text = re.sub('\s+', ' ', text)
    # Change US to usa (in this way it is not confused with the pronoun us)
    text = re.sub(r'US', 'usa', text)
    # Convert to lowercase
    text = text.lower() 
    # Chage to the abbrevation    
    text = re.sub(r"united states of america", "usa", text)
    # Chage to the abbrevation 
    text = re.sub(r"america", "usa", text)
    # Remove contractions 
    text = contractions.fix(text)
    #Remove possessive noun
    text = text.replace("'s", "")
    # Remove any HTML tags
    text = re.sub(r'<.*?>', '', text)  
    # Remove numbers 
    text = re.sub(r'\d+', '', text)
    # Remove any special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
      
    return text

In [17]:
df['clean_news'] = df['news'].apply(lambda x: txt_cleaning(x))

In [18]:
for col in tqdm(cols):
    df[col] = df[col].apply(lambda x: txt_cleaning(x)) 

100%|██████████| 25/25 [00:03<00:00,  7.40it/s]


### Text Pre-processing

In [19]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Tokenization
df['tokenized'] = df['clean_news'].apply(lambda x: word_tokenize(x))

In [21]:
#Tokenize stopwords removal
stop_words = set(stopwords.words('english'))
df['news_without_stopwords'] = df['tokenized'].apply(lambda words: [word for word in words if word not in stop_words])

In [22]:
# Stemming
stemmer = PorterStemmer()
df['news_stemmed'] = df['news_without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

In [23]:
#PoS
df['news_pos'] = df['news_without_stopwords'].apply(lambda x: nltk.pos_tag(x))

In [24]:
#Lemma without PoS
lem = WordNetLemmatizer()
df['news_lemmatized'] = df['news_without_stopwords'].apply(lambda words: [lem.lemmatize(word) for word in words])

In [25]:
#Lemma with PoS
lemma_list = []

for words in tqdm(df['news_pos']):
    tmp=[]
    for lemma, pos in words:
        if pos.startswith("NN"):
            tmp.append(lem.lemmatize(lemma, pos='n'))
        elif pos.startswith('VB'):
            tmp.append(lem.lemmatize(lemma, pos='v'))
        elif pos.startswith('JJ'):
            tmp.append(lem.lemmatize(lemma, pos='a'))
        elif pos.startswith('R'):
            tmp.append(lem.lemmatize(lemma, pos='r'))
        else:
            tmp.append(lem.lemmatize(lemma))
            
    lemma_list.append(tmp)
    
df['news_lemmatized_pos'] = lemma_list

100%|██████████| 1989/1989 [00:06<00:00, 327.73it/s]


### Feature Selection 

In [26]:
from ast import literal_eval
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_validate

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [28]:
attributes = df.columns[30:].to_list()
attributes

['news_without_stopwords',
 'news_stemmed',
 'news_pos',
 'news_lemmatized',
 'news_lemmatized_pos']

In [29]:
def mean_array(arr):
    return np.mean(arr)

In [30]:
def model_classifier(model):
    cv_scores = {}
    
    for att in tqdm(attributes):                
        X_train = train_data[att]

        cv = CountVectorizer(analyzer=lambda x: x) 
        X_train_tok = cv.fit_transform(X_train)

        tfidf = TfidfTransformer() 
        X_train_vec = tfidf.fit_transform(X_train_tok)

        ovr = OneVsRestClassifier(model)           
        ovr.fit(X_train_vec, y_train)    

        cv_scores[att] = cross_validate(ovr, X_train_tok, y_train, cv=10, scoring=scoring) 
        
    
    return  cv_scores

In [31]:
scoring = ['f1_macro', 'f1_micro', 'f1_weighted', 'roc_auc', 'accuracy']

In [32]:
train_data = df[df['Date']<'20141231']
y_train = train_data.Label

In [33]:
model = LogisticRegression()
cs = model_classifier(model)
cscores_log = pd.DataFrame(cs)
cscores_log = cscores_log.applymap(mean_array)
cscores_log

100%|██████████| 5/5 [01:05<00:00, 13.14s/it]


Unnamed: 0,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos
fit_time,1.187515,1.000963,1.391213,1.179726,1.157479
score_time,0.01254,0.008991,0.011193,0.009884,0.00956
test_f1_macro,0.499103,0.493788,0.502349,0.489184,0.493481
test_f1_micro,0.508075,0.501863,0.513665,0.498758,0.501242
test_f1_weighted,0.504298,0.498304,0.508189,0.494469,0.497888
test_roc_auc,0.495094,0.504634,0.486561,0.49572,0.504232
test_accuracy,0.508075,0.501863,0.513665,0.498758,0.501242


In [34]:
model = XGBClassifier()
cs = model_classifier(model)
cscores_xg = pd.DataFrame(cs)
cscores_xg = cscores_xg.applymap(mean_array)
cscores_xg

100%|██████████| 5/5 [05:25<00:00, 65.15s/it]


Unnamed: 0,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos
fit_time,4.537685,4.244043,6.088741,5.217611,5.060859
score_time,0.037822,0.028347,0.042194,0.033515,0.038786
test_f1_macro,0.474031,0.500998,0.462986,0.487102,0.488516
test_f1_micro,0.487578,0.514286,0.479503,0.501242,0.503727
test_f1_weighted,0.480928,0.507604,0.470529,0.494182,0.495768
test_roc_auc,0.466076,0.494539,0.460229,0.477436,0.480608
test_accuracy,0.487578,0.514286,0.479503,0.501242,0.503727


In [35]:
model = MultinomialNB()
cs = model_classifier(model)
cscores_gau = pd.DataFrame(cs)
cscores_gau = cscores_gau.applymap(mean_array)
cscores_gau

100%|██████████| 5/5 [00:05<00:00,  1.16s/it]


Unnamed: 0,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos
fit_time,0.016829,0.014626,0.020447,0.01705,0.019065
score_time,0.012606,0.011076,0.011963,0.017337,0.008107
test_f1_macro,0.445617,0.468964,0.45186,0.455696,0.461272
test_f1_micro,0.47205,0.490062,0.481366,0.481988,0.486957
test_f1_weighted,0.454947,0.47726,0.462298,0.464867,0.470717
test_roc_auc,0.459224,0.464744,0.456236,0.459602,0.462977
test_accuracy,0.47205,0.490062,0.481366,0.481988,0.486957


In [36]:
model = SVC()
cs = model_classifier(model)
cscores_svm = pd.DataFrame(cs)
cscores_svm = cscores_svm.applymap(mean_array)
cscores_svm

100%|██████████| 5/5 [09:10<00:00, 110.14s/it]


Unnamed: 0,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos
fit_time,8.374074,8.07228,8.335169,7.964488,7.84604
score_time,1.885935,1.752652,1.881442,1.749889,1.730186
test_f1_macro,0.419758,0.456773,0.374995,0.432975,0.434538
test_f1_micro,0.531677,0.538509,0.520497,0.528571,0.526708
test_f1_weighted,0.441216,0.474409,0.400429,0.452511,0.453676
test_roc_auc,0.500171,0.511694,0.494713,0.506065,0.513814
test_accuracy,0.531677,0.538509,0.520497,0.528571,0.526708


In [37]:
model = RandomForestClassifier()
cs = model_classifier(model)
cscores_rf = pd.DataFrame(cs)
cscores_rf = cscores_rf.applymap(mean_array)
cscores_rf

100%|██████████| 5/5 [08:19<00:00, 99.83s/it] 


Unnamed: 0,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos
fit_time,9.231701,7.678864,11.951187,8.359783,8.116455
score_time,0.107499,0.095057,0.09881,0.091508,0.0894
test_f1_macro,0.467093,0.45324,0.430672,0.459037,0.448368
test_f1_micro,0.524224,0.51677,0.508696,0.521118,0.519876
test_f1_weighted,0.48169,0.468926,0.44831,0.474363,0.464869
test_roc_auc,0.509299,0.494789,0.478015,0.485604,0.466545
test_accuracy,0.524224,0.51677,0.508696,0.521118,0.519876


## Data pre processing to headlines Top1 to Top25 

In [38]:
for col in tqdm(cols):
    # Tokenize
    df[col] = df[col].apply(lambda x: word_tokenize(x))     
    # Remove stopwords
    df[col] = df[col].apply(lambda words: [word for word in words if word not in stop_words]) 
    # Stemming
    df[col] = df[col].apply(lambda x: [stemmer.stem(word) for word in x])    

100%|██████████| 25/25 [00:47<00:00,  1.89s/it]


## Merging with stock data 

In [39]:
df_labels=df_djia[['Date','Label_1day','Label_2day','Label_3day','Label_4day','Label_5day']].copy()

In [40]:
df = pd.merge(df,df_labels,on='Date')

In [41]:
df.drop(['tokenized','news_without_stopwords','news_pos', 'news_lemmatized','news_lemmatized_pos'], axis=1, inplace=True)  

In [42]:
df.to_csv('dataset/pre_process_all_news_days.csv',sep=',', encoding='utf-8',index=False) 

In [43]:
df_small = df[['Date','clean_news','news_stemmed','Label','Label_1day',
       'Label_2day', 'Label_3day', 'Label_4day', 'Label_5day']].copy()

In [44]:
df_small.to_csv('dataset/pre_process_news_days.csv',sep=',', encoding='utf-8',index=False) 