## Sentiment Analysis for Predicting Stock Market Movements Preprocess

### Library Import

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tqdm
import yfinance as yf
from tqdm import tqdm

In [2]:
#Text cleaning
import contractions
import re
import string

In [3]:
#Text pre-procesing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#PoS Tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### Data import

In [4]:
df = pd.read_csv('dataset/Combined_News_DJIA.csv', encoding = "ISO-8859-1", parse_dates=["Date"])

In [5]:
# Set the start and end date
start_date = '2008-07-15'
end_date = '2016-07-02'
tkr_djia ='^DJI'

In [6]:
# Get the data
DJIA = yf.download(tkr_djia, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


## Feature engineering

In [7]:
df_djia=pd.DataFrame(DJIA)
df_djia = df_djia.reset_index()
df_djia = df_djia.sort_values(by=['Date'], ascending=False,ignore_index=True)

In [8]:
df_djia['Date'] = df_djia['Date'].dt.date
df_djia['Date'] = pd.to_datetime(df_djia['Date'])

In [9]:
df_djia['Next_1_Adj_Close'] = df_djia['Adj Close'].shift(-1)
df_djia['Next_2_Adj_Close'] = df_djia['Adj Close'].shift(-2)
df_djia['Next_3_Adj_Close'] = df_djia['Adj Close'].shift(-3)
df_djia['Next_4_Adj_Close'] = df_djia['Adj Close'].shift(-4)
df_djia['Next_5_Adj_Close'] = df_djia['Adj Close'].shift(-5)

In [10]:
df_djia['Label_1day'] = np.where(df_djia['Next_1_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_2day'] = np.where(df_djia['Next_2_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_3day'] = np.where(df_djia['Next_3_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_4day'] = np.where(df_djia['Next_4_Adj_Close'] >= df_djia['Adj Close'], 1, 0)
df_djia['Label_5day'] = np.where(df_djia['Next_5_Adj_Close'] >= df_djia['Adj Close'], 1, 0)

In [11]:
df_djia.to_csv('dataset/upload_DJIA_table.csv',sep=',', encoding='utf-8',index=False) 

### Data cleaning

In [12]:
df = df.replace(np.nan, 'no news')

In [13]:
# Create a new column that contains all the headlines from Top0 to Top25
df["news"] = df.filter(regex=("Top.*")).apply(lambda x: ''.join(str(x.values)), axis=1)

In [14]:
#Get the name of the headline columns
cols = []
for i in range(1,26):
    col = ("Top{}".format(i))
    cols.append(col)

In [15]:
def txt_cleaning(text):
    # Remove the HTML tags    
    text = re.sub('b\"|b\'|\\\\|\\\"', '', text)
    # Remove non ASCII
    text = text.encode("ascii", errors="ignore").decode()
    # Remove any punctuation
    text = text.translate(text.maketrans('', '', string.punctuation))
    # Remove any extra whitespace    
    text = re.sub('\s+', ' ', text)
    # Change US to usa (in this way it is not confused with the pronoun us)
    text = re.sub(r'US', 'usa', text)
    # Convert to lowercase
    text = text.lower() 
    # Chage to the abbrevation    
    text = re.sub(r"united states of america", "usa", text)
    # Chage to the abbrevation 
    text = re.sub(r"america", "usa", text)
    # Remove contractions 
    text = contractions.fix(text)
    #Remove possessive noun
    text = text.replace("'s", "")
    # Remove any HTML tags
    text = re.sub(r'<.*?>', '', text)  
    # Remove numbers 
    text = re.sub(r'\d+', '', text)
    # Remove any special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
  
    
    return text

In [16]:
df['clean_news'] = df['news'].apply(lambda x: txt_cleaning(x))

In [17]:
for col in tqdm(cols):
    df[col] = df[col].apply(lambda x: txt_cleaning(x)) 

100%|██████████| 25/25 [00:03<00:00,  6.47it/s]


### Text Pre-processing

In [18]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deiro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Tokenization
df['tokenized'] = df['clean_news'].apply(lambda x: word_tokenize(x))

In [20]:
for col in tqdm(cols):
    df[col] = df[col].apply(lambda x: word_tokenize(x)) 

100%|██████████| 25/25 [00:12<00:00,  1.99it/s]


In [21]:
#Tokenize stopwords removal
stop_words = set(stopwords.words('english'))
df['news_without_stopwords'] = df['tokenized'].apply(lambda words: [word for word in words if word not in stop_words])

In [22]:
for col in tqdm(cols):
    df[col] = df[col].apply(lambda words: [word for word in words if word not in stop_words]) 

100%|██████████| 25/25 [00:00<00:00, 70.42it/s] 


In [23]:
# Stemming
stemmer = PorterStemmer()
df['news_stemmed'] = df['news_without_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

In [24]:
#PoS
df['news_pos'] = df['news_without_stopwords'].apply(lambda x: nltk.pos_tag(x))

In [25]:
for col in tqdm(cols):
    df[col] = df[col].apply(lambda x: nltk.pos_tag(x))

100%|██████████| 25/25 [01:39<00:00,  4.00s/it]


In [26]:
#Lemma without PoS
lem = WordNetLemmatizer()
df['news_lemmatized'] = df['news_without_stopwords'].apply(lambda words: [lem.lemmatize(word) for word in words])

In [27]:
#Lemma with PoS
lemma_list = []

for words in tqdm(df['news_pos']):
    tmp=[]
    for lemma, pos in words:
        if pos.startswith("NN"):
            tmp.append(lem.lemmatize(lemma, pos='n'))
        elif pos.startswith('VB'):
            tmp.append(lem.lemmatize(lemma, pos='v'))
        elif pos.startswith('JJ'):
            tmp.append(lem.lemmatize(lemma, pos='a'))
        elif pos.startswith('R'):
            tmp.append(lem.lemmatize(lemma, pos='r'))
        else:
            tmp.append(lem.lemmatize(lemma))
            
    lemma_list.append(tmp)
    
df['news_lemmatized_pos'] = lemma_list

100%|██████████| 1989/1989 [00:05<00:00, 379.51it/s]


In [28]:
for col in tqdm(cols):
    #Lemma with PoS
    lemma_list = []

    for words in df[col]:
        tmp=[]
        for lemma, pos in words:
            if pos.startswith("NN"):
                tmp.append(lem.lemmatize(lemma, pos='n'))
            elif pos.startswith('VB'):
                tmp.append(lem.lemmatize(lemma, pos='v'))
            elif pos.startswith('JJ'):
                tmp.append(lem.lemmatize(lemma, pos='a'))
            elif pos.startswith('R'):
                tmp.append(lem.lemmatize(lemma, pos='r'))
            else:
                tmp.append(lem.lemmatize(lemma))

        lemma_list.append(tmp)

    df[col] = lemma_list    

100%|██████████| 25/25 [00:04<00:00,  5.31it/s]


In [32]:
df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,news_without_stopwords,news_stemmed,news_pos,news_lemmatized,news_lemmatized_pos,Label_1day,Label_2day,Label_3day,Label_4day,Label_5day
0,2008-08-08,0,"[georgia, down, two, russian, warplane, countr...","[break, musharraf, impeach]","[russia, today, columns, troop, roll, south, o...","[russian, tank, move, towards, capital, south,...","[afghan, child, rap, impunity, un, official, s...","[russian, tank, enter, south, ossetia, whilst,...","[break, georgia, invades, south, ossetia, russ...","[enemy, combatent, trial, nothing, sham, salim...",...,"[georgia, downs, two, russian, warplanes, coun...","[georgia, down, two, russian, warplan, countri...","[(georgia, JJ), (downs, NNS), (two, CD), (russ...","[georgia, down, two, russian, warplane, countr...","[georgia, down, two, russian, warplane, countr...",0,0,0,0,0
1,2008-08-11,1,"[usa, nato, help, u, help, u, help, iraq]","[bush, put, foot, georgian, conflict]","[jewish, georgian, minister, thanks, israeli, ...","[georgian, army, flees, disarray, russian, adv...","[olympic, open, ceremony, firework, fake]","[mossad, fraudulent, new, zealand, passport, i...","[russia, anger, israeli, military, sale, georgia]","[usan, citizen, live, sossetia, blame, usa, ge...",...,"[usa, nato, help, us, help, us, help, iraq, bu...","[usa, nato, help, us, help, us, help, iraq, bu...","[(usa, JJ), (nato, NN), (help, NN), (us, PRP),...","[usa, nato, help, u, help, u, help, iraq, bush...","[usa, nato, help, u, help, u, help, iraq, bush...",0,0,0,0,0


In [30]:
df_labels=df_djia[['Date','Label_1day','Label_2day','Label_3day','Label_4day','Label_5day']].copy()

In [31]:
df = pd.merge(df,df_labels,on='Date')

In [34]:
df.to_csv('pre_process_all_news_days.csv',sep=',', encoding='utf-8',index=False) 