In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [61]:
df=pd.read_csv(r'Data/Stock News Data.csv',parse_dates=['Date'])

## 1. Data Exploration

In [62]:
df.head()[:2]

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Flintoff injury piles on woe for England,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,On the critical list,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite


In [63]:
df.shape

(4101, 27)

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4101 entries, 0 to 4100
Data columns (total 27 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    4101 non-null   datetime64[ns]
 1   Label   4101 non-null   int64         
 2   Top1    4101 non-null   object        
 3   Top2    4101 non-null   object        
 4   Top3    4101 non-null   object        
 5   Top4    4101 non-null   object        
 6   Top5    4101 non-null   object        
 7   Top6    4101 non-null   object        
 8   Top7    4101 non-null   object        
 9   Top8    4101 non-null   object        
 10  Top9    4101 non-null   object        
 11  Top10   4101 non-null   object        
 12  Top11   4101 non-null   object        
 13  Top12   4101 non-null   object        
 14  Top13   4101 non-null   object        
 15  Top14   4101 non-null   object        
 16  Top15   4101 non-null   object        
 17  Top16   4101 non-null   object        
 18  Top17   

We have a date column, a label column, and 25 top news columns. Each row represents a day.

We can see that some news headlines is missing for 'Top 24' and 'Top 25' 

In [65]:
##Check whether the class is balanced or not
df.Label.value_counts()

1    2166
0    1935
Name: Label, dtype: int64

## 2. Data Cleaning

My Goal is to see if the news can affect the stock price on the same day, so the data column is not needed for this purpose. We also need to combine all 25 headlines into one document for later modelling.

In [66]:
df['Headlines']=np.nan
for i in range(len(df)):
    df['Headlines'][i]=' '.join(str(x) for x in df.iloc[i,2:])

In [67]:
df.head()[:2]

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25,Headlines
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,Scorecard,Hughes' instant hit buoys Blues,Jack gets his skates on at ice-cold Alex,Chaos as Maracana builds up for United,Depleted Leicester prevail as Elliott spoils E...,Hungry Spurs sense rich pickings,Gunners so wide of an easy target,...,Hunters threaten Jospin with new battle of the...,Kohl's successor drawn into scandal,The difference between men and women,"Sara Denver, nurse turned solicitor",Diana's landmine crusade put Tories in a panic,Yeltsin's resignation caught opposition flat-f...,Russian roulette,Sold out,Recovering a title,A 'hindrance to operations': extracts from the...
1,2000-01-04,0,Scorecard,The best lake scene,Leader: German sleaze inquiry,"Cheerio, boyo",The main recommendations,Has Cubie killed fees?,Has Cubie killed fees?,Has Cubie killed fees?,...,The timing of their lives,Dear doctor,Irish court halts IRA man's extradition to Nor...,Burundi peace initiative fades after rebels re...,PE points the way forward to the ECB,Campaigners keep up pressure on Nazi war crime...,Jane Ratcliffe,Yet more things you wouldn't know without the ...,Millennium bug fails to bite,Scorecard The best lake scene Leader: German s...


We can now drop off the individual news column now.

In [68]:
df=df[['Date','Label','Headlines']]
df.head()[:2]

Unnamed: 0,Date,Label,Headlines
0,2000-01-03,0,A 'hindrance to operations': extracts from the...
1,2000-01-04,0,Scorecard The best lake scene Leader: German s...


We can see many abbrevation used in the news. This can get avoided while cleaning. So, we need to replace the abbrevation with the actual meanings.

In [69]:
import re

In [70]:
def find_abbr(text):
    abbr = []
    for i in re.finditer(r"([A-Za-z]+| )([A-Za-z]\.){2,}", text):
        abbr.append(i.group())
    df_abbr = pd.Series(abbr)
    return df_abbr.unique()

In [78]:
news_comb=' '.join(df.Headlines)
abbr=find_abbr(news_comb)

In [83]:
abbr

array([' A.I.', ' M.A.N.D.Y.', ' U.N.', ' U.S.', ' U.K.', ' S.A.',
       ' U.S.C.', ' D.C.', ' N.J.', ' i.e.', ' P.I.', ' A.N.C.', ' a.m.',
       ' A.K.A.', ' P.R.', ' R.I.', 'nU.S.', ' E.U.', ' H.I.V.',
       ' I.H.T.', ' B.C.', ' J.P.', ' N.S.', 'crimese.g.', ' C.I.A.',
       ' p.m.', 'Ph.D.', ' N.Y.', ' U.A.E.', 'sq.m.', ' I.M.F.', ' y.o.',
       ' i.a.', ' I.D.', ' M.A.', ' H.W.', ' O.K.', ' N.K.', ' B.S.',
       ' A.T.M.', ' W.H.O.', ' N.S.A.', ' P.M.', ' F.B.I.', ' P.E.I.',
       ' a.k.a.', ' S.E.', ' A.D.', ' T.B.', ' J.K.', ' L.G.B.T.'],
      dtype=object)

We need to replace all the abbrevation with actual meaning

In [99]:
'''convert abbrevations to full words'''
for i in range(len(df)):
    text = str(df.Headlines[i])
    text = re.sub(r"A.I.", " Area of Interest ", text)
    text = re.sub(r"M.A.N.D.Y.", " Drugs ", text)
    text = re.sub(r"A.T.M.", " Automated Teller Machine ", text)
    text = re.sub(r"C.I.A.", " Central Intelligence Agency ", text)
    text = re.sub(r"D.C.", " District of columbia ", text)
    text = re.sub(r"E.U.", " Europian Union ", text)
    text = re.sub(r"F.B.I.", " Federal Bureau of Investigation ", text)
    text = re.sub(r"H.I.V.", " Human immunodeficiency virus ", text)
    text = re.sub(r"I.H.T.", " inheritance tax ", text)
    text = re.sub(r"I.M.F.", " International Monetary Fund ", text)
    text = re.sub(r"I.D.", " identification ", text)
    text = re.sub(r"L.G.B.T.", " minority ", text)
    text = re.sub(r"M.A.", " Massachusetts ", text)
    text = re.sub(r"N.J.", " new jersey ", text)
    text = re.sub(r"N.K.", " north korea ", text)
    text = re.sub(r"N.S.A.", " National Security Agency ", text)
    text = re.sub(r"N.Y.", " new york ", text)
    text = re.sub(r"P.E.I.", " Prince Edward Island ", text)
    text = re.sub(r"P.M.", " prime minister ", text)
    text = re.sub(r"P.R.C", " china ", text)
    text = re.sub(r"S.A.", " south africa ", text)
    text = re.sub(r"R.I.", " Rhode Island ", text)
    text = re.sub(r"U.A.E.", " United Arab Emirates ", text)
    text = re.sub(r"U.K.", " england ", text)
    text = re.sub(r"U.N.", " New Jersey ", text)
    text = re.sub(r"U.S.", " America ", text)
    text = re.sub(r"U.S.C.", " University of south california ", text)
    text = re.sub(r"W.H.O", " world health organization ", text)
    text = re.sub(r"a.m.", " morning ", text)
    text = re.sub(r"p.m.", " afternoon ", text)
    text = re.sub(r"Ph.D.", " doctor of philosophy ", text)
    text = re.sub(r"sq.m.", " square meter ", text)
    text = re.sub(r"sq.m.", " square meter ", text)

    df.Headlines[i]=text

In [100]:
news_comb=' '.join(df.Headlines)
abbr=find_abbr(news_comb)

In [104]:
''' These are the abbrevations that are not important as they mostly 
contains the words from stop wrods'''
abbr

array([' i.e.', ' P.I.', ' A.N.C.', ' A.K.A.', ' P.R.', ' B.C.', ' J.P.',
       ' N.S.', 'crimese.g.', ' y.o.', ' i.a.', ' H.W.', ' O.K.', ' B.S.',
       ' a.k.a.', ' S.E.', ' A.D.', ' T.B.', ' J.K.'], dtype=object)

In [105]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [106]:
ps=PorterStemmer()
wl=WordNetLemmatizer()
corpus=[]

### `For Bag of Words`

#### 1. Tokenize the text
#### 2. Remove non-alphabetic characters and one-letter words, including numbers and punctuations
#### 3. Remove stop words

In [109]:
df['headlines_str']=np.nan
for i in range(len(df.Headlines)):
    df.headlines_str[i]=df.Headlines[i].lower()
    df.headlines_str[i]=re.sub("[^a-zA-z]"," ",df.headlines_str[i])
    words=nltk.word_tokenize(df.headlines_str[i])
    words=[wl.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    df.headlines_str[i]=' '.join(words)

### `For Word2vec --- Doc2vec`

In [116]:
df['headlines_words']=[df.headlines_str[i].split() for i in range(len(df))]

In [117]:
df.head()

Unnamed: 0,Date,Label,Headlines,headlines_str,headlines_words
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,hindrance operation extract leaked report scor...,"[hindrance, operation, extract, leaked, report..."
1,2000-01-04,0,Scorecard The best lake scene Leader: German s...,scorecard best lake scene leader german sleaze...,"[scorecard, best, lake, scene, leader, german,..."
2,2000-01-05,0,Coventry caught on counter by Flo United's riv...,coventry caught counter flo united rival road ...,"[coventry, caught, counter, flo, united, rival..."
3,2000-01-06,1,Pilgrim knows how to progress Thatcher facing ...,pilgrim know progress thatcher facing ban mcil...,"[pilgrim, know, progress, thatcher, facing, ba..."
4,2000-01-07,1,Hitches and Horlocks Beckham off but United su...,hitch horlocks beckham united survive breast c...,"[hitch, horlocks, beckham, united, survive, br..."


In [160]:
from gensim.models import Word2Vec
import gensim

In [251]:
# Load word2vec model (trained on an enormous Google corpus)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\goura\Documents\DS and ML Projects\8. Projects\Stock Sentiment Analysis\GoogleNews-vectors-negative300.bin.gz', binary=True)

In [297]:
def doc2vec(model, wordlist):
    '''
    Use a word2vec embedding model to get the vecter of each word of the wordlist.
    Now we have a list of vecters, len(list)= number of words in the doc, len(vector)= the model type, e.g.300
    Convert each doc into a vector by np.mean. len(doc vec) = 300
    '''
    # Filter the list of vectors to include only those that Word2Vec has a vector for
    vector_list = [model[word] for word in wordlist if word in model]
    doc_vector = np.mean(vector_list, axis=0)
    return doc_vector

In [298]:
# convert each document (list of words) to a document vecter, then save into a list of doc_vec
df['word2vec'] = [doc2vec(model, doc) for doc in df.headlines_words]  

In [299]:
df.head()

Unnamed: 0,Date,Label,Headlines,headlines_str,headlines_words,word2vec
0,2000-01-03,0,A 'hindrance to operations': extracts from the...,hindrance operation extract leaked report scor...,"[hindrance, operation, extract, leaked, report...","[0.019919062, 0.051673025, -0.025186863, 0.062..."
1,2000-01-04,0,Scorecard The best lake scene Leader: German s...,scorecard best lake scene leader german sleaze...,"[scorecard, best, lake, scene, leader, german,...","[0.038909823, 0.049511578, 0.07234156, 0.05103..."
2,2000-01-05,0,Coventry caught on counter by Flo United's riv...,coventry caught counter flo united rival road ...,"[coventry, caught, counter, flo, united, rival...","[-0.028612338, 0.047972914, 0.013764942, 0.097..."
3,2000-01-06,1,Pilgrim knows how to progress Thatcher facing ...,pilgrim know progress thatcher facing ban mcil...,"[pilgrim, know, progress, thatcher, facing, ba...","[-0.021918356, 0.04694813, 0.03538333, 0.06647..."
4,2000-01-07,1,Hitches and Horlocks Beckham off but United su...,hitch horlocks beckham united survive breast c...,"[hitch, horlocks, beckham, united, survive, br...","[-0.014015404, 0.045476243, 0.021973068, 0.047..."


In [310]:
df.to_pickle("./preprocessed_df.pkl")