In [2]:
import pandas as pd
import numpy as np
import scipy
import os
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
pd.options.display.max_rows = 6000


import gensim
from gensim import corpora, models, matutils
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Data overview

In [2]:
df = pd.read_csv('data/Combined_News_DJIA.csv')
df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."


In [3]:
df.iloc[0,2]

'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war"'

In [8]:
df.shape

(1989, 27)

In [9]:
# topic 23 has 1 missing value, 24 &25 have 3 missing values
# This is not an issue because we are going to combine the news into one document for each day.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 27 columns):
Date     1989 non-null object
Label    1989 non-null int64
Top1     1989 non-null object
Top2     1989 non-null object
Top3     1989 non-null object
Top4     1989 non-null object
Top5     1989 non-null object
Top6     1989 non-null object
Top7     1989 non-null object
Top8     1989 non-null object
Top9     1989 non-null object
Top10    1989 non-null object
Top11    1989 non-null object
Top12    1989 non-null object
Top13    1989 non-null object
Top14    1989 non-null object
Top15    1989 non-null object
Top16    1989 non-null object
Top17    1989 non-null object
Top18    1989 non-null object
Top19    1989 non-null object
Top20    1989 non-null object
Top21    1989 non-null object
Top22    1989 non-null object
Top23    1988 non-null object
Top24    1986 non-null object
Top25    1986 non-null object
dtypes: int64(1), object(26)
memory usage: 419.7+ KB


In [7]:
# the two classes are balanced
df.Label.value_counts()

1    1065
0     924
Name: Label, dtype: int64

# Data cleaning

-  remove date column, because we are trying to see if the same day headlines can affect the same day stock close value.
-  combine each day's 25 news into one document
-  write a function to convert one string into a list of words
-  convert every document in df.news into a list of words

In [10]:
# remove the date column
# the stock close trend is directly affected by the news from the same day.
df = df.drop(['Date'], axis = 1)

In [11]:
# combine all 25 news of the same day into a long string for doc2vec and vectorization.
combine_news = []

for index, row in df.iterrows():
    combine_news.append(' '.join(str(x) for x in row[1:]))
df['news'] = pd.DataFrame(combine_news)

In [12]:
# all the news headlines for 08/08/2008
df.loc[0,'news']

'b"Georgia \'downs two Russian warplanes\' as countries move to brink of war" b\'BREAKING: Musharraf to be impeached.\' b\'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)\' b\'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire\' b"Afghan children raped with \'impunity,\' U.N. official says - this is sick, a three year old was raped and they do nothing" b\'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.\' b"Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO\'s side" b"The \'enemy combatent\' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it." b\'Georgian troops retreat from S. Osettain capital, presumably leaving several hundred people killed. [VIDEO]\' b\'Did the U.S. Prep Georgia for War with Russia?\'

In [13]:
# Find all the 2/3 character abbrevations from the string corpus. Convert them to full form. 
# Then we can remove the punctuations without worrying losing the meaning of the abbrevation words.

# combine all the news into a very long string
long_news_str = '   '.join(df.news)

# find all the abbrevations of 2 and 3 characters
def find_abbr(text):
    abbr = []
    for i in re.finditer(r"([A-Za-z]+| )([A-Za-z]\.){2,}", text):
        abbr.append(i.group())
    df_abbr = pd.Series(abbr)
    return df_abbr.unique()

find_abbr(long_news_str)

array([' U.N.', ' U.S.', ' U.K.', ' S.A.', ' U.S.C.', ' D.C.', ' N.J.',
       ' i.e.', ' P.I.', ' A.N.C.', ' a.m.', ' A.K.A.', ' P.R.', ' R.I.',
       'nU.S.', ' E.U.', ' H.I.V.', ' I.H.T.', ' B.C.', ' J.P.', ' N.S.',
       'crimese.g.', ' C.I.A.', ' p.m.', 'Ph.D.', ' N.Y.', ' U.A.E.',
       'sq.m.', ' I.M.F.', ' y.o.', ' i.a.', ' I.D.', ' M.A.', ' H.W.',
       ' O.K.', ' N.K.', ' B.S.', ' A.T.M.', ' W.H.O.', ' N.S.A.',
       ' P.M.', ' F.B.I.', ' P.E.I.', ' a.k.a.', ' S.E.', ' A.D.',
       ' T.B.', ' J.K.', ' L.G.B.T.'], dtype=object)

In [None]:
# get ready stop words for the tokenization function below
mywords = ['breaking','whilst', 'say', 'says', 'today','yesterday', 'news', 'tomorrow','iii', 'ii', 'like', 'ha',]
final_stop = stopwords.words('english') + mywords

In [14]:
def text_to_wordlist(text, remove_stop_words=True, lemma_words=True):
    
    ''' Clean each document into a list of words:
    1. convert abbrevations to full words
    2. tokenize the text
    3. remove non-alphabetic characters and one-letter words, including numbers and punctuations
    4. remove stop words
    '''
    # clean the text, convert only the abbrs that are meaningful
    text = re.sub(r" A.T.M. ", " Automated Teller Machine ", text)
    text = re.sub(r" C.I.A. ", " Central Intelligence Agency ", text)
    text = re.sub(r" D.C. ", " District of columbia ", text)
    text = re.sub(r" E.U. ", " Europian Union ", text)
    text = re.sub(r" F.B.I. ", " Federal Bureau of Investigation ", text)
    text = re.sub(r" H.I.V. ", " Human immunodeficiency virus ", text)
    text = re.sub(r" I.H.T. ", " inheritance tax ", text)
    text = re.sub(r" I.M.F. ", " International Monetary Fund ", text)
    text = re.sub(r" I.D. ", " identification ", text)
    text = re.sub(r" L.G.B.T. ", " minority ", text)
    text = re.sub(r" M.A. ", " Massachusetts ", text)
    text = re.sub(r" N.J. ", " new jersey ", text)
    text = re.sub(r" N.K. ", " north korea ", text)
    text = re.sub(r" N.S.A. ", " National Security Agency ", text)
    text = re.sub(r" N.Y. ", " new york ", text)
    text = re.sub(r" P.E.I. ", " Prince Edward Island ", text)
    text = re.sub(r" P.M. ", " prime minister ", text)
    text = re.sub(r" P.R.C ", " china ", text)
    text = re.sub(r" S.A. ", " south africa ", text)
    text = re.sub(r" R.I. ", " Rhode Island ", text)
    text = re.sub(r" U.A.E. ", " United Arab Emirates ", text)
    text = re.sub(r" U.K. ", " england ", text)
    text = re.sub(r" U.N. ", " new jersey ", text)
    text = re.sub(r" U.S. ", " america ", text)
    text = re.sub(r" U.S.C. ", " university of south california ", text)
    text = re.sub(r" W.H.O ", " world health organization ", text)
    text = re.sub(r" a.m. ", " morning ", text)
    text = re.sub(r" p.m. ", " afternoon ", text)
    text = re.sub(r" Ph.D. ", " doctor of philosophy ", text)
    text = re.sub(r" sq.m. ", " square meter ", text)
    
    # Tokenize the string into word tokens
    tokens = word_tokenize(text)
    
    # further clean the tokens: split toekns like "b'Russia" which still have punctuations in the token
    ls = []
    for word in tokens:
        if "'" in word:
            ls = ls + word.split("'")
    tokens = tokens + ls
    
    # Optionally, shorten words to their stems
    if lemma_words:
        tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    
    #Remove one letter tokens & non-alphabetic tokens, such as punctuation, then lower the tokens
    tokens = [word.lower() for word in tokens if (word.isalpha() and len(word)>1)]

    # remove stop words  
    if remove_stop_words:
        tokens = [word for word in tokens if word not in final_stop]
               
    return tokens

In [15]:
# convert each document to list of words
df.news = df.news.apply(lambda x: text_to_wordlist(x))

In [16]:
df.news[0]

['georgia',
 'two',
 'russian',
 'warplane',
 'country',
 'move',
 'brink',
 'war',
 'musharraf',
 'impeached',
 'columns',
 'troop',
 'roll',
 'south',
 'ossetia',
 'footage',
 'fighting',
 'youtube',
 'tank',
 'moving',
 'towards',
 'capital',
 'south',
 'ossetia',
 'reportedly',
 'completely',
 'destroyed',
 'georgian',
 'artillery',
 'fire',
 'afghan',
 'child',
 'raped',
 'new',
 'jersey',
 'official',
 'sick',
 'three',
 'year',
 'old',
 'wa',
 'raped',
 'nothing',
 'russian',
 'tank',
 'entered',
 'south',
 'ossetia',
 'georgia',
 'shoot',
 'two',
 'russian',
 'jet',
 'georgia',
 'invades',
 'south',
 'ossetia',
 'russia',
 'warned',
 'would',
 'intervene',
 'side',
 'combatent',
 'trial',
 'nothing',
 'sham',
 'salim',
 'haman',
 'sentenced',
 'year',
 'kept',
 'longer',
 'anyway',
 'feel',
 'troop',
 'retreat',
 'osettain',
 'capital',
 'presumably',
 'leaving',
 'several',
 'hundred',
 'people',
 'killed',
 'video',
 'america',
 'prep',
 'georgia',
 'war',
 'russia',
 'gives'

In [None]:
with open('news_wordlists.pkl', 'wb') as f:
    pickle.dump(df, f)

# Word2vec --- Doc2vec  (transfer learning)

In [19]:
# Load word2vec model (trained on an enormous Google corpus)
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/peiguo/Downloads/GoogleNews-vectors-negative300.bin', binary=True)

In [20]:
economy_vec = model['south']
economy_vec[:20] # First 20 components of 300 total

array([-0.02478027, -0.10986328,  0.06591797, -0.09130859,  0.17578125,
        0.08007812, -0.18261719, -0.34765625, -0.16113281,  0.30859375,
        0.13769531,  0.11181641,  0.12207031,  0.07128906,  0.00436401,
       -0.20898438, -0.18945312,  0.08056641,  0.03393555, -0.07910156],
      dtype=float32)

In [21]:
def doc2vec(model, wordlist):
    '''
    Use a word2vec embedding model to get the vecter of each word of the wordlist.
    Now we have a list of vecters, len(list)= number of words in the doc, len(vector)= the model type, e.g.300
    Convert each doc into a vector by np.mean. len(doc vec) = 300
    '''
    # Filter the list of vectors to include only those that Word2Vec has a vector for
    vector_list = [model[word] for word in wordlist if word in model.vocab]
    doc_vector = np.mean(vector_list, axis=0)
    return doc_vector

In [22]:
# convert each document (list of words) to a document vecter, then save into a list of doc_vec
x_doc = [doc2vec(model, doc) for doc in df.news]  
X_doc= pd.Series(x_doc, name = 'doc_vec') # list to df

In [25]:
# top 20 numbers in the first document vector
X_doc[0][:20]

array([-0.00466015,  0.04605767,  0.03547058,  0.10155483, -0.03422928,
       -0.01389613,  0.0094356 , -0.14502455,  0.03542512,  0.0929451 ,
       -0.02405647, -0.13372633, -0.0746136 ,  0.03855896, -0.08115924,
        0.10458709, -0.0641892 ,  0.0719087 ,  0.0019091 , -0.12026239],
      dtype=float32)

In [26]:
X_doc.shape

(1989,)

In [27]:
# a new dataframe containing only label, news, doc_vec
newdf = pd.concat([df.Label,df.news,X_doc], axis = 1)

# convert each row in the news column from a list of tokens to a string
tokenstrlist = []
for tokenlist in  df.news:
    tokenstr = ' '.join(tokenlist)
    tokenstrlist.append(tokenstr)

# add the string format new as clolumn 'news_str' to the data frame
newdf['news_str'] = pd.Series(tokenstrlist)

# the final data frame is ready for modeling
newdf.head()

Unnamed: 0,Label,news,doc_vec,news_str
0,0,"[georgia, two, russian, warplane, country, mov...","[-0.0046601472, 0.046057668, 0.035470575, 0.10...",georgia two russian warplane country move brin...
1,1,"[wont, america, nato, help, wont, help, help, ...","[-0.01796527, 0.026893076, 0.05216946, 0.11043...",wont america nato help wont help help iraq put...
2,0,"[adorable, sang, opening, ceremony, wa, fake, ...","[0.020226372, 0.05665661, 0.038335405, 0.09110...",adorable sang opening ceremony wa fake russia ...
3,0,"[america, refuse, israel, weapon, attack, iran...","[0.009319111, 0.04263116, 0.062353328, 0.08478...",america refuse israel weapon attack iran repor...
4,1,"[expert, admit, legalise, drug, south, osetia,...","[0.01713654, 0.04969087, 0.062367942, 0.105228...",expert admit legalise drug south osetia pictur...


In [None]:
with open('label_news_docvec_newsstr.pkl', 'wb') as f:
    pickle.dump(newdf, f)