 - Required Libraries

In [7]:
import os
import pickle
import numpy as np
import pandas as pd
import re
import calendar
import warnings
from sklearn.model_selection import train_test_split

 - Keras imports

In [8]:
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.layers import Bidirectional, Embedding, LSTM, Dense, Conv1D, GlobalMaxPool1D, MaxPool1D, MaxPooling1D, Dropout, Activation , Flatten , Input, concatenate
from keras.callbacks import ModelCheckpoint
from keras import backend as K

 - Load the Data

In [12]:
reutersFile = 'news_reuters.csv'
stockFile = 'stockReturns.json'
df1 = pd.read_csv('news_reuters.csv', header=None, 
                  names=['ticker', 'company', 'pub_date', 'headline', 'first_sent', 'category'])
df2 = pd.read_json('stockReturns.json')

 - Reformat, clean, merge, and tokenize the data

1. **reformat_y_data**
    - Convert stock data from continuous numeric data normalized to S&P returns into binary (postive/negative)

2. **clean_and_merge_data**
    - Filter X to only those tickers with stock data
3. **clean_text**
    - Replacing double spaces into a single space
    - Replace U.S. to United States so U won't get deleted with next replacement
    - Remove all capitalized words at the beginning of the sentence, since those are mostly places (aka NEW YORK)
    - Remove unnecessary punctuation (hyphens and asterisks)
    - Remove dates
4. **tokenize_sent**
    - Tokenize string into a sequence of words
5. **filt_to_one**
    - Filter dataset so that there is only one observation per day.
    - If there is more than one record, will use the topStory record if one exists.  
    - If one doesn't or there are 2 topStory records then it will randomly select one of the observations.

In [17]:
def reformat_y_data(data, tickerType='mid'):
    tmp = data[tickerType].apply(pd.Series)
    tmp = tmp.stack().rename('price', inplace=True).reset_index()
    tmp['y'] = np.where(tmp['price'] >= 0, 1, 0)
    tmp.rename(columns={'level_0': 'ticker', 'level_1': 'pub_date'}, inplace=True)
    return tmp

def clean_and_merge_data(X, Y):
    y_tickers = set(Y['ticker'])
    X = X.loc[X['ticker'].isin(y_tickers)]
    # Make sure data types are the same for merge    
    Y['pub_date'] = Y['pub_date'].astype(df1['pub_date'].dtype)
    Y['ticker'] = Y['ticker'].astype(df1['ticker'].dtype)
    return X.merge(Y, on=['ticker', 'pub_date'], how='left')

def clean_text(sent):
    monthStrings = list(calendar.month_name)[1:] + list(calendar.month_abbr)[1:]
    monthPattern = '|'.join(monthStrings)
    sent = re.sub(r' +', ' ', sent)
    sent = re.sub(r'U.S.', 'United States', sent)
    sent = re.sub(r'^(\W?[A-Z\s\d]+\b-?)', '', sent)
    sent = re.sub(r'^ ?\W ', '', sent)
    sent = re.sub(r'({}) \d+'.format(monthPattern), '', sent)
    sent = re.sub(r' +', ' ', sent)
    return sent 

def tokenize_sent(col):
    return [text_to_word_sequence(text, lower=False) for text in col]

def filt_to_one(x, random_state=10):
    if x.shape[0] > 1:
        if 'topStory' in x['category'].unique():
            x = x.loc[x['category'] == 'topStory']
        if x.shape[0] > 1:
            x = x.sample(n=1, random_state=random_state)
    return x

In [18]:
cleanY = reformat_y_data(df2, 'short')
merged = clean_and_merge_data(df1, cleanY)

 - Clean up the Data

In [19]:
# Clean up text
merged['headline'] = merged.headline.apply(clean_text)
merged['first_sent'] = merged.first_sent.apply(clean_text)

In [20]:
# Turn sentences into tokens
merged['headline_token'] = tokenize_sent(merged.headline)
merged['first_sent_token'] = tokenize_sent(merged.first_sent)

In [21]:
# Get one record per company/day
finalData = merged.groupby(by=['ticker', 'pub_date']).apply(filt_to_one)

In [22]:
# Combine Headline and First Sentence into one text 
finalData['final_text'] = finalData['headline_token'] + finalData.first_sent_token

In [23]:
# Remove observations with missing stock price
finalData.dropna(inplace=True)

In [24]:
new_columns = ['ticker2', 'company', 'pub_date2', 
            'headline', 'first_sent', 'category', 
            'price', 'y', 'headline_token', 
            'first_sent_token', 'final_text']
finalData.columns = new_columns
finalData.reset_index(inplace=True)

In [25]:
X = finalData['headline'].values
y = finalData['y'].values

 - Create a train and test set, retaining the same test set for every model

In [26]:
#split data into training and testing sets and stratify on y
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)

In [27]:
#how many unique words to use (i.e num rows in embedding vector)
MAX_NUM_WORDS=40 

#max number of words in a review to use
MAX_SEQUENCE_LENGTH=100 

In [28]:
#fit tokenizer on X_train
tokenizer = Tokenizer(num_words=200)
tokenizer.fit_on_texts(X_train)

In [29]:
#create padded sequences
sequences = tokenizer.texts_to_sequences(X_train)
train_data = pad_sequences(sequences, maxlen=100)

In [30]:
#convert y_train to one-hot encoded version
word_index = tokenizer.word_index
y_train_labels = to_categorical(np.asarray(y_train))

In [31]:
#check shape of train_data and y_train_labels
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', y_train_labels.shape)

Shape of data tensor: (8542, 100)
Shape of label tensor: (8542, 2)


In [32]:
#fit tokenizer on X_test
tokenizer_test = Tokenizer(num_words=200)
tokenizer_test.fit_on_texts(X_test)

In [33]:
#create padded sequences
sequences_test = tokenizer_test.texts_to_sequences(X_test)
test_data = pad_sequences(sequences_test, maxlen=100)

In [35]:
#convert y_test to one-hot encoded version
word_index_text = tokenizer_test.word_index
y_test_labels = to_categorical(np.asarray(y_test))

In [37]:
#check shape of test_data and y_test_labels
print('Shape of data tensor:', test_data.shape)
print('Shape of label tensor:', y_test_labels.shape)

Shape of data tensor: (2848, 100)
Shape of label tensor: (2848, 2)


 - Load word embeddings

In [39]:
#I assume that you have the 'glove.6B.100d.txt' file in your directory
GLOVE_DIR=''
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

 - Create Embedding Matrix

In [43]:
#set the size of each word vector
EMBEDDING_DIM = 100 

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    
    if embedding_vector is not None:
       # words not found in embedding index will be all-zeros.
       embedding_matrix[i] = embedding_vector
        
embedding_layer = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False)

## Model 1: RNN

In [45]:
def create_rnn_model(seq_input_len, embed_matrix, 
                     n_RNN_nodes, n_dense_nodes, 
                     recurrent_dropout=0.2, 
                     drop_out=.2, n_out=2):
    
    word_input = Input(shape=(seq_input_len,), name='word_input_layer')
    word_embeddings = Embedding(input_dim=embed_matrix.shape[0],
                                output_dim=embed_matrix.shape[1],
                                weights=[embed_matrix], 
                                mask_zero=True, 
                                name='word_embedding_layer')(word_input) 
    hidden_layer1 = Bidirectional(LSTM(units=n_RNN_nodes, return_sequences=True, 
                                      recurrent_dropout=recurrent_dropout, 
                                      dropout=drop_out, name='hidden_layer1'))(word_embeddings)
    hidden_layer2 = Bidirectional(LSTM(units=n_RNN_nodes, return_sequences=False, 
                                      recurrent_dropout=recurrent_dropout,
                                      dropout=drop_out, name='hidden_layer2'))(hidden_layer1)
    dense_layer = Dense(units=n_dense_nodes, activation='relu', name='dense_layer')(hidden_layer2)
    drop_out3 = Dropout(drop_out)(dense_layer)
    output_layer = Dense(units=n_out, activation='softmax',
                         name='output_layer')(drop_out3)
    model = Model(inputs=[word_input], outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer="adam", 
                  metrics=['accuracy', recall, precision])
    return model 

 - Define functions to calculate precision and recall

In [46]:
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [47]:
n_out = 2
nb_epoch = 1
rnn_model = create_rnn_model(seq_input_len=train_data.shape[-1],
                             embed_matrix=embedding_matrix, 
                             recurrent_dropout=.4, drop_out=.5,
                             n_RNN_nodes=500, n_dense_nodes=500, n_out=n_out)

In [50]:
def train_and_test_model(model, x_train, y_train, x_test, y_test, 
                         modelSaveName, modelSavePath='',
                         batch_size=1014, epochs=2, validation_split=.1):
    print(model.summary())
    
    filepath = os.path.join(modelSavePath, modelSaveName + '.hdf5')
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1)
    callbacks_list = [checkpoint]
    model.fit(x=x_train, y=y_train, batch_size=batch_size, 
              epochs=epochs, validation_split=validation_split, 
              callbacks=callbacks_list)
    
    score, acc, rec, prec = model.evaluate(x_test, y_test, batch_size=batch_size)
    return (model, acc, rec, prec)    

In [51]:
rnn_res = train_and_test_model(rnn_model, train_data, 
                               y_train_labels, test_data, 
                               y_test_labels, 'rnn_model',
                               epochs=nb_epoch)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_input_layer (InputLayer (None, 100)               0         
_________________________________________________________________
word_embedding_layer (Embedd (None, 100, 100)          1022000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 1000)         2404000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 1000)              6004000   
_________________________________________________________________
dense_layer (Dense)          (None, 500)               500500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
output_layer (Dense)         (None, 2)                 1002      
Total para

## Model 2: CNN

In [53]:
def vectorize_sentences(data, lexicon, maxlen=200):
    X = []
    for sentences in data:
        x = [lexicon[token] if token in lexicon else lexicon['<UNK>'] for 
                                 token in sentences]
        x2 = np.eye(len(char_indices) + 1)[x]
        X.append(x2)
    return (pad_sequences(X, maxlen=maxlen))

def create_cnn_model(char_maxlen, vocab_size,
                     nb_filter=100, filter_kernels = [4] * 4,
                     pool_size=3, n_dense_nodes=100,
                     drop_out=.2, n_out=2):

    inputs = Input(shape=(char_maxlen, vocab_size), name='char_input_layer')

    conv1 = Conv1D(nb_filter, kernel_size=filter_kernels[0],
                  padding='valid', activation='relu',
                  input_shape=(char_maxlen, vocab_size))(inputs)
    
    maxpool1 = MaxPool1D(pool_size=pool_size)(conv1)

    conv2 = Conv1D(nb_filter, kernel_size=filter_kernels[1],
                          padding='valid', activation='relu')(maxpool1)
    maxpool2 = MaxPool1D(pool_size=pool_size)(conv2)

    conv3 = Conv1D(nb_filter, kernel_size=filter_kernels[2],
                          padding='valid', activation='relu')(maxpool2)

    conv4 = Conv1D(nb_filter, kernel_size=filter_kernels[3],
                          padding='valid', activation='relu')(conv3)

    maxpool3 = MaxPool1D(pool_size=pool_size)(conv4)
    flatten = Flatten()(maxpool3)

    dense_layer = Dense(n_dense_nodes, activation='relu')(flatten)
    dropout = Dropout(drop_out)(dense_layer)

    output_layer = Dense(n_out, activation='softmax', name='output')(dropout)

    model = Model(inputs=inputs, outputs=output_layer)

    model.compile(loss='categorical_crossentropy', optimizer="adam", 
                  metrics=['accuracy', recall, precision])    
    return model 


In [54]:
char_maxlen = 1024 
nb_filter = 128
dense_outputs = 1024
filter_kernels = [7, 5, 5, 3]
pool_size = 5

In [55]:
# Turn all tokens into one string and then all obs 
# into one overall string
trainTokensAsString = X_train
testTokensAsString = X_test
oneTxt = ' '.join(trainTokensAsString)

# Get info about characters
chars = set(oneTxt)
vocab_size = len(chars) + 1
print('total chars:', vocab_size)
char_indices = dict((c, i + 2) for i, c in enumerate(chars))
indices_char = dict((i + 2, c) for i, c in enumerate(chars))

char_indices['<UNK>'] = 1
indices_char[1] = '<UNK>'

trainCharData = vectorize_sentences(trainTokensAsString, char_indices, char_maxlen)
testCharData = vectorize_sentences(testTokensAsString, char_indices, char_maxlen)
trainCharData.shape
testCharData.shape
char_maxlen

total chars: 92


1024

In [56]:
cnn_model = create_cnn_model(char_maxlen=char_maxlen, 
                             vocab_size=vocab_size,
                             nb_filter=nb_filter, 
                             filter_kernels=filter_kernels,
                             pool_size=pool_size, 
                             n_dense_nodes=dense_outputs,
                             drop_out=.5, 
                             n_out=n_out)

cnn_res = train_and_test_model(cnn_model, trainCharData[:, :, 1:],
                               y_train_labels, 
                               testCharData[:, :, 1:], 
                               y_test_labels, 
                               'cnn_model',
                               epochs=nb_epoch)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
char_input_layer (InputLayer (None, 1024, 92)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1018, 128)         82560     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 203, 128)          0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 199, 128)          82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 35, 128)           82048     
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 33, 128)           49280     
__________

## Model 3: RNN+CNN

In [57]:
def create_cnn_rnn_model(rnn_input_len, char_maxlen, vocab_size,
                         embed_matrix, n_RNN_nodes, 
                         nb_filter=100, filter_kernels = [4] * 4,
                         pool_size=3, n_dense_nodes=100,
                         recurrent_dropout=0.2, 
                         drop_out=.2, n_out=2):
    
    word_input = Input(shape=(rnn_input_len,), name='word_input_layer')
    char_input = Input(shape=(char_maxlen, vocab_size), name='char_input_layer')
    
    word_embeddings = Embedding(input_dim=embed_matrix.shape[0],
                                output_dim=embed_matrix.shape[1],
                                weights=[embed_matrix], 
                                mask_zero=True, 
                                name='word_embedding_layer')(word_input) 

    rnn_output1 = Bidirectional(LSTM(units=n_RNN_nodes, return_sequences=True, 
                                      recurrent_dropout=recurrent_dropout, 
                                      dropout=drop_out, name='hidden_layer1'))(word_embeddings)
    
    rnn_output2 = Bidirectional(LSTM(units=n_RNN_nodes, return_sequences=False, 
                                      recurrent_dropout=recurrent_dropout,
                                      dropout=drop_out, name='hidden_layer2'))(rnn_output1)
            
    conv1 = Conv1D(nb_filter, kernel_size=filter_kernels[0],
                  padding='valid', activation='relu',
                  input_shape=(char_maxlen, vocab_size))(char_input)

    maxpool1 = MaxPool1D(pool_size=pool_size)(conv1)

    conv2 = Conv1D(nb_filter, kernel_size=filter_kernels[1],
                          padding='valid', activation='relu')(maxpool1)
    maxpool2 = MaxPool1D(pool_size=pool_size)(conv2)

    conv3 = Conv1D(nb_filter, kernel_size=filter_kernels[2],
                          padding='valid', activation='relu')(maxpool2)

    conv4 = Conv1D(nb_filter, kernel_size=filter_kernels[3],
                          padding='valid', activation='relu')(conv3)

    maxpool3 = MaxPool1D(pool_size=pool_size)(conv4)
    cnn_output = Flatten()(maxpool3)

    merged_layer = concatenate([cnn_output, rnn_output2])
    
    dense_layer1 = Dense(n_dense_nodes, activation='relu', name='dense_layer')(merged_layer)
    drop_out1 = Dropout(drop_out)(dense_layer1)
    dense_layer2 = Dense(n_dense_nodes, activation='relu')(drop_out1)
    drop_out2 = Dropout(drop_out)(dense_layer2)
    
    main_output = Dense(n_out, activation='softmax', name='output_layer')(drop_out2)

    model = Model(inputs=[word_input, char_input], outputs=[main_output])

    model.compile(loss='categorical_crossentropy', optimizer="adam", 
                  metrics=['accuracy', recall, precision])    

    return model 

In [58]:
cnn_rnn_model = create_cnn_rnn_model(rnn_input_len=train_data.shape[-1], 
                                     char_maxlen=char_maxlen, 
                                     vocab_size=vocab_size,
                                     embed_matrix=embedding_matrix, 
                                     n_RNN_nodes=500,
                                     nb_filter=nb_filter, 
                                     filter_kernels=filter_kernels,
                                     pool_size=pool_size, 
                                     n_dense_nodes=400,
                                     recurrent_dropout=0.4, 
                                     drop_out=.5, 
                                     n_out=n_out)

In [59]:
cnn_rnn_res = train_and_test_model(cnn_rnn_model, 
                               [train_data, trainCharData[:, :, 1:]],
                               y_train_labels, 
                               [test_data, testCharData[:, :, 1:]],
                               y_test_labels, 
                               'cnn_rnn_model',
                               epochs=nb_epoch)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input_layer (InputLayer)   (None, 1024, 92)     0                                            
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 1018, 128)    82560       char_input_layer[0][0]           
__________________________________________________________________________________________________
max_pooling1d_7 (MaxPooling1D)  (None, 203, 128)     0           conv1d_9[0][0]                   
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 199, 128)     82048       max_pooling1d_7[0][0]            
__________________________________________________________________________________________________
max_poolin

## 4. Compare performance of all of models in a table (precision and recall)

In [60]:
pd.DataFrame.from_records([cnn_res[1:4], rnn_res[1:4], cnn_rnn_res[1:4]], 
                          columns=['accuracy', 'recall', 'precision'], 
                         index=['cnn_mod', 'rnn_mod', 'cnn_rnn_mod'])

Unnamed: 0,accuracy,recall,precision
cnn_mod,0.500702,0.500702,0.500702
rnn_mod,0.501404,0.501404,0.501404
cnn_rnn_mod,0.500351,0.500351,0.500351


## 5. Look at your labeling and print out the underlying data compared to the labels - for each model print out 2-3 examples of a good classification and a bad classification. Make an assertion why your model does well or poorly on those outputs.

In [61]:
def print_classifications(classifications, classType, test_y, test_text):
    texts = [''.join(sent) for sent in test_text[classifications]]
    stock_movements = np.where(test_y[classifications], 'positive', 'negative')
    print('Examples of {} predictions:\n'.format(classType))
    for i in range(len(texts)):
        print('Stock movement was {}'.format(stock_movements[i]))
        print('News info:\n{}'.format(texts[i]))
        print('')

In [62]:
def predict_and_print_samples(model, modelName, test_x, test_y=y_test, test_text = X_test):
    """"Print out predictions of the model"""
    print('Stats for {} model'.format(modelName))
    res = model.predict(test_x)
    class_res = np.apply_along_axis(np.argmax, axis=1, arr=res)
    comparisons = class_res == test_y
    comparisons = pd.DataFrame(comparisons)
    good_class = comparisons.loc[comparisons[0] == True].index[0:3]
    bad_class = comparisons.loc[comparisons[0] == False].index[0:3]
    print_classifications(good_class, 'correct', test_y, test_text)
    print_classifications(bad_class, 'INcorrect', test_y, test_text)
    y_test_df = pd.DataFrame(y_test)
    top3MostProbPosArg = np.argsort(res[:, 1])[-3:]
    top3Y = y_test_df.iloc[top3MostProbPosArg]
    top3Probs = pd.Series(res[top3MostProbPosArg, 1], index=top3Y.index)
    top3Data = pd.concat([top3Y, top3Probs], axis=1)
    top3Data.columns = ['Actual', 'PositiveProb']
    print('')
    print('Top 3 Most Positive Probability:')
    print(top3Data)

In [63]:
predict_and_print_samples(rnn_res[0], 'RNN', test_data)
predict_and_print_samples(cnn_res[0], 'CNN', testCharData[:, :, 1:])
predict_and_print_samples(cnn_rnn_res[0], 'CNN_RNN', [test_data, testCharData[:, :, 1:]])

Stats for RNN model
Examples of correct predictions:

Stock movement was negative
News info:
Google refuses French order to apply 'right to be forgotten' globally 

Stock movement was negative
News info:
Disney axes Marvel's marketing department: report 

Stock movement was negative
News info:
Tech firms write to United States FCC to oppose 'net neutrality' plan 

Examples of INcorrect predictions:

Stock movement was negative
News info:
Futures advance with UMich data leading indicators due 

Stock movement was negative
News info:
Bristol-Myers receives positive CHMP opinion for Opdivo 

Stock movement was negative
News info:
Fondiaria-SAI says sold stake in Mediobanca 


Top 3 Most Positive Probability:
      Actual  PositiveProb
1907     1.0      0.532536
1983     1.0      0.538668
2394     0.0      0.544906
Stats for CNN model
Examples of correct predictions:

Stock movement was positive
News info:
Deals of the day-Mergers and acquisitions 

Stock movement was positive
News info:
J