This is a stock price prediction project that closely follows this website:https://medium.com/@Currie32/predicting-the-stock-market-with-the-news-and-deep-learning-7fc8f5f639bc
full project github: https://github.com/Currie32/Predicting-the-Dow-Jones-with-Headlines
model description: https://www.aclweb.org/anthology/C16-1229.pdf
Primarily uses Keras

In [34]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re 
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [3]:
dj = pd.read_csv("DowJones.csv")
news = pd.read_csv("RedditNews.csv")

In [4]:
dj.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [5]:
dj.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64

In [6]:
news.isnull().sum()

Date    0
News    0
dtype: int64

In [7]:
news.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [8]:
print(dj.shape)
print(news.shape)

(1989, 7)
(73608, 2)


In [9]:
news = news[news.Date.isin(dj.Date)]

In [10]:
print(len(set(dj.Date)))
print(len(set(news.Date)))

1989
1989


In [11]:
#here the dj becomes the difference between the opening price of the day and the next date
# set_index first so that when .diff calculate the difference in price it won't effect the date
dj = dj.set_index('Date').diff(periods=1)
dj['Date'] = dj.index
dj = dj.reset_index(drop=True)

#drop unnecessary columns
dj = dj.drop(['High','Low','Close','Volume','Adj Close'],1)


In [12]:
dj

Unnamed: 0,Open,Date
0,,2016-07-01
1,-211.480468,2016-06-30
2,-256.740235,2016-06-29
3,-265.509765,2016-06-28
4,164.701172,2016-06-27
...,...,...
1984,-79.139649,2008-08-14
1985,100.739258,2008-08-13
1986,148.890625,2008-08-12
1987,-52.030273,2008-08-11


In [13]:
dj = dj[dj.Open.notnull()]

In [14]:
price = []
headlines = []

#row now is an index series, where the series is of the form [Open, Date] pair, therefore call row[1] to only get the series, which is the value of each row
for row in dj.iterrows():
    daily_headlines = []
    date = row[1]["Date"]
    price.append(row[1]['Open'])
    # collect news from the same date and put them into daily_headlines
    for row_ in news[news.Date==date].iterrows():
        daily_headlines.append(row_[1]['News'])
    headlines.append(daily_headlines)

In [15]:
print(len(price))
print(len(headlines))

1988
1988


In [16]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [17]:
def clean_text(text, remove_stopwords = True):
    
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'0,0', '00', text) 
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    text = re.sub(r'u s ', ' united states ', text)
    text = re.sub(r'u n ', ' united nations ', text)
    text = re.sub(r'u k ', ' united kingdom ', text)
    text = re.sub(r'j k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    return text


In [18]:
# Clean the headlines
clean_headlines = []

for daily_headlines in headlines:
    clean_daily_headlines = []
    for headline in daily_headlines:
        clean_daily_headlines.append(clean_text(headline))
    clean_headlines.append(clean_daily_headlines)


In [19]:
clean_headlines[0]

['jamaica proposes marijuana dispensers tourists airports following legalisation kiosks desks would give people license purchase 2 ounces drug use stay',
 'stephen hawking says pollution stupidity still biggest threats mankind certainly become less greedy less stupid treatment environment past decade',
 'boris johnson says run tory party leadership',
 'six gay men ivory coast abused forced flee homes pictured signing condolence book victims recent attack gay nightclub florida',
 'switzerland denies citizenship muslim immigrant girls refused swim boys report',
 'palestinian terrorist stabs israeli teen girl death bedroom',
 'puerto rico default $ 1 billion debt friday',
 'republic ireland fans awarded medal sportsmanship paris mayor',
 'afghan suicide bomber kills 40 bbc news',
 'us airstrikes kill least 250 isis fighters convoy outside fallujah official says',
 'turkish cop took istanbul gunman hailed hero',
 'cannabis compounds could treat alzheimer removing plaque forming proteins br

In [20]:
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

for date in clean_headlines:
    for headline in date:
        for word in headline.split():
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 35190


In [22]:
# load GLoVe's embeddings
embeddings_index = {}
with open('C:/Users/edwar/Projects/Stock Prediction/glove.840B.300d.txt', encoding ='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
    f.close()        
print("word embeddings:", len(embeddings_index))

word embeddings: 2196016


In [23]:
# Find the number of words that are missing from GloVe, and are used more than our threshold.
missing_words = 0
threshold = 10

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from GloVe: 47
Percent of words that are missing from vocabulary: 0.13%


In [24]:
# Use only the words that appear more than a set threashold or that appears in GLoVe.
# creates two dictionary that gives indexing value to each word that is used
 
value = 0
vocab_to_int = {}

for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value 
        value += 1
        
# Special tokens that will be added to our vocab <UNK> = unknown, <PAD> = 
codes = ["<UNK>","<PAD>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total Number of Unique Words:", len(word_counts))
print("Number of Words we will use:", len(vocab_to_int))
print("Percent of Words we will use: {}%".format(usage_ratio))    


Total Number of Unique Words: 35190
Number of Words we will use: 31265
Percent of Words we will use: 88.85%


In [25]:
for code in codes:
    print(vocab_to_int[code])

31263
31264


In [26]:
#Create New embeddings for words in word_counts that are not in embedding_index 

embedding_dim = 300

#total number of words to use in training

nb_words = len(vocab_to_int)

#create a embedding matrix, each row is for a word, with total of 300 columns for the dimension of the GloVe vectors

word_embedding_matrix = np.zeros((nb_words,embedding_dim))

for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        #create a new embedding vectors with random values for words not in GLoVe embedding
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        # we also add this newly generated random embedding to the embedding_index along with the rest of the GloVe embedding
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding
        
print(len(word_embedding_matrix))

31265


All the words in the embedding "including the pretrained ones" will be updated as the model train. 

In [27]:
# now to create a replicated headlines where all the words are transformed into their regarding index in vocab_to_int 
# int_headlines: all the headlines
# int_daily_headlines: all the headlines in a day
# int_headline: single headline in a day 
# int_headlines[int_daily_headlines[int_headline]]

# for words in the headlines that are not in vocab_to_int 
# (meaning the word didn't hit the threshold and isn't in the pretrained embedding index)
# we replace it with the <UNK> token for "unknown"
word_count = 0
unk_count = 0 

int_headlines = []

for date in clean_headlines:
    int_daily_headlines = []
    for headline in date:
        int_headline = []
        for word in headline.split():
            word_count += 1
            if word in vocab_to_int:
                int_headline.append(vocab_to_int[word])
            else:
                int_headline.append(vocab_to_int["<UNK>"])
                unk_count += 1
        int_daily_headlines.append(int_headline)
    int_headlines.append(int_daily_headlines)

unk_percent = round((unk_count/word_count)*100,4)

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))





Total number of words in headlines: 615989
Total number of UNKs in headlines: 5262
Percent of words that are UNK: 0.8542%


In [28]:
#length of individual headline
lengths = []
for date in int_headlines:
    for headline in date:
        lengths.append(len(headline))

lengths = pd.DataFrame(lengths, columns=['count'])

In [29]:
lengths.describe()

Unnamed: 0,count
count,49693.0
mean,12.395891
std,6.790246
min,1.0
25%,7.0
50%,10.0
75%,16.0
max,41.0


In [30]:
# limiting the length of total words per day to 200 words
# and headlines to 16 words to reduce training time

max_headline_length = 16
max_daily_length = 200
# pad_headlines only divides up the news up to date, not individual news (2D)
# for news length that does not reach the max_daily_length, we pad it with the <PAD> token 

pad_headlines = []

for date in int_headlines:
    pad_daily_headlines = []
    for headline in date:
        #add the headline to pad daily headline if headline is less than max length
        if len(headline) <= max_headline_length:
            for word in headline:
                #needs to append word by word so pad_daily_headline is 1D
                pad_daily_headlines.append(word)
        else:
            headline = headline[:max_headline_length]
            for word in headline:
                pad_daily_headlines.append(word)
    if len(pad_daily_headlines) < max_daily_length:
        for i in range(max_daily_length - len(pad_daily_headlines)):
            pad = vocab_to_int["<PAD>"]
            pad_daily_headlines.append(pad)
    else:
        pad_daily_headlines = pad_daily_headlines[:max_daily_length]
    pad_headlines.append(pad_daily_headlines)
        

In [31]:
# normalize (between [0,1]) the opening prices(target values, response variable)
max_price = max(price)
min_price = min(price)
mean_price = np.mean(price)

def normalize(price):
    return ((price-min_price)/(max_price-min_price))

In [32]:
norm_price = []
for p in price:
    norm_price.append(normalize(p))

In [33]:
# checking normalization
print(min(norm_price))
print(max(norm_price))
print(np.mean(norm_price))

0.0
1.0
0.5448422454901358


In [36]:
# create train test split


x_train, x_test, y_train, y_test = train_test_split(pad_headlines, norm_price, test_size=0.15, random_state=2)

x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [37]:
# Check the lengths
print(len(x_train))
print(len(x_test))

1689
299
