# Stock market predictions based on news
The prediction is based on each day's news headlines, which are related to that day's price movement (1 = up, 0 = down). The model works as a binary classifier. For this program to work, the DJIA.csv file must be uploaded before starting.

In [None]:
#install libraries
!pip install keras
!pip install nltk
!pip install numpy
!pip install pandas
!pip install sklearn

In [None]:
#import and download required libraries
from numpy import array
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout, LSTM
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#download only the NLTK resources this notebook uses instead of the complete collection:
#punkt/punkt_tab for word_tokenize (which of the two is looked up depends on the NLTK release),
#stopwords for the stop-word list and vader_lexicon for SentimentIntensityAnalyzer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'vader_lexicon'):
  nltk.download(nltk_resource)

In [None]:
#load the DJIA news file from the working directory
dataframe = pd.read_csv(filepath_or_buffer='DJIA.csv')
#preview the first rows
dataframe.head(n=5)

In [4]:
#split data into features (headlines/corpus) and labels (stock movement classified as up and down)
#columns from the third one onwards are treated as headlines; 'Headlines' itself is skipped so that
#re-running this cell does not join the already combined text back into itself
headline_columns = [col for col in dataframe.columns[2:] if col != 'Headlines']
#dropna skips the last columns, which have nan values on some rows
dataframe['Headlines'] = dataframe[headline_columns].apply(lambda row: '. '.join(row.dropna().astype(str)), axis=1)
labels = dataframe['Label']
#replace every non-letter character (punctuation, digits) with a space and make lowercase
#the cleaned text is a new Series, so the result does not depend on whether an in-place change
#would propagate to (or be blocked by) the dataframe column
corpus = dataframe['Headlines'].replace("[^a-zA-Z]", " ", regex=True).str.lower()

In [None]:
#score every line of the corpus (all news of a day) with VADER
sia = SentimentIntensityAnalyzer()
#one polarity dictionary per line, in corpus order
results = [sia.polarity_scores(line) for line in corpus]
results

In [None]:
#tabulate the polarity dictionaries and keep only the compound column
sentiment_table = pd.DataFrame(results)
score = sentiment_table['compound']
#correlation between the compound sentiment and the up/down labels
score.corr(labels)

The correlation value will be around 0, meaning that there is no correlation between the sentiment of the news and the stock movement. A reasonable approach is therefore to build a model that works out the movement directly from the text rather than from the sentiment score.

In [7]:
#the following cells work on a numpy array of labels and a plain python list of lines
labels = labels.to_numpy()
corpus = corpus.to_list()

In [8]:
#english stop words plus the letter 'b', which appears at the beginning of some headlines and is redundant
stop_words = {*nltk.corpus.stopwords.words('english'), "b"}

In [9]:
#drop the stop words from every line and re-join the remaining tokens with single spaces
corpus_without_sw = [
  " ".join(token for token in word_tokenize(sent) if token not in stop_words)
  for sent in corpus
]

In [10]:
#build the word index from the filtered corpus
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts=corpus_without_sw)

In [11]:
#vocabulary size used by the first layer of the model: number of indexed words plus one
vocab_length = 1 + len(word_tokenizer.word_index)

In [12]:
#replace the words of every line by their integer indices, giving one list of indices per line
embedded_sentences = word_tokenizer.texts_to_sequences(texts=corpus_without_sw)

In [13]:
#get the size of the largest line (in number of encoded words) and pad the other lines with zeros at the end until they reach that size
#the size is read from the sequences that are actually padded, so it always matches them
#and the corpus does not have to be tokenized a second time
sequence_lengths = [len(sequence) for sequence in embedded_sentences]
length_long_sentence = max(sequence_lengths)
#text of the (first) longest line
longest_sentence = corpus_without_sw[sequence_lengths.index(length_long_sentence)]

padded_sentences = pad_sequences(embedded_sentences, maxlen=length_long_sentence, padding='post')

For testing, any headlines that are used must be padded or truncated to length_long_sentence tokens, because the neural network expects inputs of that fixed length.

In [14]:
#hold out 30% of the samples for testing, the rest is used for training
split_parts = train_test_split(padded_sentences, labels, test_size=0.3)
x_train, x_test, y_train, y_test = split_parts

In [None]:
#create a model: embedding -> flatten -> one sigmoid unit for the up/down decision
model = Sequential()
#every word index is mapped to a vector of size 20
model.add(Embedding(vocab_length, 20, input_length=length_long_sentence))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

In [None]:
#train on the training split for 50 epochs, showing the progress of every epoch
model.fit(x=x_train, y=y_train, epochs=50, verbose=1)

In [None]:
#evaluate on the test split and print the accuracy as a percentage
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=0)
print(100 * accuracy)

In [None]:
#create a confusion matrix for the prediction results
from sklearn.metrics import confusion_matrix
#threshold the sigmoid outputs at 0.5 to obtain the 0/1 class of every test sample
#(model.predict_classes is no longer available in recent Keras releases)
y_pred = (model.predict(x_test) > 0.5).astype("int32").ravel()
confusion_matrix(y_test, y_pred)