In [1]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path

%matplotlib inline

In [2]:
# Import the dataset
file_path = Path("Resources/Stocks_News_Sentiment.csv")
Sentiment_df = pd.read_csv(file_path)
Sentiment_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,date,title,description,text,language,title_compound,title_pos,title_neu,title_neg,title_sent,text_compound,text_pos,text_neu,text_neg,text_sent
0,0,0,2021-03-05T16:22:19Z,The technology selloff is getting to be somewh...,"Tech stocks are getting hammered today, with p...","Tech stocks are getting hammered today, with p...",en,0.0,0.0,1.0,0.0,0,0.2516,0.145,0.763,0.092,1
1,1,1,2021-04-02T13:30:00Z,3 Pet Stocks You’ll Love As Much As Your Furry...,Pet ownership is up and so are the stocks of m...,This story originally appeared on MarketBeatIt...,en,0.8074,0.477,0.523,0.0,1,0.6369,0.107,0.893,0.0,1
2,2,2,2021-03-22T11:18:23Z,US STOCKS-Futures point to gains for tech-rela...,US STOCKS-Futures point to gains for tech-rela...,FILE PHOTO: People are seen on Wall St. outsid...,en,0.5994,0.308,0.692,0.0,1,0.0,0.0,1.0,0.0,0
3,3,3,2021-03-10T08:28:00Z,"European shares slip as miners, travel stocks ...",European stocks pulled back on Wednesday after...,By Reuters Staff\r\nFILE PHOTO: The German sha...,en,0.296,0.216,0.784,0.0,1,0.296,0.064,0.936,0.0,1
4,4,4,2021-03-12T01:00:00Z,RPT-COLUMN-Global oil inventories to become ti...,Global refineries will increase crude processi...,(Repeats Thursdays column with no changes to t...,en,0.0,0.0,1.0,0.0,0,-0.296,0.0,0.932,0.068,-1


# Data Preprocessing

In [3]:
# Creating the X and y vectors
X = Sentiment_df["text"].values
y = Sentiment_df["text_sent"].values

In [4]:
# Import Keras modules for data encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Create an instance of the Tokenizer and fit it with the X text data
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(X)

In [6]:
# Print the first five elements of the encoded vocabulary
for token in list(tokenizer.word_index)[:5]:
    print(f"word: '{token}', token: {tokenizer.word_index[token]}")

word: 'reuters', token: 1
word: 'the', token: 2
word: 'chars', token: 3
word: 'on', token: 4
word: 'a', token: 5


In [7]:
# Transform the text data to numerical sequences
X_seq = tokenizer.texts_to_sequences(X)

# Contrast a sample numerical sequence with its text version
print("**Text comment**")
print({X[0]})
print("**Numerical sequence representation**")
print(X_seq[0])

**Text comment**
{'Tech stocks are getting hammered today, with previously high-flying shares of software companies taking even more damage.\r\nFor a sector that has enjoyed a year in the sun, recent trading sessions hav… [+2674 chars]'}
**Numerical sequence representation**
[98, 9, 33, 99, 100, 101, 29, 102, 103, 104, 22, 10, 105, 106, 107, 56, 108, 109, 12, 41, 5, 110, 57, 58, 111, 5, 112, 6, 2, 113, 59, 114, 115, 116, 117, 3]


In [8]:
# Padding sequences
X_pad = pad_sequences(X_seq, maxlen=140, padding="post")

In [9]:
# Creating training, validation, and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, random_state=78)

# Build and Train the LSTM RNN Model

In [10]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [11]:
# Model set-up
vocabulary_size = len(tokenizer.word_counts.keys()) + 1
max_words = 140
embedding_size = 64

In [12]:
# Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

# Layer 2
model.add(LSTM(units=280))

# Output layer
model.add(Dense(1, activation="sigmoid"))

In [13]:
# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam"
)

In [14]:
# Summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 64)           22912     
_________________________________________________________________
lstm (LSTM)                  (None, 280)               386400    
_________________________________________________________________
dense (Dense)                (None, 1)                 281       
Total params: 409,593
Trainable params: 409,593
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Training the model
batch_size = 1000
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ba546792c8>

In [16]:
# Make sentiment predictions
predicted = model.predict_classes(X_test[:10]) 



In [17]:
# The table below compares the actual text (not the sequences) from the original dataframe to the predicted values
# For that purpose we need to apply train_test_split with the same random state to the original X and save it as  X_test_original (we don't need the other values)
_, X_test_original, _, _ = train_test_split(X, y, random_state=78)

In [18]:
# Create a DataFrame of Real and Predicted values
sentiments = pd.DataFrame({"Text": X_test_original[:10], "Actual": y_test[:10], "Predicted": predicted.ravel()})
sentiments

Unnamed: 0,Text,Actual,Predicted
0,* MSCI EM stocks index up 0.6%; Fed vows to st...,-1,0
1,By Reuters Staff\r\nFILE PHOTO: The German sha...,0,0
2,FILE PHOTO: People are seen on Wall St. outsid...,0,0
3,By Reuters Staff\r\nFILE PHOTO: The German sha...,1,0
4,This story originally appeared on MarketBeatIt...,1,0


In [22]:
# Accuracy
from sklearn.metrics import accuracy_score

print("RNN LSTM Accuracy %.2f" % (accuracy_score(y_test, predicted)))

RNN LSTM Accuracy 0.40


In [24]:
# Import the confusion_matrix method from sklearn
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predicted)
print("Confusion Matrix from the RNN LSTM Model")
display(cm)

Confusion Matrix from the RNN LSTM Model


array([[0, 1, 0],
       [0, 2, 0],
       [0, 2, 0]], dtype=int64)

In [23]:
# Import the classification_report method from sklearn
from sklearn.metrics import classification_report

# Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
print(classification_report(predicted, y_test))

Classification Report for the RNN LSTM Model
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       1.00      0.40      0.57         5
           1       0.00      0.00      0.00         0

    accuracy                           0.40         5
   macro avg       0.33      0.13      0.19         5
weighted avg       1.00      0.40      0.57         5



  _warn_prf(average, modifier, msg_start, len(result))
