In [1]:
# Initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from pathlib import Path

%matplotlib inline

In [2]:
# Import the dataset
file_path = Path("Resources/Sentiments.csv")
Sentiment_df = pd.read_csv(file_path)
Sentiment_df.head()

Unnamed: 0.1,Unnamed: 0,text,text_sent
0,0,The crypto bull market remains in full throttl...,1
1,1,"Mairs &amp; Power, an investment management fi...",1
2,2,Nvidia (NVDA) - Get Report has been a bit of ...,1
3,3,You can only hate and love so much. The love f...,1
4,4,Nvidia Corp.s stock and the broader chip secto...,1


# Data Preprocessing

In [3]:
# Creating the X and y vectors
X = Sentiment_df["text"].values
y = Sentiment_df["text_sent"].values

In [4]:
# Import Keras modules for data encoding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Create an instance of the Tokenizer and fit it with the X text data
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(X)

In [6]:
# Print the first five elements of the encoded vocabulary
for token in list(tokenizer.word_index)[:5]:
    print(f"word: '{token}', token: {tokenizer.word_index[token]}")

word: 'the', token: 1
word: 'chars', token: 2
word: 'a', token: 3
word: 'of', token: 4
word: 'to', token: 5


In [7]:
# Transform the text data to numerical sequences
X_seq = tokenizer.texts_to_sequences(X)

# Contrast a sample numerical sequence with its text version
print("**Text comment**")
print({X[0]})
print("**Numerical sequence representation**")
print(X_seq[0])

**Text comment**
{'The crypto bull market remains in full throttle. This has been good news for Nvidia (NVDA). The chip giant sells the GPUs used as crypto mining rigs, providing the company with another stream of reve… [+2648 chars]'}
**Numerical sequence representation**
[1, 553, 1270, 45, 695, 7, 921, 2187, 22, 18, 41, 267, 217, 11, 453, 240, 1, 342, 454, 2188, 1, 1271, 343, 15, 553, 1272, 2189, 2190, 1, 46, 20, 119, 2191, 4, 2192, 2193, 2]


In [8]:
# Padding sequences
X_pad = pad_sequences(X_seq, maxlen=140, padding="post")

In [9]:
# Creating training, validation, and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, random_state=78)

# Build and Train the LSTM RNN Model

In [10]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [11]:
# Model set-up
vocabulary_size = len(tokenizer.word_counts.keys()) + 1
max_words = 140
embedding_size = 64

In [12]:
# Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

# Layer 2
model.add(LSTM(units=280))

# Output layer
model.add(Dense(1, activation="sigmoid"))

In [13]:
# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam"
)

In [14]:
# Summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 64)           333440    
_________________________________________________________________
lstm (LSTM)                  (None, 280)               386400    
_________________________________________________________________
dense (Dense)                (None, 1)                 281       
Total params: 720,121
Trainable params: 720,121
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Training the model
batch_size = 1000
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c8f1b7f388>

In [22]:
# Make sentiment predictions
predicted = model.predict_classes(X_test) 



In [23]:
# The table below compares the actual text (not the sequences) from the original dataframe to the predicted values
# For that purpose we need to apply train_test_split with the same random state to the original X and save it as  X_test_original (we don't need the other values)
_, X_test_original, _, _ = train_test_split(X, y, random_state=78)

In [25]:
# Create a DataFrame of Real and Predicted values
sentiments = pd.DataFrame({"Text": X_test_original, "Actual": y_test, "Predicted": predicted.ravel()})
sentiments

Unnamed: 0,Text,Actual,Predicted
0,"Every week, Benzinga conducts a survey to coll...",1,1
1,(CNN) Photographer Faizan Ahmad had never even...,1,1
2,"New York, March 12, 2021 /PRNewswire/ -- Inves...",-1,1
3,Palantir Technologies Inc (NYSE: PLTR) and Bla...,1,1
4,GettyAny interview conducted with Meghan Markl...,1,1
...,...,...,...
140,"Back in 1921, Dr. Charles P. Steinmetz, the pi...",1,1
141,The global chip shortage has caused havoc for ...,-1,1
142,The Dow Jones Industrial Average rose but the ...,-1,1
143,Police say 21-year-old man charged with 10 mur...,-1,1


In [26]:
# Accuracy
from sklearn.metrics import accuracy_score

print("RNN LSTM Accuracy %.2f" % (accuracy_score(y_test, predicted)))

RNN LSTM Accuracy 0.72


In [27]:
# Import the confusion_matrix method from sklearn
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predicted)
print("Confusion Matrix from the RNN LSTM Model")
display(cm)

Confusion Matrix from the RNN LSTM Model


array([[  0,  40],
       [  0, 105]], dtype=int64)

In [28]:
# Import the classification_report method from sklearn
from sklearn.metrics import classification_report

# Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
print(classification_report(predicted, y_test))

Classification Report for the RNN LSTM Model
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.72      0.84       145

    accuracy                           0.72       145
   macro avg       0.50      0.36      0.42       145
weighted avg       1.00      0.72      0.84       145



  _warn_prf(average, modifier, msg_start, len(result))
