In [1]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf

%matplotlib inline

In [2]:
 # Set the random seed for reproducibility
from numpy.random import seed
seed(1)

from tensorflow import random
random.set_seed(2)

In [3]:
file_path = Path("C:\\Users\\Zach.000\\Documents\\SMUFinTech\\GitHub\\Portfolio\\project-3\\Reddit_Data_Cleaned.csv")
df_reddit = pd.read_csv(file_path)
df_reddit.head(75)

Unnamed: 0.1,Unnamed: 0,subreddit,text,Subjectivity,Polarity,Analysis,date only,hour,date_y,Adj Close,tick
0,16225,wallstreetbets,anyone playing tesla earnings? i keep hearing ...,0.680000,0.086667,Positive,2018-10-22,13,2018-10-22 13:30:00-05:00,52.009998,1.0
1,16226,wallstreetbets,&gt; tesla what news? model 3 ship numbers?,0.000000,0.000000,Neutral,2018-10-22,13,2018-10-22 13:30:00-05:00,52.009998,1.0
2,16227,wallstreetbets,its odd timing since i think tesla will have s...,0.425000,0.038889,Positive,2018-10-22,12,2018-10-22 12:30:00-05:00,51.382000,0.0
3,16228,wallstreetbets,its $tsla so i get it but at the same time dam...,0.375000,0.104167,Positive,2018-10-22,12,2018-10-22 12:30:00-05:00,51.382000,0.0
4,16229,wallstreetbets,man if $tsla could stop burning like a tire fi...,0.750000,0.800000,Positive,2018-10-22,12,2018-10-22 12:30:00-05:00,51.382000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
70,16375,wallstreetbets,tesla needs to push a software update that ena...,0.100000,0.000000,Neutral,2018-10-23,12,2018-10-23 12:30:00-05:00,58.548023,1.0
71,16376,wallstreetbets,i put $370 into tsla yesterday and woke up to ...,1.000000,0.600000,Positive,2018-10-23,12,2018-10-23 12:30:00-05:00,58.548023,1.0
72,16377,wallstreetbets,less and less of what? cars people don't want ...,0.233333,-0.233333,Negative,2018-10-23,12,2018-10-23 12:30:00-05:00,58.548023,1.0
73,16378,wallstreetbets,you guys are really desperate if you have to m...,0.535000,-0.070000,Negative,2018-10-23,12,2018-10-23 12:30:00-05:00,58.548023,1.0


In [4]:
df_reddit['text'] = df_reddit['text'].astype(str)


In [5]:
df_reddit.dtypes

Unnamed: 0        int64
subreddit        object
text             object
Subjectivity    float64
Polarity        float64
Analysis         object
date only        object
hour              int64
date_y           object
Adj Close       float64
tick            float64
dtype: object

In [6]:
df_reddit.shape

(161643, 11)

In [7]:
 # Create the features set (X) and the target vector (y)
X = df_reddit['text'].values
y = df_reddit["tick"].values

In [8]:
 # Import the Tokenizer method from Keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
 # Create an instance of the Tokenizer and fit it with the X text data
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(X)

In [10]:
 # Print the first five elements of the encoded vocabulary
for token in list(tokenizer.word_index)[:5]:
    print(f"word: '{token}', token: {tokenizer.word_index[token]}")

word: 'tsla', token: 1
word: 'the', token: 2
word: 'to', token: 3
word: 'tesla', token: 4
word: 'i', token: 5


In [11]:
# Transform the text data to numerical sequences
X_seq = tokenizer.texts_to_sequences(X)

In [12]:
 # Contrast a sample numerical sequence with its text version
print("**Text comment**")
print({X[0]})

**Text comment**
{'anyone playing tesla earnings? i keep hearing good things but that iv is crazy high. maybe yolo a fd the night before earnings.'}


In [13]:
print("**Numerical sequence representation**")
print(X_seq[0])

**Numerical sequence representation**
[187, 443, 4, 134, 5, 200, 2656, 101, 458, 23, 15, 255, 8, 352, 191, 290, 343, 6, 448, 2, 693, 116, 134]


In [14]:
# Import the pad_sequences method from Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
 # Set the pad size
max_words = 10000

# Pad the sequences using the pad_sequences() method
X_pad = pad_sequences(X_seq, maxlen=max_words, padding="post")

In [16]:
 # Create the train, test, and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)

In [17]:
 # Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [18]:
 # Model set-up
vocabulary_size = len(tokenizer.word_counts.keys()) + 1
embedding_size = 64

In [27]:
 # Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

# Layer 2
model.add(LSTM(units=300))

# Output layer
model.add(Dense(units=1, activation="sigmoid")) 

In [28]:
# Compile the model
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        "accuracy",
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)

In [29]:
 # Show model summary
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 10000, 64)         2715840   
_________________________________________________________________
lstm_4 (LSTM)                (None, 300)               438000    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 301       
Total params: 3,154,141
Trainable params: 3,154,141
Non-trainable params: 0
_________________________________________________________________


In [30]:
 # Training the model
batch_size = 100
epochs = 100
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Epoch 1/100


ResourceExhaustedError:    OOM when allocating tensor with shape[10000,100,300] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node gradients/strided_slice_2_grad/StridedSliceGrad}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[PartitionedCall]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_7232]

Function call stack:
train_function -> train_function -> train_function


In [31]:
# Predict classes using the testing data
y_pred = model.predict_classes(X_test, batch_size=61000)

In [32]:
# Accuracy
from sklearn.metrics import accuracy_score

print("RNN LSTM Accuracy %.2f" %  (accuracy_score(y_test, y_pred)))


In [216]:
 # Import the confusion_matrix method from sklearn
from sklearn.metrics import confusion_matrix

In [34]:
 # Confusion matrtix metrics from the RNN LSTM model
tn_rnn, fp_rnn, fn_rnn, tp_rnn = confusion_matrix(y_test, y_pred).ravel()

# Dataframe to display confusion matrix from the RNN LSTM model
cm_rnn_df = pd.DataFrame(
    {
        "Positive(1)": [f"TP={tp_rnn}", f"FP={fp_rnn}"],
        "Negative(0)": [f"FN={fn_rnn}", f"TN={tn_rnn}"],
    },
    index=["Positive(1)", "Negative(0)"],
)
cm_rnn_df.index.name = "Actual"
cm_rnn_df.columns.name = "Predicted"
print("Confusion Matrix from the RNN LSTM Model")
display(cm_rnn_df)

In [218]:
 # Import the classification_report method from sklearn
from sklearn.metrics import classification_report

In [35]:
# Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
print(classification_report(y_pred, y_test))

In [220]:
 # Import the roc_curve and auc metrics from sklearn
from sklearn.metrics import roc_curve, auc

In [221]:
 # Making predictions to feed the roc_curve module
test_predictions_rnn = model.predict(X_test, batch_size=100000)

In [222]:
# Data for ROC Curve - RNN LSTM Model
fpr_test_rnn, tpr_test_rnn, thresholds_test_rnn = roc_curve(y_test, test_predictions_rnn)

In [223]:
 # AUC for the RNN LSTM Model
auc_test_rnn = auc(fpr_test_rnn, tpr_test_rnn)
auc_test_rnn = round(auc_test_rnn, 4)

In [224]:
 # Dataframe to plot ROC Curve for the RNN LSTM model
roc_df_test_rnn = pd.DataFrame({"FPR Test": fpr_test_rnn, "TPR Test": tpr_test_rnn,})

In [36]:
 roc_df_test_rnn.plot(
    x="FPR Test",
    y="TPR Test",
    color="blue",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve (AUC={auc_test_rnn})",
)