In [25]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
import tensorflow as tf


%matplotlib inline

In [26]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [27]:
# Pin the NumPy and TensorFlow random seeds so repeated runs are comparable.
# Note: `random` below is tensorflow's random module, not the standard-library one.
from numpy.random import seed
from tensorflow import random

seed(1)
random.set_seed(2)

In [28]:
 # Load the dataset
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook will not
# run on another machine until it is replaced by a path relative to the project — confirm
# where the notebook lives relative to the Resources folder before changing it.
file_path = Path("C:\\Users\\Zach.000\\Documents\\SMUFinTech\\GitHub\\Portfolio\\project-3\\Resources\\elonmusk_tweets.csv")
df_tweets = pd.read_csv(file_path)
# Preview the first ten rows of the loaded tweets.
df_tweets.head(10)

Unnamed: 0,date,text
0,2020-08-16T08:42:01.000Z,We must pass The Great Filter
1,2020-10-09T20:00:00.000Z,“The future of cars can and will be electric i...
2,2020-10-08T14:36:14.000Z,We put the 2020 @Tesla Model Y Long Range elec...
3,2020-10-07T22:47:59.000Z,"Starman, last seen leaving Earth, made its fir..."
4,2020-10-06T12:33:30.000Z,Deployment of 60 Starlink satellites confirmed
5,2020-10-06T11:44:54.000Z,4th flight & landing for this booster
6,2020-10-06T11:24:42.000Z,5 minutes from launch. Looks good so far.
7,2020-10-05T23:42:20.000Z,Turn volume to 11 & play Powerglide in your Tesla
8,2020-10-05T23:22:49.000Z,"Music volume on a Tesla goes to 11, because it..."
9,2020-10-05T22:38:33.000Z,Rewatched Young Frankenstein this weekend. Sti...


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Zach.000\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [30]:
# Create the TESLA Tweet sentiment scores DataFrame
#tweet_sentiments = []

#for text in df_tweets["text"]:
   # try:
    #    text = df_tweets["text"]
     #   date = df_tweets['date']
      #  sentiment = analyzer.polarity_scores(text)
       # compound = sentiment["compound"]
        #pos = sentiment["pos"]
        #neu = sentiment["neu"]
        #neg = sentiment["neg"]
        
        #tweet_sentiments.append({
         #   "text": text,
          #  "date": date,
           # "compound": compound,
            #"positive": pos,
            #"negative": neg,
            #"neutral": neu
            
        #})
        
   # except AttributeError:
    #    pass

In [21]:
# Create DataFrame
#sentiment_df = pd.DataFrame(tweet_sentiments)

#sentiment_df

In [18]:
# Reorder DataFrame columns
#cols = ["date", "text", "compound", "positive", "negative", "neutral"]


In [32]:
#sentiment_df = sentiment_df[cols]



In [None]:
# Creating the X and y vectors
# NOTE(review): `reviews_df` is not created in any visible cell (only `df_tweets` is loaded),
# so on a fresh kernel this cell fails with NameError unless the frame is defined elsewhere —
# confirm which DataFrame supplies the "full_review_text" and "sentiment" columns.
X = reviews_df["full_review_text"].values
y = reviews_df["sentiment"].values

In [None]:
# Split the raw text into train / validation / test sets in two passes
from sklearn.model_selection import train_test_split

# Pass 1: hold out the test portion.
X_remaining, X_test, y_remaining, y_test = train_test_split(X, y)

# Pass 2: divide what is left into the training and validation portions.
X_train, X_val, y_train, y_val = train_test_split(X_remaining, y_remaining)

In [None]:
# Download/Update the VADER Lexicon
# Must run before the analyzer is created, since the analyzer is built on this lexicon.
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
# `analyzer` is reused by the scoring loop further down.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Containers for the VADER results: hard 0/1 predictions and the raw "pos" scores
y_vader_pred, y_vader_prob = [], []

In [None]:
 # Score sentiment of test set using Vader
# Empty the result lists first so re-running this cell does not append a second copy of
# every score (the lists are created in a separate cell).
y_vader_prob.clear()
y_vader_pred.clear()

for comment in X_test:
    # Score each comment once and reuse the result for both outputs.
    scores = analyzer.polarity_scores(comment)
    # The "pos" score is kept as the continuous score used later for the ROC curve.
    y_vader_prob.append(scores["pos"])
    # A compound score of at least 0.1 is labelled positive (1), anything else negative (0).
    y_vader_pred.append(1 if scores["compound"] >= 0.1 else 0)

In [None]:
# Option 1: rescale the VADER "pos" scores with sklearn's MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# The scaler expects a 2-D input, so reshape the scores into a single column once.
vader_prob_column = np.array(y_vader_prob).reshape(-1, 1)

scaler = MinMaxScaler()
scaler.fit(vader_prob_column)
y_vader_prob_norm = scaler.transform(vader_prob_column)

# Peek at the first five normalised values.
y_vader_prob_norm[:5]

In [None]:
# Option 2: Using a comprehension list
# The minimum and range are computed once up front instead of once per element.
# `default=0.0` keeps the empty-list case returning [] as before.
prob_min = min(y_vader_prob, default=0.0)
prob_range = max(y_vader_prob, default=0.0) - prob_min

# If every score is identical the range is 0; divide by 1 instead so each value maps to 0.0
# rather than raising ZeroDivisionError.
normalized = [(x - prob_min) / (prob_range or 1.0) for x in y_vader_prob]
normalized[:5]

In [None]:
 # Import Keras modules for data encoding
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Create an instance of the Tokenizer and fit it with the X text data
# lower=True asks the tokenizer to lowercase the text.
tokenizer = Tokenizer(lower=True)
# NOTE(review): the vocabulary is fitted on all of X, before the encoded data is split into
# train/validation/test, so words that occur only in test comments also get an index —
# confirm this is intended.
tokenizer.fit_on_texts(X)

In [None]:
 # Print the first five elements of the encoded vocabulary
# Walk the first five (word, index) pairs of the fitted vocabulary.
for word, index in list(tokenizer.word_index.items())[:5]:
    print(f"word: '{word}', token: {index}")

In [None]:
# Transform the text data to numerical sequences
X_seq = tokenizer.texts_to_sequences(X)

# Contrast a sample numerical sequence with its text version
print("**Text comment**")
# Print the text itself; the previous `print({X[0]})` built a one-element set and
# printed its repr (braces and quotes) instead of the comment.
print(X[0])
print("**Numerical sequence representation**")
print(X_seq[0])

In [None]:
# Import the pad_sequences method from Keras
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Padding sequences
# Sequences are brought to a common length of 140, with padding added at the end ("post").
# NOTE(review): 140 duplicates `max_words`, which is only defined in a later cell and is
# passed to the Embedding layer as input_length — keep the two values in sync.
X_pad = pad_sequences(X_seq, maxlen=140, padding="post")

In [None]:
 # Creating training, validation, and testing sets using the encoded data
# Pass 1: hold out the test portion of the padded sequences.
X_remaining_rnn, X_test_rnn, y_remaining_rnn, y_test_rnn = train_test_split(X_pad, y)

# Pass 2: divide what is left into the training and validation portions.
X_train_rnn, X_val_rnn, y_train_rnn, y_val_rnn = train_test_split(
    X_remaining_rnn, y_remaining_rnn
)

In [None]:
# Import Keras modules for model creation
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Hyperparameters consumed by the Embedding layer of the model
embedding_size = 64
max_words = 140
# One slot per word seen by the tokenizer, plus one extra
# (presumably for index 0, which padding uses — confirm).
vocabulary_size = len(tokenizer.word_counts) + 1

In [None]:
 # Define the LSTM RNN model
# Declare the whole stack in one go: embedding -> LSTM -> single sigmoid unit.
model = Sequential(
    [
        # Layer 1: token-index embedding
        Embedding(vocabulary_size, embedding_size, input_length=max_words),
        # Layer 2: recurrent layer
        LSTM(units=280),
        # Output layer: one sigmoid unit for the binary sentiment label
        Dense(units=1, activation="sigmoid"),
    ]
)

In [None]:
 # Compile the model
# Confusion-matrix counts and ranking metrics tracked alongside plain accuracy.
extra_metrics = [
    tf.keras.metrics.TruePositives(name="tp"),
    tf.keras.metrics.TrueNegatives(name="tn"),
    tf.keras.metrics.FalsePositives(name="fp"),
    tf.keras.metrics.FalseNegatives(name="fn"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.Recall(name="recall"),
    tf.keras.metrics.AUC(name="auc"),
]

# Binary cross-entropy matches the single sigmoid output of the model.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", *extra_metrics],
)

In [None]:
# Show model summary
# Prints Keras's textual description of the model defined and compiled above.
model.summary()

In [None]:
# Fit the LSTM on the encoded training data, monitoring the validation split each epoch
epochs = 10
batch_size = 1000

model.fit(
    x=X_train_rnn,
    y=y_train_rnn,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_val_rnn, y_val_rnn),
)

In [None]:
 # Predict classes using the testing data
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single sigmoid output
# the equivalent is to threshold the predicted probability at 0.5 and cast to int.
y_rnn_pred = (model.predict(X_test_rnn, batch_size=1000) > 0.5).astype("int32")

In [None]:
 # Accuracy
from sklearn.metrics import accuracy_score

# Each model is scored against the labels of its own test split.
vader_accuracy = accuracy_score(y_test, y_vader_pred)
rnn_accuracy = accuracy_score(y_test_rnn, y_rnn_pred)

print("Vader Accuracy: %.2f" % vader_accuracy)
print("RNN LSTM Accuracy %.2f" % rnn_accuracy)

In [None]:
# Import the confusion_matrix method from sklearn
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix metrics from Vader
# For binary labels, ravel() flattens the 2x2 matrix in the order TN, FP, FN, TP.
vader_counts = confusion_matrix(y_test, y_vader_pred).ravel()
tn_vader, fp_vader, fn_vader, tp_vader = vader_counts

# Lay the counts out row by row: rows are the actual class, columns the predicted class.
vader_class_labels = ["Positive(1)", "Negative(0)"]
cm_vader_df = pd.DataFrame(
    [
        [f"TP={tp_vader}", f"FN={fn_vader}"],
        [f"FP={fp_vader}", f"TN={tn_vader}"],
    ],
    index=pd.Index(vader_class_labels, name="Actual"),
    columns=pd.Index(vader_class_labels, name="Predicted"),
)
print("Confusion Matrix from Vader")
display(cm_vader_df)

In [None]:
# Confusion matrix metrics from the RNN LSTM model
# For binary labels, ravel() flattens the 2x2 matrix in the order TN, FP, FN, TP.
rnn_counts = confusion_matrix(y_test_rnn, y_rnn_pred).ravel()
tn_rnn, fp_rnn, fn_rnn, tp_rnn = rnn_counts

# Lay the counts out row by row: rows are the actual class, columns the predicted class.
rnn_class_labels = ["Positive(1)", "Negative(0)"]
cm_rnn_df = pd.DataFrame(
    [
        [f"TP={tp_rnn}", f"FN={fn_rnn}"],
        [f"FP={fp_rnn}", f"TN={tn_rnn}"],
    ],
    index=pd.Index(rnn_class_labels, name="Actual"),
    columns=pd.Index(rnn_class_labels, name="Predicted"),
)
print("Confusion Matrix from the RNN LSTM Model")
display(cm_rnn_df)

In [None]:
 # Import the classification_report method from sklearn
from sklearn.metrics import classification_report

In [None]:
# Display classification report for Vader
print("Classification Report for Vader")
# classification_report expects (y_true, y_pred). The arguments were previously reversed,
# which swaps precision with recall and reports support for the predictions instead of
# the true labels.
print(classification_report(y_test, y_vader_pred))

In [None]:
 # Display classification report for the RNN LSTM Model
print("Classification Report for the RNN LSTM Model")
# classification_report expects (y_true, y_pred). The arguments were previously reversed,
# which swaps precision with recall and reports support for the predictions instead of
# the true labels.
print(classification_report(y_test_rnn, y_rnn_pred))

In [None]:
# Import the roc_curve and auc metrics from sklearn
from sklearn.metrics import roc_curve, auc

In [None]:
 # Data for ROC Curve - VADER
# ROC points for VADER, built from the min-max-normalised "pos" scores of the test comments.
# NOTE(review): the 0/1 VADER predictions are thresholded on the "compound" score, while
# this curve ranks by the "pos" score — confirm that using two different scores is intended.
fpr_test_vader, tpr_test_vader, thresholds_test_vader = roc_curve(y_test, y_vader_prob_norm)

In [None]:
 # AUC for VADER
# Area under the VADER ROC curve, rounded to four decimals for display in the plot title.
auc_test_vader = round(auc(fpr_test_vader, tpr_test_vader), 4)

In [None]:
# Collect the VADER ROC points into a two-column frame for plotting
roc_df_test_vader = pd.DataFrame(
    data={
        "FPR Test": fpr_test_vader,
        "TPR Test": tpr_test_vader,
    }
)

In [None]:
# Dashed red ROC curve for VADER; the x-range is widened slightly so the end points are visible.
roc_df_test_vader.plot(
    x="FPR Test",
    y="TPR Test",
    title=f"Test ROC Curve - Vader (AUC={auc_test_vader})",
    xlim=[-0.05, 1.05],
    style="--",
    color="red",
)

In [None]:
# Making predictions to feed the roc_curve module
# The raw outputs of the sigmoid output layer are kept un-thresholded here; they are the
# scores passed to roc_curve in the next cell.
test_predictions_rnn = model.predict(X_test_rnn, batch_size=1000)

In [None]:
 # Data for ROC Curve - RNN LSTM Model
# ROC points for the LSTM: its predicted scores compared against the RNN test-split labels.
fpr_test_rnn, tpr_test_rnn, thresholds_test_rnn = roc_curve(y_test_rnn, test_predictions_rnn)

In [None]:
# Area under the LSTM ROC curve, rounded to four decimals for display in the plot title
auc_test_rnn = round(auc(fpr_test_rnn, tpr_test_rnn), 4)

In [None]:
# Collect the LSTM ROC points into a two-column frame for plotting
roc_df_test_rnn = pd.DataFrame(
    data={
        "FPR Test": fpr_test_rnn,
        "TPR Test": tpr_test_rnn,
    }
)

In [None]:
# Dashed blue ROC curve for the LSTM; the x-range is widened slightly so the end points are visible.
roc_df_test_rnn.plot(
    x="FPR Test",
    y="TPR Test",
    title=f"Test ROC Curve (AUC={auc_test_rnn})",
    xlim=[-0.05, 1.05],
    style="--",
    color="blue",
)