In [None]:
!pip install keras_preprocessing

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [None]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional
from keras.layers import Dropout


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix,roc_curve,auc
from tensorflow.keras.layers import Dense, SimpleRNN, Activation, Input
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import roc_auc_score,precision_score, recall_score, f1_score,accuracy_score

In [None]:
def Evaluation (Method,Comment,Actual, Predicted,Scores):
    """Score one set of binary predictions and record the result in ``Scores``.

    Parameters
    ----------
    Method : label stored under the 'Method' key of the recorded entry.
    Comment : key under which the entry is written into ``Scores``.
    Actual : ground-truth labels, passed as-is to the sklearn metric functions.
    Predicted : predicted labels, passed as-is to the sklearn metric functions.
    Scores : dict that is updated in place; nothing is returned.

    The accuracy is also printed.
    """
    # Metrics are evaluated in this order; the recorded entry uses its own key order below.
    scorers = (
        ('ROC_AUC', roc_auc_score),
        ('Precision', precision_score),
        ('Accuracy', accuracy_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    computed = {}
    for metric_name, scorer in scorers:
        computed[metric_name] = scorer(Actual, Predicted)

    print('Accuracy=', computed['Accuracy'])

    entry = {'Method': Method}
    for metric_name in ('ROC_AUC', 'Accuracy', 'Precision', 'Recall', 'F1'):
        entry[metric_name] = computed[metric_name]
    Scores[Comment] = entry

In [None]:
# Returns a dictionary mapping word -> embedding vector.
def load_embedding(fname):
    """Load a whitespace-separated word-embedding text file.

    Every usable line has the form ``<word> <v1> <v2> ...``.

    Parameters
    ----------
    fname : str or path-like
        Path of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each word to its vector as a 1-D ``float32`` NumPy array.

    Notes
    -----
    Lines that are blank, that carry no numbers after the word, or whose
    numbers cannot be parsed are skipped. They are never stored, so a
    malformed line cannot inherit the vector of the line before it.
    """
    embeddings_index = {}
    # The context manager closes the file even if reading fails part-way.
    with open(fname, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line, or a word with no vector components.
                continue
            try:
                embedding_vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric component: skip this word entirely.
                continue
            embeddings_index[values[0]] = embedding_vector

    return embeddings_index

In [None]:
# Colab only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the pre-processed news CSV on the mounted Drive.
NEWS_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisStockProject/dataset/pre_process_news_days.csv'

# The file is read with the ISO-8859-1 (Latin-1) codec.
df = pd.read_csv(NEWS_CSV_PATH, encoding="ISO-8859-1")

In [None]:
from ast import literal_eval

# 'news_stemmed' is read from the CSV as text; literal_eval turns each cell back
# into the Python literal it spells out (presumably a list of tokens — confirm
# against the pre-processing step).
# Cells that are no longer strings are left untouched, so re-running this cell
# after the column has already been parsed does not raise.
df['news_stemmed'] = df['news_stemmed'].apply(
    lambda x: literal_eval(x) if isinstance(x, str) else x
)

Split the data chronologically into train, validation, and test sets

In [None]:
# Chronological train / validation / test split on the 'Date' column.
# Both bounds of every window are inclusive.
# NOTE(review): the bounds are compared against 'Date' exactly as stored in df;
# this presumably relies on ISO 'YYYY-MM-DD' strings — confirm.
def _rows_between(first_day, last_day):
    """Return the rows of df whose 'Date' lies in [first_day, last_day]."""
    return df[(df['Date'] >= first_day) & (df['Date'] <= last_day)]

train_data = _rows_between('2008-08-08', '2013-12-31')
val_data = _rows_between('2014-01-02', '2014-12-31')
test_data = _rows_between('2015-01-02', '2016-07-01')

In [None]:
# Pull the 'news_stemmed' column out of each split as a NumPy array.
text_train, text_val, text_test = (
    split['news_stemmed'].values
    for split in (train_data, val_data, test_data)
)

In [None]:
# Width of each word vector; the embedding file loaded later must match it.
EMBEDDING_DIM = 50

# The vocabulary is built from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)

word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
# One extra slot on top of the vocabulary size (word_index ids start at 1).
MAX_WORDS = n_unique_tokens + 1
print('number of unique tokens are: ', n_unique_tokens)

number of unique tokens are:  23270


In [None]:
# Length every input sequence is padded / truncated to.
max_sequence=800
embeddings_index = load_embedding("/content/drive/MyDrive/Colab Notebooks/SentimentAnalysisStockProject/dataset/glove.6B.50d.txt")

# Start from uniform [0, 1) values so that words missing from the embedding file
# still get a vector. A dedicated seeded generator is used because no global seed
# has been set at this point; without it these rows differ on every run.
embedding_matrix = np.random.default_rng(80).random((MAX_WORDS, EMBEDDING_DIM))

# Overwrite the row of every vocabulary word that has a pre-trained vector.
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

RNN: train and evaluate a SimpleRNN classifier for each label column

In [None]:
# Model inputs for the Embedding layer must be word-id sequences.
# Previously pad_sequences() was applied to the binary bag-of-words matrix, which
# cut it down to 800 columns of 0/1 flags; the Embedding layer then only ever
# looked up ids 0 and 1. The padded inputs are now built from the id sequences.
# The *_bow_data matrices are still computed because they are notebook-level
# names, but they are no longer fed to the network.
train_sequences = tokenizer.texts_to_sequences(text_train)
train_bow_data = tokenizer.sequences_to_matrix(train_sequences, mode='binary')
X_train = pad_sequences(train_sequences, maxlen=max_sequence,padding='post')

val_sequences = tokenizer.texts_to_sequences(text_val)
val_bow_data = tokenizer.sequences_to_matrix(val_sequences, mode='binary')
X_val = pad_sequences(val_sequences, maxlen=max_sequence,padding='post')

test_sequences = tokenizer.texts_to_sequences(text_test)
test_bow_data = tokenizer.sequences_to_matrix(test_sequences, mode='binary')
X_test = pad_sequences(test_sequences, maxlen=max_sequence,padding='post')

In [None]:
# Train one SimpleRNN classifier per label column and collect its test metrics in `sc`.
sc={}
hidden_dims = 25
for lbl in ['Label','Label_1day','Label_2day', 'Label_3day', 'Label_4day', 'Label_5day']:
  Y_train = np.array(train_data[lbl])
  Y_val = np.array(val_data[lbl])
  Y_test = np.array(test_data[lbl])

  # Seed NumPy and TensorFlow: Keras weight initialisation, dropout and
  # shuffling draw from TensorFlow's generator, which np.random.seed alone
  # does not control.
  np.random.seed(80)
  tf.random.set_seed(80)

  model_RNN = Sequential()
  # The Embedding layer starts from the pre-built matrix and is fine-tuned during training.
  model_RNN.add(Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_sequence, trainable=True))
  model_RNN.add(SimpleRNN(hidden_dims, recurrent_dropout=0.1))
  model_RNN.add(Dropout(0.5))
  model_RNN.add(Dense(units=1, activation='sigmoid'))
  model_RNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # NOTE(review): with patience=4 and epochs=5 early stopping can only ever
  # skip the final epoch.
  es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4)
  hist_RNN = model_RNN.fit(X_train, Y_train,
                            validation_data=(X_val, Y_val),
                            epochs=5, batch_size=32, callbacks=[es])
  y_prob = model_RNN.predict(X_test)
  # predict() returns one column per output unit; flatten to 1-D so the metric
  # functions receive the same shape as Y_test.
  y_pred_RNN = y_prob.round().ravel()
  Evaluation ('BoW + RNN',lbl,Y_test, y_pred_RNN,sc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5079365079365079
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5079365079365079
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5502645502645502
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5317460317460317
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5343915343915344
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy= 0.5317460317460317


In [None]:
# One row per key of `sc`, best ROC_AUC first (ties broken by Accuracy).
df_sc_RNN = pd.DataFrame.from_dict(sc, orient='index')
df_sc_RNN = df_sc_RNN.sort_values(by=['ROC_AUC', 'Accuracy'], ascending=False)
df_sc_RNN

Unnamed: 0,Method,ROC_AUC,Accuracy,Precision,Recall,F1
Label_2day,BoW + RNN,0.502924,0.550265,1.0,0.005848,0.011628
Label_4day,BoW + RNN,0.5,0.534392,0.0,0.0,0.0
Label_3day,BoW + RNN,0.5,0.531746,0.0,0.0,0.0
Label_5day,BoW + RNN,0.5,0.531746,0.0,0.0,0.0
Label,BoW + RNN,0.5,0.507937,0.507937,1.0,0.673684
Label_1day,BoW + RNN,0.5,0.507937,0.0,0.0,0.0
