In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
import tensorflow as tf

# Data prep

In [12]:
df = pd.read_csv("../0.Data/stock_data.csv")
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42, shuffle=True)

stemmer = PorterStemmer()
stopword_list = stopwords.words('english')

def preprocess(text):
    
    # 1. Remove punctuations
    text = re.sub('-',' ', text)
    text = re.sub('[^A-Za-z\s\d]',' ', text)
    # 2. To lower
    text = text.lower()
    # 3. Stemming
    text = stemmer.stem(text)
    # 4. Remove stop words
    # 5. Return tokenized data
    return_seq = [x for x in text.split() if x not in stopword_list]

    if len(return_seq) <= 50:
        return_seq.extend(['']* (50 - len(return_seq)))
    elif len(return_seq) > 50:
        return_seq = return_seq[:50]

    return return_seq

X_train_preprocessed = X_train.apply(preprocess)
X_test_preprocessed = X_test.apply(preprocess)

unique_words = []
for x in X_train_preprocessed:
    unique_words.extend(x)
unique_words = set(unique_words)

word_to_num = defaultdict(lambda:9999)
word_to_num.update(zip(unique_words, np.arange(len(unique_words))))
num_to_word = dict(zip(list(word_to_num.values()), list(word_to_num.keys())))

train_X = np.array([[word_to_num[word] for word in sent] for sent in X_train_preprocessed])
test_X = np.array([[word_to_num[word] for word in sent] for sent in X_test_preprocessed])

y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

## TF data preparation

In [13]:
train_X_tensor = tf.data.Dataset.from_tensor_slices(train_X)
train_y_tensor = tf.data.Dataset.from_tensor_slices(y_train)
train_dataset = tf.data.Dataset.zip((train_X_tensor, train_y_tensor))
train_dataset = train_dataset.shuffle(buffer_size=12)
train_dataset = train_dataset.batch(batch_size=64)

test_X_tensor = tf.data.Dataset.from_tensor_slices(test_X)
test_y_tensor = tf.data.Dataset.from_tensor_slices(y_test)
test_dataset = tf.data.Dataset.zip((test_X_tensor, test_y_tensor))
test_dataset = test_dataset.batch(batch_size=64)

# Neural network

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim = len(unique_words)+1, output_dim = 50, name='embedding_layer'),
    tf.keras.layers.LSTM(units=40, return_sequences= False, name='lstm_layer'),
    tf.keras.layers.Dense(units=50, activation = 'leaky_relu', name='dense_layer'),
    tf.keras.layers.Dense(units=1, activation = 'sigmoid', name='output_layer'),
])

In [15]:
model.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0001),
metrics=[tf.keras.metrics.Accuracy()],
loss=tf.keras.losses.BinaryCrossentropy(),)

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_layer (Embedding)  (None, None, 50)         432500    
                                                                 
 lstm_layer (LSTM)           (None, 40)                14560     
                                                                 
 dense_layer (Dense)         (None, 50)                2050      
                                                                 
 output_layer (Dense)        (None, 1)                 51        
                                                                 
Total params: 449,161
Trainable params: 449,161
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.fit(train_dataset, epochs=10,
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(monitor='val_loss', save_best_only=True, filepath = 'model_checkpoints/sentiment_analysis/'),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=5)],
    validation_data = test_dataset)

Epoch 1/10


2022-08-29 11:40:34.268020: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-08-29 11:40:34.624993: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-08-29 11:40:34.819167: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-08-29 11:40:38.667053: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-08-29 11:40:38.799596: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 2/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 3/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 4/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 5/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 6/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 7/10
Epoch 8/10



INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


INFO:tensorflow:Assets written to: model_checkpoints/sentiment_analysis/assets


Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2bf4d2400>