In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
import torch

# Data prep

In [3]:
df = pd.read_csv("data/stock_data.csv")
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42, shuffle=True)

stemmer = PorterStemmer()
stopword_list = stopwords.words('english')

def preprocess(text):
    
    # 1. Remove punctuations
    text = re.sub('-',' ', text)
    text = re.sub('[^A-Za-z\s\d]',' ', text)
    # 2. To lower
    text = text.lower()
    # 3. Stemming
    text = stemmer.stem(text)
    # 4. Remove stop words
    # 5. Return tokenized data
    return_seq = [x for x in text.split() if x not in stopword_list]

    if len(return_seq) < 50:
        return_seq.extend(['']* (50 - len(return_seq)))
    elif len(return_seq) > 50:
        return_seq = return_seq[:50]

    return return_seq

X_train_preprocessed = X_train.apply(preprocess)
X_test_preprocessed = X_test.apply(preprocess)

unique_words = []
for x in X_train_preprocessed:
    unique_words.extend(x)
unique_words = set(unique_words)

word_to_num = defaultdict(lambda:9999)
word_to_num.update(zip(unique_words, np.arange(len(unique_words))))
num_to_word = dict(zip(list(word_to_num.values()), list(word_to_num.keys())))

train_X = np.array([[word_to_num[word] for word in sent] for sent in X_train_preprocessed])
test_X = np.array([[word_to_num[word] for word in sent] for sent in X_test_preprocessed])

y_train = np.array(y_train).reshape(-1,1)
y_test = np.array(y_test).reshape(-1,1)

# PT data preparation

In [11]:
train_data = torch.utils.data.TensorDataset(torch.from_numpy(train_X), torch.from_numpy(y_train))
train_dataset = torch.utils.data.DataLoader(train_data, shuffle = True, batch_size=64)

# Neural network

## PT

In [96]:
class torch_model(torch.nn.Module):

    def __init__(self,
    embedding_in_size : int = len(unique_words),
    embedding_dim : int = 64,
    lstm_units : int = 50,
    fc1_units : int = 100,
    fc2_units : int = 50
    ):
        super(torch_model, self).__init__()
        
        self.embedding_in_size = embedding_in_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.fc1_units = fc1_units
        self.fc2_units = fc2_units

        self.embedding = torch.nn.Embedding(num_embeddings=self.embedding_in_size, embedding_dim=self.embedding_dim,)
        self.lstm = torch.nn.LSTM(batch_first = True, input_size = self.embedding_dim, num_layers=2, hidden_size = self.lstm_units, bidirectional = False,)
        self.fc1 = torch.nn.Linear(in_features = self.lstm_units, out_features = self.fc1_units)
        self.fc2 = torch.nn.Linear(in_features = self.fc1_units, out_features = self.fc2_units)
        self.output = torch.nn.Linear(in_features=self.fc2_units, out_features=1)

    def forward(self, x, hidden=None):

        x = self.embedding(x)
        
        x, hidden = self.lstm(x, hidden)
        x = x[:,-1]
        
        x = torch.nn.functional.leaky_relu(self.fc1(x))
        x = torch.nn.functional.leaky_relu(self.fc2(x))
        x = torch.sigmoid(self.output(x))

        return x, hidden

In [97]:
model = torch_model()

optim = torch.optim.Adam(lr = 0.0001, params=model.parameters())

epochs = 10

for epoch in np.arange(epochs):

    Loss=0

    for i,data in enumerate(train_dataset, 0):

        feats, target = data
        optim.zero_grad()

        y_p, hidden = model(feats)
        
        loss = torch.nn.functional.binary_cross_entropy(y_p.float(), target.float())

        loss.backward()
        optim.step()
        Loss += loss.item()

    print(f"Epoch: {epoch}, loss: {Loss}")

Epoch: 0, loss: 48.60568195581436
Epoch: 1, loss: 44.590212255716324
Epoch: 2, loss: 43.01429033279419
Epoch: 3, loss: 42.821065068244934
Epoch: 4, loss: 43.31420338153839
Epoch: 5, loss: 42.96724385023117
Epoch: 6, loss: 42.92213958501816
Epoch: 7, loss: 43.05576699972153
Epoch: 8, loss: 42.87947353720665
Epoch: 9, loss: 43.07330825924873
