In [2]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# numericalization
from collections import Counter

# modeling
from sklearn.model_selection import train_test_split
# import torch
# from torch.utils.data import DataLoader, TensorDataset

pd.set_option('display.max_columns', 500)
title_fontsize = 15

ModuleNotFoundError: No module named 'torch'

In [3]:
tweets = pd.read_csv('./data/tweets_cleaned_tokenized_words.csv', 
                     parse_dates=['Date'], index_col=['Unnamed: 0'])
# tweets.drop(['Content'], axis=1, inplace = True)
stocks = pd.read_csv('./data/scraped_stock_2015_2022.csv')

tweets.head()

Unnamed: 0,Date,Content,ticker,score,label,Lemmatized_tokens
0,2015-01-01 00:00:57,lx21 made 10008 aapl check out learn howtotrad...,AAPL,0.0,neutral,"['lx21', 'made', '10008', 'aapl', 'check', 'ou..."
1,2015-01-01 00:01:36,insanity today weirdo massive selling aapl bid...,AAPL,-0.8271,negative,"['insanity', 'today', 'weirdo', 'massive', 'se..."
2,2015-01-01 00:01:50,sp100 stocks performance hd low sbux tgt dvn i...,AMZN,-0.4278,neutral,"['sp100', 'stock', 'performance', 'hd', 'low',..."
3,2015-01-01 00:06:47,gm tsla volkswagen pushes 2014 record recall t...,TSLA,0.0,neutral,"['gm', 'tsla', 'volkswagen', 'push', '2014', '..."
4,2015-01-01 00:10:05,swing trading 891 return 14 days swingtrading ...,AAPL,0.0,neutral,"['swing', 'trading', '891', 'return', '14', 'd..."


### Numeric Tokenization

In [4]:
# create a list of all tweets in column
all_tweets = tweets['Content'].tolist()

# join the tweets togaether to make one big blob
text = ' '.join(all_tweets)

# finding the most frequent words
words = text.split()
word_counts = Counter(words)

# sort the words so the most frequent are on top
total_words = len(words)
sorted_words = word_counts.most_common(total_words)

In [5]:
# create an int-mapping dictionary so most common words will have lower indexes
vocab_to_int = {w:i+1 for i, (w, c) in enumerate(sorted_words)}

In [6]:
len(vocab_to_int)

1067899

In [7]:
tweets

Unnamed: 0,Date,Content,ticker,score,label,Lemmatized_tokens
3923954,2022-09-29 22:40:47,group lawmakers led sen elizabeth warren dmass...,PG,-0.0772,neutral,"['group', 'lawmaker', 'led', 'sen', 'elizabeth..."
3923955,2022-09-29 22:23:54,nio im money mean bad investment whole market ...,PG,0.2500,neutral,"['nio', 'im', 'money', 'mean', 'bad', 'investm..."
3923956,2022-09-29 18:34:51,today’s drop spx perfect example happens aapl ...,PG,-0.6197,neutral,"['today’s', 'drop', 'spx', 'perfect', 'example..."
3923957,2022-09-29 13:38:47,wage inflation ⬆️ profit margin ⬇️ amzn,PG,0.4404,positive,"['wage', 'inflation', '⬆️', 'profit', 'margin'..."
3923958,2022-09-29 13:04:39,amazon loan 150m small businesses next 3 years...,PG,0.3470,positive,"['amazon', 'loan', '150m', 'small', 'business'..."
...,...,...,...,...,...,...
3988231,2016-07-16 13:15:41,reanalysis study square pie charts 2009 – eage...,PG,0.0000,neutral,"['reanalysis', 'study', 'square', 'pie', 'char..."
3988232,2016-07-16 13:05:27,square pie chart beats rest perception study ...,PG,0.0000,neutral,"['square', 'pie', 'chart', 'beat', 'rest', 'pe..."
3988233,2016-07-16 02:35:31,hard ignore mcdonalds obese futurity,PG,-0.4404,neutral,"['hard', 'ignore', 'mcdonalds', 'obese', 'futu..."
3988234,2016-07-15 23:57:39,tamir rice story make police shooting disappea...,PG,-0.2263,neutral,"['tamir', 'rice', 'story', 'make', 'police', '..."


In [8]:
len(all_tweets)

18789

In [9]:
# numericalize the words so they are defined by integers rather than strings
tweets_int = []
for word in all_tweets:
    r = [vocab_to_int[w] for w in word.split()]
    tweets_int.append(r)
print(tweets_int[0:3])

[[581, 7737, 1027, 6293, 3121, 4680, 15135, 1172, 1817, 289, 7738, 4140, 984, 3, 6294, 3747, 4681, 256, 446, 1070, 38, 1818], [2320, 187, 121, 426, 95, 2034, 675, 67, 213, 4682, 105, 302, 455, 20, 302, 187, 3416, 219, 2672, 114, 63, 18, 3, 22, 52, 2320], [2499, 868, 1126, 619, 1571, 447, 18, 3, 397, 492, 464, 1212, 985, 20, 464, 503, 3748, 1572, 47, 1650, 1213, 67, 2673, 464]]


In [10]:
# numericalize the labels so that we can define them as positive, neutral, or negative

encoded_labels = [1 if label =='positive' else 0 if label == 'neutral' else -1 for label in tweets['label']]
encoded_labels = np.array(encoded_labels)

tweets['label'] = encoded_labels

In [11]:
def pad_features(tweets_int, seq_length):
    ''' Return features of tweets_ints, where each tweet is 
    padded with 0's or truncated to the input seq_length.
    '''
    features = np.zeros((len(tweets_int), seq_length), dtype = int)
    
    for i, twt in enumerate(tweets_int):
        tweets_len = len(twt)
        
        if tweets_len <= seq_length:
            zeroes = list(np.zeros(seq_length-tweets_len))
            new = zeroes+twt
        elif tweets_len > seq_length:
            new = twt[0:seq_length]
        
        features[i,:] = np.array(new)
    
    return features

In [12]:
tweets

Unnamed: 0,Date,Content,ticker,score,label,Lemmatized_tokens
3923954,2022-09-29 22:40:47,group lawmakers led sen elizabeth warren dmass...,PG,-0.0772,0,"['group', 'lawmaker', 'led', 'sen', 'elizabeth..."
3923955,2022-09-29 22:23:54,nio im money mean bad investment whole market ...,PG,0.2500,0,"['nio', 'im', 'money', 'mean', 'bad', 'investm..."
3923956,2022-09-29 18:34:51,today’s drop spx perfect example happens aapl ...,PG,-0.6197,0,"['today’s', 'drop', 'spx', 'perfect', 'example..."
3923957,2022-09-29 13:38:47,wage inflation ⬆️ profit margin ⬇️ amzn,PG,0.4404,1,"['wage', 'inflation', '⬆️', 'profit', 'margin'..."
3923958,2022-09-29 13:04:39,amazon loan 150m small businesses next 3 years...,PG,0.3470,1,"['amazon', 'loan', '150m', 'small', 'business'..."
...,...,...,...,...,...,...
3988231,2016-07-16 13:15:41,reanalysis study square pie charts 2009 – eage...,PG,0.0000,0,"['reanalysis', 'study', 'square', 'pie', 'char..."
3988232,2016-07-16 13:05:27,square pie chart beats rest perception study ...,PG,0.0000,0,"['square', 'pie', 'chart', 'beat', 'rest', 'pe..."
3988233,2016-07-16 02:35:31,hard ignore mcdonalds obese futurity,PG,-0.4404,0,"['hard', 'ignore', 'mcdonalds', 'obese', 'futu..."
3988234,2016-07-15 23:57:39,tamir rice story make police shooting disappea...,PG,-0.2263,0,"['tamir', 'rice', 'story', 'make', 'police', '..."


In [15]:
tweets_len = [len(x) for x in tweets_int]

X = pd.DataFrame(pad_features(tweets_int, max(tweets_len)))
y = pd.DataFrame(encoded_labels)

In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

In [63]:
# # X.to_csv('./data/tweets_numericalized.csv')
# # Y.to_csv('./data/sentiments_numericalized.csv')

# X = pd.read_csv('./data/tweets_numericalized.csv', index_col = ['Unnamed: 0'])
# y = pd.read_csv('./data/sentiments_numericalized.csv', index_col = ['Unnamed: 0'])

In [16]:
# splitting into test-train-validation sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [17]:
# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(X_train.values), torch.from_numpy(y_train.values))
valid_data = TensorDataset(torch.from_numpy(X_valid.values), torch.from_numpy(y_valid.values))
test_data = TensorDataset(torch.from_numpy(X_test.values), torch.from_numpy(y_test.values))


# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)

In [18]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y.squeeze())

Sample input size:  torch.Size([50, 26])
Sample input: 
 tensor([[    0,     0,     0,  ...,   109, 15677, 15678],
        [    0,     0,     0,  ...,   217,  1307,   332],
        [    0,     0,     0,  ...,    19,   975,  2630],
        ...,
        [    0,     0,     0,  ...,   265,  5436,     3],
        [    0,     0,     0,  ...,     1,     2,     6],
        [    0,     0,     0,  ...,     0,  1893,  3596]])

Sample label size:  torch.Size([50, 1])
Sample label: 
 tensor([ 1,  0,  1,  0,  1,  0,  1,  0,  0,  0,  1,  0,  1,  1,  1,  0,  0,  0,
         0,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0, -1,  1,  0,  1,  1,  0,  0,
         0,  1,  1,  0,  0,  0,  1,  1,  0, -1,  0,  0,  1,  0])


In [20]:
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super().__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [21]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2
net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentLSTM(
  (embedding): Embedding(39291, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [23]:
train_on_gpu = False

# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


# training params

epochs = 4 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 10
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        labels = labels.squeeze()
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        inputs = inputs.type(torch.LongTensor)
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        output = output.squeeze()
        loss = criterion(output, labels.float())
        loss.backward()
        
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:
                labels = labels.squeeze()

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                inputs = inputs.type(torch.LongTensor)
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print(f"Epoch: {e+1}/{epochs}...",
                  f"Step: {counter}...",
                  f"Loss: {loss.item():.6f}...",
                  f"Val Loss: {np.mean(val_losses):.6f}")

Epoch: 1/4... Step: 10... Loss: 0.279779... Val Loss: 0.478185
Epoch: 1/4... Step: 20... Loss: 0.568976... Val Loss: 0.464345
Epoch: 1/4... Step: 30... Loss: 0.418834... Val Loss: 0.446171
Epoch: 1/4... Step: 40... Loss: 0.489964... Val Loss: 0.433322
Epoch: 1/4... Step: 50... Loss: 0.389284... Val Loss: 0.417790
Epoch: 1/4... Step: 60... Loss: 0.261298... Val Loss: 0.413393
Epoch: 1/4... Step: 70... Loss: 0.223266... Val Loss: 0.407062
Epoch: 1/4... Step: 80... Loss: 0.586663... Val Loss: 0.416382
Epoch: 1/4... Step: 90... Loss: 0.284597... Val Loss: 0.395099
Epoch: 1/4... Step: 100... Loss: 0.445391... Val Loss: 0.371032
Epoch: 1/4... Step: 110... Loss: 0.503528... Val Loss: 0.380906
Epoch: 1/4... Step: 120... Loss: 0.124991... Val Loss: 0.437549
Epoch: 1/4... Step: 130... Loss: 0.184448... Val Loss: 0.394775
Epoch: 1/4... Step: 140... Loss: 0.493796... Val Loss: 0.336545
Epoch: 1/4... Step: 150... Loss: 0.320937... Val Loss: 0.331020
Epoch: 1/4... Step: 160... Loss: 0.278308... Val 

In [25]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:
    
    labels = labels.squeeze()

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    inputs = inputs.type(torch.LongTensor)
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print(f"Test loss: {np.mean(test_losses):.3f}")

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print(f"Test accuracy: {test_acc:.3f}")

Test loss: 0.003
Test accuracy: 0.770


In [30]:
test_losses

[0.265084832906723,
 -0.7018386721611023,
 0.6793506145477295,
 -0.8892024159431458,
 0.4299418330192566,
 0.15710344910621643,
 1.121410846710205,
 0.02134595438838005,
 0.45937344431877136,
 -0.2865404188632965,
 0.7863173484802246,
 -0.49968934059143066,
 0.4674068093299866,
 -0.082013338804245,
 -0.10906907916069031,
 -0.011358356103301048,
 -0.18553072214126587,
 -1.7107675075531006,
 -0.3410903811454773,
 0.0731324851512909,
 0.617652952671051,
 0.33814966678619385,
 0.6645143032073975,
 0.7397577166557312,
 -0.8421439528465271,
 -0.030098600313067436,
 0.30411165952682495,
 -0.06867992877960205,
 0.2350694239139557,
 0.12331392616033554,
 0.22327089309692383,
 -0.6173611879348755,
 0.14676427841186523,
 -0.20221295952796936,
 0.008613292127847672,
 0.18475717306137085,
 -0.34524184465408325,
 0.3647984266281128,
 -0.9183323383331299,
 -0.3468364477157593,
 -0.41568198800086975,
 0.26225724816322327,
 -0.23032374680042267,
 0.026247955858707428,
 -0.42285779118537903,
 0.73886507

In [120]:
test_df = pd.DataFrame(vocab_to_int.values(), index=vocab_to_int.keys())

test_df.to_csv('./data/vocab_words.csv')

In [8]:
from string import punctuation

def tokenize_review(test_review):
    
    test_review = test_review.lower() # lowercase
    
    # get rid of punctuation
    test_text = ''.join([c for c in test_review if c not in punctuation])

    # splitting by spaces
    test_words = test_text.split()
    

    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])

    return test_ints

# test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)
print(test_ints)


# test sequence padding
seq_length=200
features = pad_features(test_ints, seq_length)

print(features)


# test conversion to tensor and pass into your model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())


def predict(net, test_review, sequence_length=26):
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # pad tokenized sequence
    seq_length=sequence_length
    features = pad_features(test_ints, seq_length)
    
    # convert to tensor to pass into your model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    if(train_on_gpu):
        feature_tensor = feature_tensor.cuda()
    
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze()) 
    # printing output value, before rounding
    print(f'Prediction value, pre-rounding: {output.item():.6f}')
    
    # print custom response
    if(pred.item()==1):
        print("Positive review detected!")
    else:
        print("Negative review detected.")

NameError: name 'test_review_neg' is not defined

In [7]:
tweet = 'no'
predict(net, tweet, 26)

NameError: name 'predict' is not defined

In [76]:
tweets[tweets['label'] == 1]['Content'].iloc[4]

'top tech stocks highest share buybacks past quarter aapl googl msft meta chtr nvda wmt amzn klac v csco'