In [None]:
import json
import nltk
import os
import random
import re
import torch
import numpy as np

from torch import nn, optim
import torch.nn.functional as F

In [None]:
# Load the labelled StockTwits messages. JSON text is UTF-8 and the messages
# contain non-ASCII characters, so the encoding is stated explicitly instead of
# relying on the platform default.
file_path = '/content/twits.json'
with open(file_path, 'r', encoding='utf-8') as f:
    twits = json.load(f)

# Show only the first few records rather than the whole dataset.
print(twits['data'][:10])


[{'message_body': '$FITB great buy at 26.00...ill wait', 'sentiment': 2, 'timestamp': '2018-07-01T00:00:09Z'}, {'message_body': '@StockTwits $MSFT', 'sentiment': 1, 'timestamp': '2018-07-01T00:00:42Z'}, {'message_body': '#STAAnalystAlert for $TDG : Jefferies Maintains with a rating of Hold setting target price at USD 350.00. Our own verdict is Buy  http://www.stocktargetadvisor.com/toprating', 'sentiment': 2, 'timestamp': '2018-07-01T00:01:24Z'}, {'message_body': '$AMD I heard there’s a guy who knows someone who thinks somebody knows something - on StockTwits.', 'sentiment': 1, 'timestamp': '2018-07-01T00:01:47Z'}, {'message_body': '$AMD reveal yourself!', 'sentiment': 0, 'timestamp': '2018-07-01T00:02:13Z'}, {'message_body': '$AAPL Why the drop? I warren Buffet taking out his position?', 'sentiment': 1, 'timestamp': '2018-07-01T00:03:10Z'}, {'message_body': '$BA bears have 1 reason on 06-29 to pay more attention https://dividendbot.com?s=BA', 'sentiment': -2, 'timestamp': '2018-07-01T

### Length of Data
Now let's look at the number of twits in the dataset. Print the number of twits below.

In [None]:

# Number of records under the 'data' key of the loaded JSON.
len(twits['data'])

1548010

### Split Message Body and Sentiment Score

In [None]:
# Pull the text out of every record; index i of `messages` and `sentiments`
# refer to the same twit.
messages = [twit['message_body'] for twit in twits['data']]
# Shift every raw sentiment score up by 2 (e.g. the -2 and 2 visible in the
# sample printed above become 0 and 4).
sentiments = [twit['sentiment'] + 2 for twit in twits['data']]

### Pre-Processing

In [None]:
# Download the WordNet corpus; preprocess() lemmatizes with nltk's WordNetLemmatizer.
nltk.download('wordnet')


def preprocess(message, lemmatizer=None):
    """
    Turn a raw twit into a list of lowercase, lemmatized word tokens.

    Steps: lowercase, strip URLs, strip ``$ticker`` and ``@user`` mentions,
    replace every remaining non-letter with a space, split on whitespace,
    drop single-character tokens and lemmatize what is left.

    Parameters
    ----------
    message : str
        The text to preprocess.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(token)``. When omitted, a new
        ``nltk.stem.WordNetLemmatizer`` is created; pass one in to reuse it
        across many calls.

    Returns
    -------
    tokens : list of str
        The preprocessed tokens (possibly empty).
    """
    text = message.lower()

    # Remove only the URL itself. The previous pattern used a greedy `.*`,
    # which also discarded every word that followed a link on the same line.
    text = re.sub(r'https?://\S*', ' ', text)

    # Drop ticker symbols ($aapl) and user mentions (@someone).
    text = re.sub(r'\$[a-zA-Z]*', ' ', text)
    text = re.sub(r'@[a-zA-Z]*', ' ', text)

    # Everything that is not a letter (digits, punctuation, ...) becomes a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    if lemmatizer is None:
        lemmatizer = nltk.stem.WordNetLemmatizer()

    # Single-character tokens are discarded before lemmatizing.
    return [lemmatizer.lemmatize(token) for token in text.split() if len(token) > 1]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Preprocess All the Twits


In [None]:
# One token list per message, in the same order as `messages`.
tokenized = [preprocess(message) for message in messages]

### Bag of Words


In [None]:
from collections import Counter

# Count how many times each token occurs across all tokenized messages.
bow = Counter(word for sentence in tokenized for word in sentence)

In [None]:
# Relative frequency of every word: its count divided by the total token count.
total_word_count = sum(bow.values())
freqs = {word: count/total_word_count for word, count in bow.items()}

# Words whose relative frequency is not above this value are dropped.
low_cutoff = 1e-5

# Number of most frequent words to drop.
high_cutoff = 10

K_most_common = bow.most_common(high_cutoff)

# `most_common` returns (word, count) pairs, so the old test
# `word not in K_most_common` compared a string against tuples, was always
# true, and never removed a frequent word. Compare against the words only.
most_common_words = {word for word, _ in K_most_common}

filtered_words = [word for word in freqs if (freqs[word] > low_cutoff and word not in most_common_words)]
print(K_most_common)
len(filtered_words)

[('the', 397917), ('to', 378596), ('is', 283874), ('for', 272841), ('on', 241389), ('of', 210919), ('and', 208194), ('in', 204412), ('this', 203027), ('it', 193166)]


4537

### Building the Vocabulary from the Filtered Words
Let's create three variables that will help with our vocabulary.

In [None]:
# Word -> integer id. Ids start at 1 because id 0 is the padding value: the
# dataloader fills batches with zeros, the embedding layer is created with
# padding_idx=0, and the models are sized with len(vocab)+1 rows.
vocab = {word: ii for ii, word in enumerate(filtered_words, 1)}

# Integer id -> word. `vocab.items()` yields (word, id) pairs, so they are
# unpacked in that order; the previous version unpacked them as (id, word)
# and rebuilt a word -> id mapping.
id2vocab = {ii: word for word, ii in vocab.items()}

# Every tokenized message with out-of-vocabulary words removed.
filtered = [[word for word in sentence if word in vocab] for sentence in tokenized]

In [None]:
# Down-sample label 2 (raw sentiment 0 after the +2 shift) and drop messages
# that have no in-vocabulary words left.
balanced = {'messages': [], 'sentiments':[]}

n_neutral = sum(1 for each in sentiments if each == 2)
N_examples = len(sentiments)
# Probability of keeping a label-2 twit: the average count of the other four
# labels divided by the label-2 count.
keep_prob = (N_examples - n_neutral)/4/n_neutral

for idx, sentiment in enumerate(sentiments):
    message = filtered[idx]
    # Skip messages left empty by the vocabulary filter.
    if len(message) == 0:
        continue
    # Always keep labels other than 2; keep label 2 with probability keep_prob.
    # random.random() is only drawn for label-2 twits (short-circuit `or`).
    elif sentiment != 2 or random.random() < keep_prob:
        balanced['messages'].append(message)
        balanced['sentiments'].append(sentiment)

In [None]:
# Share of label-2 examples in the balanced set (rebinds n_neutral / N_examples).
n_neutral = sum(1 for each in balanced['sentiments'] if each == 2)
N_examples = len(balanced['sentiments'])
n_neutral/N_examples

0.1941367125624467

In [None]:
# Replace every word of every balanced message with its vocabulary id.
token_ids = [[vocab[word] for word in message] for message in balanced['messages']]
# NOTE(review): this rebinds `sentiments` to the balanced labels. The balancing
# cell pairs `sentiments` with `filtered` by index, so it must not be re-run
# after this line without first recomputing `sentiments` from the raw data.
sentiments = balanced['sentiments']

## Neural Network


#### Embed -> RNN -> Dense -> Softmax


In [None]:
class TextClassifier(nn.Module):
    """Embedding -> LSTM -> Linear -> LogSoftmax classifier over token-id sequences."""

    def __init__(self, vocab_size, embed_size, lstm_size, output_size, lstm_layers=1, dropout=0.1):
        """
        Build the layers.

        Parameters
        ----------
        vocab_size : int
            Number of rows in the embedding table (row 0 is the padding row).
        embed_size : int
            Size of each embedding vector.
        lstm_size : int
            Size of the LSTM hidden state.
        output_size : int
            Number of output classes.
        lstm_layers : int
            Number of stacked LSTM layers.
        dropout : float
            Dropout probability applied between stacked LSTM layers.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.output_size = output_size
        self.lstm_layers = lstm_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)

        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass 0 in that case.
        lstm_dropout = dropout if lstm_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_size, lstm_size, lstm_layers, dropout=lstm_dropout, batch_first=False)
        self.dense = nn.Linear(in_features=lstm_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def init_hidden(self, batch_size):
        """
        Create a zeroed (hidden_state, cell_state) pair.

        Both tensors have shape (lstm_layers, batch_size, lstm_size) and are
        allocated from an existing parameter, so they share its dtype and device.
        """
        weight = next(self.parameters()).data

        hidden_state = weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_()
        cell_state = weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_()

        return (hidden_state, cell_state)

    def forward(self, nn_input, hidden_state):
        """
        Run a batch of token-id sequences through the network.

        Parameters
        ----------
        nn_input : torch.Tensor
            Token ids laid out as (sequence_length, batch_size); the LSTM is
            built with batch_first=False.
        hidden_state : tuple of torch.Tensor
            The (hidden_state, cell_state) pair to start from.

        Returns
        -------
        log_probs : torch.Tensor
            Log-probabilities of shape (batch_size, output_size).
        hidden : tuple of torch.Tensor
            The (hidden_state, cell_state) pair produced by the LSTM.
        """
        embeds = self.embedding(nn_input.long())
        lstm_out, hidden = self.lstm(embeds, hidden_state)

        # Classify from the output of the last time step only.
        last_step = lstm_out[-1, :, :]

        log_probs = self.softmax(self.dense(last_step))

        # Return the state produced by the LSTM. The previous version handed
        # back the state it had been given, so callers never saw an update.
        return log_probs, hidden

### View Model

In [None]:
# Check the model on random token ids laid out as (sequence_length, batch_size).
# The embedding gets len(vocab)+1 rows, matching the models built later.
model = TextClassifier(len(vocab)+1, 10, 6, 5, dropout=0.1, lstm_layers=2)
model.embedding.weight.data.uniform_(-1, 1)
# Named `sample_input` so the builtin `input` is not shadowed for later cells.
# Ids are drawn from the model's own range, so they are valid for any vocabulary size.
sample_input = torch.randint(0, model.vocab_size, (5, 4), dtype=torch.int64)
hidden = model.init_hidden(4)

# Call the module itself rather than .forward() so that module hooks run.
logps, _ = model(sample_input, hidden)
print(logps)

tensor([[-1.9205, -1.3536, -1.6702, -1.5217, -1.6681],
        [-1.9089, -1.3420, -1.6779, -1.5610, -1.6412],
        [-1.9037, -1.3519, -1.6702, -1.5456, -1.6562],
        [-1.9149, -1.3474, -1.6708, -1.5259, -1.6756]])


## Training


In [None]:
def dataloader(messages, labels, sequence_length=30, batch_size=32, shuffle=False):
    """
    Yield (batch, labels) tensor pairs built from tokenized messages.

    Each batch is an int64 tensor of shape (sequence_length, n_messages).
    A message shorter than `sequence_length` is left-padded with zeros; a
    longer one keeps only its first `sequence_length` ids.

    Parameters
    ----------
    messages : list of list of int
        Token ids, one inner list per message.
    labels : list of int
        One label per message.
    sequence_length : int
        Number of rows (time steps) in every batch.
    batch_size : int
        Maximum number of messages per batch; the last batch may be smaller.
    shuffle : bool
        When true, messages and labels are shuffled together first.
    """
    if shuffle:
        order = list(range(len(messages)))
        random.shuffle(order)
        messages = [messages[pos] for pos in order]
        labels = [labels[pos] for pos in order]

    for offset in range(0, len(messages), batch_size):
        chunk = messages[offset: offset+batch_size]
        n_columns = len(chunk)

        # One column per message, zero everywhere until filled below.
        padded = torch.zeros((sequence_length, n_columns), dtype=torch.int64)
        for column, tokens in enumerate(chunk):
            ids = torch.tensor(tokens)
            # Short messages start lower in the column, leaving zeros above them.
            first_row = max(sequence_length - len(ids), 0)
            padded[first_row:, column] = ids[:sequence_length]

        chunk_labels = torch.tensor(labels[offset: offset+n_columns])

        yield padded, chunk_labels

### Training and Validation
With our data in nice shape, we'll split it into training and validation sets.

In [None]:
def train_test_split(dataset, split=0.70):
    """
    Cut a sequence in two at a fixed position, without shuffling.

    Parameters
    ----------
    dataset : sequence
        Anything that supports len() and slicing.
    split : float
        Fraction of the items that go into the first part.

    Returns
    -------
    (head, tail) : the first int(len(dataset) * split) items and the rest.
    """
    cut = int(len(dataset) * split)
    head = dataset[:cut]
    tail = dataset[cut:]

    return head, tail

# Both lists are cut at the same position (default split=0.70), so features and
# labels stay aligned. The split is positional: nothing is shuffled here.
train_features, valid_features = train_test_split(token_ids)
train_labels, valid_labels = train_test_split(sentiments)

In [None]:
# Pull one batch and push it through an untrained model as a shape check.
text_batch, labels = next(iter(dataloader(train_features, train_labels, sequence_length=20, batch_size=64)))
model = TextClassifier(len(vocab)+1, 200, 128, 5, dropout=0.)
# Size the hidden state from the batch actually produced rather than a
# hard-coded 64, so this still works when fewer than 64 examples exist.
hidden = model.init_hidden(labels.shape[0])
# Call the module itself rather than .forward() so that module hooks run.
logps, hidden = model(text_batch, hidden)

### Training
It's time to train the neural network!

In [None]:
# Train on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextClassifier(len(vocab)+1, 1024, 512, 5, lstm_layers=2, dropout=0.2)
model.embedding.weight.data.uniform_(-1, 1)
# uniform_ also overwrote the padding row (padding_idx=0), which nn.Embedding
# initialises to zeros. Restore it so padded positions embed to the zero vector.
model.embedding.weight.data[model.embedding.padding_idx].zero_()
model.to(device)

TextClassifier(
  (embedding): Embedding(4538, 1024, padding_idx=0)
  (lstm): LSTM(1024, 512, num_layers=2, dropout=0.2)
  (dense): Linear(in_features=512, out_features=5, bias=True)
  (softmax): LogSoftmax()
)

In [None]:
# Training hyper-parameters.
epochs = 5
batch_size = 512
learning_rate = 0.001

# Run a validation pass every `print_every` training steps.
print_every = 100
# Maximum gradient norm passed to clip_grad_norm_.
clip=5
# The model outputs log-probabilities (LogSoftmax), hence NLLLoss.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model.train()

for epoch in range(epochs):
    print('Starting epoch {}'.format(epoch + 1))

    steps = 0
    for text_batch, labels in dataloader(
            train_features, train_labels, batch_size=batch_size, sequence_length=20, shuffle=True):
        steps += 1

        # init_hidden allocates from the model's own parameters, so the state is
        # already on the model's device. The old `each.to(device)` loop discarded
        # its result (Tensor.to is not in-place) and has been removed.
        hidden = model.init_hidden(labels.shape[0])

        text_batch, labels = text_batch.to(device), labels.to(device)

        model.zero_grad()

        log_probs, hidden = model(text_batch, hidden)

        loss = criterion(log_probs, labels)
        loss.backward()

        # Clip the gradient norm before the optimizer step.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if steps % print_every == 0:
            model.eval()

            val_losses = []

            # No gradients are needed for validation; without no_grad every
            # batch would build and keep an autograd graph.
            with torch.no_grad():
                # Validation uses its own variable names so the training batch
                # is not overwritten, and is not shuffled.
                for val_batch, val_labels in dataloader(
                        valid_features, valid_labels, batch_size=batch_size, sequence_length=20):

                    val_hidden = model.init_hidden(val_labels.shape[0])

                    val_batch, val_labels = val_batch.to(device), val_labels.to(device)

                    val_log_probs, val_hidden = model(val_batch, val_hidden)
                    val_loss = criterion(val_log_probs, val_labels)

                    val_losses.append(val_loss.item())

            model.train()
            print("Epoch: {}/{}...".format(epoch+1, epochs),
                  "Step: {}...".format(steps),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

Starting epoch 1
Epoch: 1/5... Step: 100... Loss: 0.963563... Val Loss: 1.007029
Epoch: 1/5... Step: 200... Loss: 0.836163... Val Loss: 0.907048
Epoch: 1/5... Step: 300... Loss: 0.902836... Val Loss: 0.850816
Epoch: 1/5... Step: 400... Loss: 0.789343... Val Loss: 0.809933
Epoch: 1/5... Step: 500... Loss: 0.762552... Val Loss: 0.786965
Epoch: 1/5... Step: 600... Loss: 0.687571... Val Loss: 0.769912
Epoch: 1/5... Step: 700... Loss: 0.698573... Val Loss: 0.761188
Epoch: 1/5... Step: 800... Loss: 0.816043... Val Loss: 0.743132
Epoch: 1/5... Step: 900... Loss: 0.745562... Val Loss: 0.740738
Epoch: 1/5... Step: 1000... Loss: 0.674274... Val Loss: 0.735963
Epoch: 1/5... Step: 1100... Loss: 0.715941... Val Loss: 0.728147
Epoch: 1/5... Step: 1200... Loss: 0.694073... Val Loss: 0.721874
Epoch: 1/5... Step: 1300... Loss: 0.666030... Val Loss: 0.718588
Epoch: 1/5... Step: 1400... Loss: 0.621976... Val Loss: 0.711632
Starting epoch 2
Epoch: 2/5... Step: 100... Loss: 0.634024... Val Loss: 0.717977
E

## Making Predictions


In [None]:
def predict(text, model, vocab):
    """
    Score a single message with the trained model.

    Parameters
    ----------
    text : str
        The raw message to score.
    model : nn.Module
        The classifier; it is switched to eval mode and left that way.
    vocab : dict
        Word -> integer id mapping; words not in it are ignored.

    Returns
    -------
    pred : torch.Tensor
        Class probabilities of shape (1, n_classes), computed as exp of the
        model's log-probabilities.
    """
    model.eval()

    tokens = preprocess(text)

    # Keep in-vocabulary words only and map them to their ids.
    token_ids = [vocab[token] for token in tokens if token in vocab]

    # A message with no known words would hand the LSTM a zero-length sequence
    # and fail. Feed a single padding id (0) instead.
    if not token_ids:
        token_ids = [0]

    # Build the (sequence_length, 1) input directly as int64 on the model's
    # device; the old float -> numpy -> tensor round-trip always produced a
    # CPU tensor, which fails once the model lives on the GPU.
    device = next(model.parameters()).device
    text_input = torch.tensor(token_ids, dtype=torch.int64, device=device).view(-1, 1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden = model.init_hidden(1)
        logps, _ = model(text_input, hidden)
        pred = torch.exp(logps)

    return pred

In [None]:
# Load the unlabelled test twits. The encoding is stated explicitly so the
# read does not depend on the platform default.
with open(os.path.join('data', 'project_6_stocktwits', 'test_twits.json'), 'r', encoding='utf-8') as f:
    test_data = json.load(f)

### Twit Stream

In [None]:
def twit_stream():
    """Yield the records under test_data['data'] one at a time, in order."""
    yield from test_data['data']

# Show the first record of a fresh stream.
next(twit_stream())

{'message_body': '$JWN has moved -1.69% on 10-31. Check out the movement and peers at  https://dividendbot.com?s=JWN',
 'timestamp': '2018-11-01T00:00:05Z'}

Using the `predict` function, let's apply it to a stream of twits.

In [None]:
def score_twits(stream, model, vocab, universe):
    """
    Score every twit in a stream that mentions a symbol from `universe`.

    Parameters
    ----------
    stream : iterable of dict
        Records with 'message_body' and 'timestamp' keys.
    model : nn.Module
        The classifier passed on to predict().
    vocab : dict
        Word -> integer id mapping passed on to predict().
    universe : container of str
        Symbols of interest, written with the leading '$'.

    Yields
    ------
    dict
        One {'symbol', 'score', 'timestamp'} record per matching symbol
        occurrence; a twit mentioning several symbols yields several records
        that share one score.
    """
    for twit in stream:

        text = twit['message_body']
        # Raw string avoids the invalid '\$' escape. The negative lookahead
        # stops a longer ticker from matching by prefix (e.g. '$GOOGL' being
        # reported as '$GOOG').
        candidates = re.findall(r'\$[A-Z]{2,4}(?![A-Z])', text)
        symbols = [symbol for symbol in candidates if symbol in universe]

        # Skip the model call for twits that mention nothing of interest.
        if not symbols:
            continue

        score = predict(text, model, vocab)

        for symbol in symbols:
            yield {'symbol': symbol, 'score': score, 'timestamp': twit['timestamp']}

In [None]:
# Only symbols in this set produce a score record.
universe = {'$BBRY', '$AAPL', '$AMZN', '$BABA', '$YHOO', '$LQMT', '$FB', '$GOOG', '$BBBY', '$JNUG', '$SBUX', '$MU'}
# score_twits is a generator, so nothing is scored until next() asks for a result.
score_stream = score_twits(twit_stream(), model, vocab, universe)

next(score_stream)

{'symbol': '$AAPL',
 'score': tensor([[ 0.1018,  0.0793,  0.1335,  0.1847,  0.5007]]),
 'timestamp': '2018-11-01T00:00:18Z'}