# Example notebook for batch processing news sentiments

In [3]:
# (Re)load the autoreload extension so edited modules are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell execution.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
import json
import nltk
import os
import random
import re
import torch
import dill
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
from torch import nn, optim
import torch.nn.functional as F
import operator

In [5]:
# Load the pickled ticker list. The context manager closes the file even if
# unpickling fails.
# NOTE: dill/pickle can execute arbitrary code while loading -- only open files
# from a trusted source.
with open('./data/tickerList.pckl', 'rb') as f:
    tickerList = dill.load(f)

# News table that is scored further down in the notebook.
news = pd.read_csv('./data/news.csv')

In [10]:
class TextClassifier(nn.Module):
    """Embedding -> LSTM -> dropout -> linear -> log-softmax text classifier.

    The LSTM is built with ``batch_first=False``, so inputs are laid out as
    ``(sequence_length, batch_size)`` and only the output of the last time
    step is classified.
    """

    def __init__(self, vocab_size, embed_size, lstm_size, output_size, lstm_layers=1, dropout=0.1):
        """Set up the layers.

        Parameters
        ----------
        vocab_size : number of rows in the embedding table.
        embed_size : size of each embedding vector.
        lstm_size : LSTM hidden size.
        output_size : number of output classes.
        lstm_layers : number of stacked LSTM layers.
        dropout : dropout probability.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.output_size = output_size
        self.lstm_layers = lstm_layers
        # The probability is stored under its own name; ``self.dropout`` is the
        # nn.Dropout module created below (the original assigned the float to
        # ``self.dropout`` and then immediately overwrote it).
        self.dropout_prob = dropout

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero value is passed with a single layer, so pass it only when it
        # can take effect.
        self.lstm = nn.LSTM(embed_size, lstm_size, lstm_layers,
                            dropout=dropout if lstm_layers > 1 else 0.0,
                            batch_first=False)

        # Dropout applied to the last LSTM output before the classifier head
        self.dropout = nn.Dropout(dropout)

        # Linear head followed by log-softmax over the class dimension.
        # dim=1 is what the deprecated implicit choice (dim=None) resolved to
        # for the 2-D (batch_size, output_size) tensor produced in forward().
        self.fc = nn.Linear(lstm_size, output_size)
        self.lsof = nn.LogSoftmax(dim=1)

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(hidden_state, cell_state)`` for the LSTM.

        Both tensors have shape ``(lstm_layers, batch_size, lstm_size)`` and
        take dtype and device from the model's first parameter.
        """
        weight = next(self.parameters()).data
        shape = (self.lstm_layers, batch_size, self.lstm_size)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

    def forward(self, nn_input, hidden_state):
        """Run a forward pass.

        Parameters
        ----------
        nn_input : tensor of token ids laid out as (sequence_length, batch_size);
            it is cast to long before the embedding lookup.
        hidden_state : LSTM ``(hidden, cell)`` tuple, e.g. from ``init_hidden``.

        Returns
        -------
        (logps, hidden) : log-probabilities of shape (batch_size, output_size)
            and the new LSTM hidden state.
        """
        embeds = self.embedding(nn_input.long())
        lstm_out, hidden = self.lstm(embeds, hidden_state)

        # Keep only the output of the final time step
        last_step = lstm_out[-1, :, :]

        out = self.fc(self.dropout(last_step))
        logps = self.lsof(out)

        return logps, hidden

In [11]:
# Load the saved model object. It is used below as a module (model.eval(),
# model.init_hidden), i.e. it is a fully pickled TextClassifier, so the class
# definition above must have been executed before this cell.
# map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only
# machine; inference below runs on the CPU anyway.
# NOTE: unpickling can execute arbitrary code -- only load trusted checkpoints.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules; pass weights_only=False there if needed.
model = torch.load('./data/model1.pt', map_location='cpu')

In [12]:
# Read the labelled StockTwits messages used to rebuild the vocabulary.
with open('./data/twits.json', 'r') as f:
    twits = json.load(f)

In [13]:
# Collect the message texts and their labels in one pass.
# Each sentiment label is shifted by +2 before being stored.
messages, sentiments = [], []
for twit in twits['data']:
    messages.append(twit['message_body'])
    sentiments.append(twit['sentiment'] + 2)

In [14]:
def preprocess(message):
    """Turn a raw message into a list of lemmatized tokens.

    Steps, in order:
      1. lowercase the message;
      2. replace each URL with a space -- the pattern consumes everything from
         ``http://`` / ``https://`` to the end of that line, plus trailing
         line breaks;
      3. replace ``$``-prefixed ticker symbols with a space;
      4. replace ``@``-prefixed usernames with a space;
      5. replace every run of non-alphanumeric characters (including ``_``)
         with a space -- digits are kept;
      6. split on whitespace, drop one-character tokens, and lemmatize the
         rest with nltk's WordNetLemmatizer.

    The vocabulary ids built later in the notebook depend on the exact tokens
    produced here, so the patterns must stay as they are.
    """
    # Applied sequentially; the order matters (URLs must go before punctuation).
    cleanup_patterns = (
        r'https?:\/\/.*[\r\n]*',
        r'[$][A-Za-z][\S]*',
        r'[@][A-Za-z][\S]*',
        r'[\W_]+',
    )

    cleaned = message.lower()
    for pattern in cleanup_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in cleaned.split() if len(word) > 1]

In [15]:
# Token list for every message, in the same order as `messages`.
tokenized = list(map(preprocess, messages))

In [16]:
# Bag of words: join every token of every message into one string, then count
# the whitespace-separated words. Insertion order (first occurrence in the
# corpus) is preserved because later cells derive word ids from it.
all_text = ' '.join(' '.join(message_tokens) for message_tokens in tokenized)
counts = Counter()
counts.update(all_text.split())
bow = {word: occurrences for word, occurrences in counts.items()}

In [17]:
# Dictionary holding a frequency score for every word appearing in the messages.
# The key is the token and the value is its corpus count divided by `total_count`.
# NOTE(review): `total_count` is len(bow), i.e. the number of distinct words rather
# than the total number of tokens, so these values are not true relative frequencies.
# Changing it would alter `filtered_words` and hence the word ids derived from it;
# presumably the saved model was trained with exactly this filtering -- confirm
# before editing.
total_count = len(bow)
freqs = {word: count/total_count for word, count in bow.items()}

# Float that is the frequency cutoff. Only words whose score is strictly greater than this are kept.
low_cutoff = 1e-5

# Integer that is the cut off for most common words. The `high_cutoff` most common words are dropped.
high_cutoff = 10

# The k most common words in the corpus, with `high_cutoff` as the k.
# sorted() is ascending, so the last `high_cutoff` items are the highest-scoring ones.
K_most_common = sorted(freqs.items(), key=operator.itemgetter(1))[-high_cutoff:]
K_most_common = [x[0] for x in K_most_common]

# Keep words above the low cutoff that are not among the most common ones (order of `freqs` is preserved).
filtered_words = [word for word in freqs if (freqs[word] > low_cutoff and word not in K_most_common)]

In [18]:
# Word -> integer id lookup for `filtered_words`; ids start at 1 and follow the list order.
vocab = dict(zip(filtered_words, range(1, len(filtered_words) + 1)))
# Inverse lookup: integer id -> word.
id2vocab = {word_id: word for word, word_id in vocab.items()}
# The tokenized messages with every out-of-vocabulary word dropped.
filtered = [[token for token in tokens if token in vocab] for tokens in tokenized]

In [19]:
def predict(text, model, vocab):
    """Predict class probabilities for a single piece of text.

    Parameters
    ----------
    text : raw text; it is run through ``preprocess`` and words missing from
        ``vocab`` are dropped.
    model : model exposing ``init_hidden(batch_size)`` and a forward pass that
        returns ``(log_probabilities, hidden)``. Put it in eval mode first so
        dropout is disabled.
    vocab : mapping from word to integer id.

    Returns
    -------
    Tensor of shape (1, n_classes) with the probability of each class. It is
    computed under ``torch.no_grad()`` and therefore carries no autograd graph.

    Raises
    ------
    ValueError
        If no token of ``text`` is in ``vocab``. The LSTM cannot process an
        empty sequence, so this replaces an opaque error from inside the model.
    """
    # Keep only in-vocabulary tokens and map them to their ids
    token_ids = [vocab[token] for token in preprocess(text) if token in vocab]
    if not token_ids:
        raise ValueError("predict() found no in-vocabulary tokens in the given text")

    hidden = model.init_hidden(1)

    # Build the (sequence_length, 1) id tensor directly as integers, on the
    # same device as the hidden state, instead of going through float32 and numpy.
    text_input = torch.tensor(token_ids, dtype=torch.long, device=hidden[0].device).view(-1, 1)

    # Inference only: skip autograd bookkeeping
    with torch.no_grad():
        logps, _ = model(text_input, hidden)

    # The model outputs log-probabilities; exponentiate to get values in [0, 1]
    return torch.exp(logps)

In [22]:
# Smoke test: score one hand-written message on the CPU with dropout disabled.
text = "Google is working on self driving cars, I'm bullish on $goog"
model.to("cpu").eval()
predict(text, model, vocab)



tensor([[7.6556e-05, 1.3917e-02, 6.3105e-03, 6.7737e-01, 3.0233e-01]],
       grad_fn=<ExpBackward>)

In [23]:
# Create the score column as float from the start: the scores written later
# are floats, and writing them into an integer column forces a dtype change.
# Rows that are never scored keep 0.0.
news['score'] = 0.0

In [None]:
# Score one slice of `news` per run. The bounds are positional; slicing the
# index keeps the cell safe when the frame is shorter than the upper bound.
batch_rows = news.index[1050000:1100000]
# Value attached to each of the five classes (-2 ... +2); the score is the
# probability-weighted average of these values.
class_values = np.linspace(-2, 2, 5)

batch_scores = []
for row in tqdm(batch_rows):
    # Read the scalar cells. Selecting with a list (news.loc[i, ['headline']])
    # returns a Series whose str() is its printed repr -- column name, index and
    # dtype included, and the text possibly truncated -- not the text itself.
    parts = [news.loc[row, column] for column in ('headline', 'message_body')]
    text = " ".join(str(part) for part in parts if pd.notna(part))

    # The model cannot score an empty sequence; mark such rows as missing.
    if not any(token in vocab for token in preprocess(text)):
        batch_scores.append(np.nan)
        continue

    probabilities = predict(text, model, vocab).detach().cpu().numpy().ravel()
    batch_scores.append(float(np.dot(probabilities, class_values)))

if len(batch_rows):
    # One label-based assignment instead of chained `news['score'].iloc[i] = ...`,
    # which may write to a temporary copy. Make sure the column can hold floats.
    news['score'] = news['score'].astype(float)
    news.loc[batch_rows, 'score'] = batch_scores

news.to_csv('./data/news1050000to1100000.csv', header=True)
news.head()

  2%|▏         | 1048/50000 [13:45<9:58:16,  1.36it/s] 

In [None]:
# Write the whole `news` frame (including the score column) to CSV and preview it.
# NOTE(review): this repeats the save at the end of the scoring cell above and
# targets the same file; presumably kept to save partial results after an
# interrupted run -- confirm whether it is still needed.
news.to_csv('./data/news1050000to1100000.csv', header=True)
news.head()