<a href="https://colab.research.google.com/github/hossein20s/tutorial/blob/master/Modeling_Stock_Market_Sentiment_with_LSTMs_and_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

print(tf.__version__)


1.13.1


In [2]:
import sys
sys.path.append('/content/gdrive/My Drive/python-lib')

from google.colab import drive
drive.mount('/content/gdrive')

persist_path = "gdrive/My Drive/models/modeling_stock_market"


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:

import pandas as pd

data = pd.read_csv("https://github.com/GarrettHoffman/lstm-oreilly/raw/master/data/StockTwits_SPY_Sentiment_2017.gz",
                   encoding="utf-8",
                   compression="gzip",
                   index_col=0)

# get messages and sentiment labels
messages = data.message.values
labels = data.sentiment.values

# View sample of messages with sentiment

for i in range(10):
    print("Messages: {}...".format(messages[i]),
          "Sentiment: {}".format(labels[i]))

Messages: $SPY crazy day so far!... Sentiment: bearish
Messages: $SPY Will make a new ATH this week. Watch it!... Sentiment: bullish
Messages: $SPY $DJIA white elephant in room is $AAPL. Up 14% since election. Strong headwinds w/Trump trade & Strong dollar. How many 7's do you see?... Sentiment: bearish
Messages: $SPY blocks above. We break above them We should push to double top... Sentiment: bullish
Messages: $SPY Nothing happening in the market today, guess I'll go to the store and spend some $.... Sentiment: bearish
Messages: $SPY What an easy call. Good jobs report: good economy, markets go up.  Bad jobs report: no more rate hikes, markets go up.  Win-win.... Sentiment: bullish
Messages: $SPY BS market.... Sentiment: bullish
Messages: $SPY this rally all the cheerleaders were screaming about this morning is pretty weak. I keep adding 2 my short at all spikes... Sentiment: bearish
Messages: $SPY Dollar ripping higher!... Sentiment: bearish
Messages: $SPY no reason to go down !... S

In [0]:
import numpy as np
import utils as utl


# make everything lower case and remove punctuation
messages = np.array([utl.preprocess_ST_message(message) for message in messages])

# keeping a collection of our full vocabularly and creating a mapping of each word to a unique index

full_lexicon = " ".join(messages).split()
vocab_to_int, int_to_vocab = utl.create_lookup_tables(full_lexicon)

In [5]:
from collections import Counter

messages_lens = Counter([len(x) for x in messages])
print("Zero-length messages: {}".format(messages_lens[0]))
print("Maximum message length: {}".format(max(messages_lens)))
print("Average message length: {}".format(np.mean([len(x) for x in messages])))

Zero-length messages: 1
Maximum message length: 244
Average message length: 78.21856920395598


In [0]:
messages, labels = utl.drop_empty_messages(messages, labels)

messages = utl.encode_ST_messages(messages, vocab_to_int)
labels = utl.encode_ST_labels(labels)
messages = utl.zero_pad_messages(messages, seq_len=244)

In [7]:
train_x, val_x, test_x, train_y, val_y, test_y = utl.train_val_test_split(messages, labels, split_frac=0.80)

print("Data Set Size")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

Data Set Size
Train set: 		(77572, 244) 
Validation set: 	(9697, 244) 
Test set: 		(9697, 244)


In [0]:
from utils import model_inputs
from utils import build_embedding_layer
from utils import build_lstm_layers
from utils import build_cost_fn_and_opt
from utils import build_accuracy


def build_and_train_network(model_dir, lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                            learning_rate, keep_prob, train_x, val_x, train_y, val_y):
    
    inputs_, labels_, keep_prob_ = model_inputs()
    embed = build_embedding_layer(inputs_, vocab_size, embed_size)
    initial_state, lstm_outputs, lstm_cell, final_state = build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size)
    predictions, loss, optimizer = build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate)
    accuracy = build_accuracy(predictions, labels_)
    
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        
        sess.run(tf.global_variables_initializer())
        n_batches = len(train_x)//batch_size
        for e in range(epochs):
            state = sess.run(initial_state)
            
            train_acc = []
            for ii, (x, y) in enumerate(utl.get_batches(train_x, train_y, batch_size), 1):
                feed = {inputs_: x,
                        labels_: y[:, None],
                        keep_prob_: keep_prob,
                        initial_state: state}
                loss_, state, _,  batch_acc = sess.run([loss, final_state, optimizer, accuracy], feed_dict=feed)
                train_acc.append(batch_acc)
                
                if (ii + 1) % n_batches == 0:
                    
                    val_acc = []
                    val_state = sess.run(lstm_cell.zero_state(batch_size, tf.float32))
                    for xx, yy in utl.get_batches(val_x, val_y, batch_size):
                        feed = {inputs_: xx,
                                labels_: yy[:, None],
                                keep_prob_: 1,
                                initial_state: val_state}
                        val_batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                        val_acc.append(val_batch_acc)
                    
                    print("Epoch: {}/{}...".format(e+1, epochs),
                          "Batch: {}/{}...".format(ii+1, n_batches),
                          "Train Loss: {:.3f}...".format(loss_),
                          "Train Accruacy: {:.3f}...".format(np.mean(train_acc)),
                          "Val Accuracy: {:.3f}".format(np.mean(val_acc)))
    
        saver.save(sess, model_dir + '/sentiment.ckpt')

In [11]:
# Define Inputs and Hyperparameters
lstm_sizes = [128, 64]
vocab_size = len(vocab_to_int) + 1 #add one for padding
embed_size = 300
epochs = 3
batch_size = 256
learning_rate = 0.1
keep_prob = 0.5

with tf.Graph().as_default():
    build_and_train_network(persist_path, lstm_sizes, vocab_size, embed_size, epochs, batch_size,
                            learning_rate, keep_prob, train_x, val_x, train_y, val_y)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Use tf.cast instead.
Epoch: 1/3... Batch: 303/303... Train Loss: 0.244... Train Accruacy: 0.549... Val Accuracy: 0.574
Epoch: 2/3... Batch: 3

In [12]:
def test_network(model_dir, batch_size, test_x, test_y):
    
    inputs_, labels_, keep_prob_ = model_inputs()
    embed = build_embedding_layer(inputs_, vocab_size, embed_size)
    initial_state, lstm_outputs, lstm_cell, final_state = build_lstm_layers(lstm_sizes, embed, keep_prob_, batch_size)
    predictions, loss, optimizer = build_cost_fn_and_opt(lstm_outputs, labels_, learning_rate)
    accuracy = build_accuracy(predictions, labels_)
    
    saver = tf.train.Saver()
    
    test_acc = []
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint(model_dir))
        test_state = sess.run(lstm_cell.zero_state(batch_size, tf.float32))
        for ii, (x, y) in enumerate(utl.get_batches(test_x, test_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob_: 1,
                    initial_state: test_state}
            batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
            test_acc.append(batch_acc)
        print("Test Accuracy: {:.3f}".format(np.mean(test_acc)))

with tf.Graph().as_default():
    test_network(persist_path, batch_size, test_x, test_y)
   

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from gdrive/My Drive/models/modeling_stock_market/sentiment.ckpt
Test Accuracy: 0.607


# New Section