In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# numericalization
from collections import Counter

# preprocessing
from string import punctuation
import re
import nltk
nltk.download('wordnet')

# modeling
from sklearn.model_selection import train_test_split

# neural nets
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential, Input, optimizers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from tensorflow import keras

pd.set_option('display.max_columns', 500)
title_fontsize = 15

[nltk_data] Downloading package wordnet to /Users/setone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/balanced_untokenized_cleaned_stocktwits.csv')

In [None]:
tokenizer = Tokenizer()
corpus = df['body'].values
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)+1

In [None]:
# create an int-mapping dictionary
vocab_to_int = tokenizer.word_index

max_padding = 25
sequences = tokenizer.texts_to_sequences(corpus)
padded_sequences = pad_sequences(sequences, max_padding, padding='post')

In [None]:
X = pd.read_csv('/content/drive/MyDrive/padded_X.csv')
y = pd.read_csv('/content/drive/MyDrive/padded_y.csv')

In [None]:
# X = pd.read_csv('/content/drive/MyDrive/padded_X.csv')
# y = pd.read_csv('/content/drive/MyDrive/padded_y.csv')

# print(X.shape, y.shape)

X = pd.DataFrame(padded_sequences)
y = df['sentiment']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=18)

In [None]:
model = Sequential()

embedding_vector_length = 300
model.add(Embedding(5000, embedding_vector_length, input_length=31)) 
model.add(LSTM(100)) 
model.add(Dense(1, activation='sigmoid')) 
Adam = optimizers.Adam(learning_rate=0.0005)
model.compile(loss='binary_crossentropy', optimizer=Adam, metrics=['accuracy']) 
# early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 31, 300)           1500000   
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1,660,501
Trainable params: 1,660,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f2c06dccb50>

In [None]:
for lr in [0.0005, 0.0003, 0.0001, 0.009, 0.007, 0.005, 0.003]:
  for bs in [1024, 512, 256, 128, 64, 32]:
    print(f'batch size: {bs}, learning rate: {lr}')
    
    model = Sequential()

    embedding_vector_length = 300
    model.add(Embedding(5000, embedding_vector_length, input_length=149)) 
    model.add(LSTM(100)) 
    model.add(Dense(1, activation='sigmoid')) 
    Adam = optimizers.Adam(learning_rate=lr)
    model.compile(loss='binary_crossentropy', optimizer=Adam, metrics=['accuracy']) 
    # model.summary()

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=bs)
    # score = model.evaluate(X_test, y_test, verbose=0)
    # print(f'     score: {score[0]}, loss: {score[1]}')

In [None]:
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Test loss: 0.6176503300666809
Test accuracy: 0.7711316347122192


In [None]:
# run preprocess function down below first

text = 'im bullish about this stock'
print(text, model.predict(pad_sequences(preprocess(text), 31, padding='post')))

text = 'im bearish about this stock'
print(text, model.predict(pad_sequences(preprocess(text), 31, padding='post')))

text = 'short this stock'
print(text, model.predict(pad_sequences(preprocess(text), 31, padding='post')))

text = 'i just lost all my money'
print(text, model.predict(pad_sequences(preprocess(text), 31, padding='post')))

text = 'oh crap im broke'
print(text, model.predict(pad_sequences(preprocess(text), 31, padding='post')))

im bullish about this stock [[0.17795417]]
im bearish about this stock [[0.67036855]]
short this stock [[0.9892418]]
i just lost all my money [[0.5148862]]
oh crap im broke [[0.10336161]]


In [None]:
model = keras.models.load_model('/content/drive/MyDrive/LSTM_model_0005LR.h5')

In [None]:
model.save('/content/drive/MyDrive/LSTM_model_0005LR.h5')

In [None]:
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from keras.utils import pad_sequences

dct = pd.read_csv('/content/drive/MyDrive/vocab_words.csv').to_dict(orient='records')[0]

websites = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
numbers = '\d+'
usernames = '@[^\s]+'
tickers = '\$[^\s]+'
extra_spaces = '  +'
hashtags = '\$[^\s]+'
next_lines = '\\n'

punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•" + '"'

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 
             'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
             'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
             'after', 'to', 'from', 'in', 'out', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
             'such', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 
             'don', 'dont', 'should', 'shouldve', 'now', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 
             'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 
             'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 
             'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt', 'v', 'rn', 'lt', 'y', 'g', 'w', 
             'wk', 'sp', 'em', 'r', 'vs', 'd', 'ai', 't', 'mm', 'st', 'gt', 'n', 'id', 'p', 'f', 'm', 'b', 'c', 
             'pe', 'th', 'q', 'x', 'fb', 'ah', 'ill', 'u', 'oh', 'er', 'k', 's', 'im']



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
def preprocess(text):
    '''
    preprocess the text to input into model
    '''
    
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    # make texts lowercase
    text = text.lower()
    
    # remove websites and usernames, if exist
    text = re.sub(websites, '', text)
    text = re.sub(usernames, '', text)
    text = re.sub(numbers, '', text)
    text = re.sub(tickers, '', text)
    text = re.sub(hashtags, '', text)
    text = re.sub(next_lines, '', text)
    
    # remove punctuation
    text = ''.join([x for x in text if x not in punctuation])
    
    # remove additional characters down to 2
    text = re.sub(re.compile(r'(\w)\1+'), r'\1\1', text)
    
    # remove stop words
    text = ' '.join(text.lower() for text in text.split() if text not in stopwords)
    
    # remove additional spaces
    text = re.sub(extra_spaces, '', text)
    
    # lemmatize & tokenize
    text = [lemmatizer.lemmatize(x) for x in w_tokenizer.tokenize(text)]
    
    # numericalize
    text_int = []
    text_int.append([dct.get(word, 0) for word in text])
    
    return text_int