In [17]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# numericalization
from collections import Counter

# preprocessing
import re
import nltk
from nltk.corpus import stopwords # will give an altered version later cuz the default isn't great
from string import punctuation
from textblob import TextBlob
from collections import Counter
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# modeling
from sklearn.model_selection import train_test_split

# neural nets
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential, Input, optimizers
from keras.optimizers import Adam

pd.set_option('display.max_columns', 500)
title_fontsize = 15

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/setone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/setone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/setone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/setone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [18]:
df = pd.read_csv('./data/StockTwits_cleaned.csv')

### Cleaning up the Tweets

In [19]:
df['body'] = df['raw_content']

In [20]:
def remove_substring(string, pattern, replacement=''):
    
    substrings_to_remove = re.findall(pattern, string)
    for substring in substrings_to_remove:
        string = string.replace(substring, replacement)
        
    return string

def reduce_repeated_chars(string):
    new_string = ''
    i = 0
    while i < len(string):
        j = i + 1
        while j < len(string) and string[j] == string[i]:
            j += 1
        new_string += string[i] + (string[i] if j - i >= 2 else '')
        i = j
    return new_string

In [21]:
# making everything lowercase
df['body'] = df['body'].apply(lambda x: x.lower())

In [22]:
# removing patterns

website_pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
numbers = '\d+'
usernames = '@[^\s]+'
tickers = '\$[^\s]+'
extra_spaces = '  +'
hashtags = '\$[^\s]+'
next_lines = '\\n'


df['body'] = df['body'].apply(lambda x: remove_substring(x, website_pattern))
df['body'] = df['body'].apply(lambda x: x.encode('ascii', 'ignore').decode()) #emojis
df['body'] = df['body'].apply(lambda x: remove_substring(x, usernames))
df['body'] = df['body'].apply(lambda x: remove_substring(x, tickers))
df['body'] = df['body'].apply(lambda x: reduce_repeated_chars(x))
df['body'] = df['body'].apply(lambda x: remove_substring(x, numbers))
df['body'] = df['body'].apply(lambda x: remove_substring(x, hashtags))
df['body'] = df['body'].apply(lambda x: remove_substring(x, next_lines, ' '))

In [23]:
# removing all punctuation

punct = punctuation + '‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•'
print(punct)
df['body'] = df['body'].apply(lambda x: ''.join([c for c in x if c not in punct]))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•


In [24]:
# removing stop words 

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 
             'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
             'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
             'after', 'to', 'from', 'in', 'out', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
             'such', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 
             'don', 'dont', 'should', 'shouldve', 'now', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 
             'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 
             'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 
             'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt']

df['body'] = df['body'].apply(lambda x: ' '.join(x for x in x.split() if x not in stopwords))

In [25]:
# finishing up by removing extra spaces
df['body'] = df['body'].apply(lambda x: remove_substring(x, extra_spaces, ''))

In [26]:
# removing rows with just white spaces
df = df[~(df['body'].str.contains('^\s$', regex=True))]

In [27]:
# any rows with just one word wont provide enough context, so we'll remove them as well
df = df[(df['body'].str.contains(' '))]

In [28]:
df.head()

Unnamed: 0,created_at,body,sentiment,raw_content
0,2020-12-15T14:40:00Z,moving fast early,1,$AAPL Moving fast early!
1,2020-12-15T14:39:57Z,confirms daily rip,1,$AAPL if confirms daily can rip
2,2020-12-15T14:39:42Z,word got miss chance,1,$AAPL word got out .... don’t miss your chance
3,2020-12-15T14:39:40Z,sold weeklies early lol,1,$AAPL who sold there weeklies to early lol
4,2020-12-15T14:39:36Z,another walk lets see gets gobbled up,1,$AAPL another walk at 126 let’s see if it gets...


In [29]:
# this will take ~24 hours
# # correct spellings

# def spelling_check(text):
#     global idx
    
#     idx += 1
    
#     if idx % 1e3 == 0: print(idx)
        
#     try:
#         result = str(TextBlob(text).correct())
#     except:
#         print(f'failed at {idx}')
        
#     return result

# idx = 0
# df['body'] = df['body'].apply(lambda x: spelling_check(x))

In [30]:
# subsetting a random sample of positive and negative to create a balanced dataset

df_negative = df[df['sentiment'] == 0]
df_positive = df.query("sentiment == 1").sample(n=len(df_negative))

df = pd.concat([df_negative, df_positive])
df.shape

(862570, 4)

In [34]:
lemmatizer = nltk.stem.WordNetLemmatizer()
df['body'] = df['body'].apply(lambda x: lemmatizer.lemmatize(x))
df

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,going right support even superior growth trade,0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,nobody gonna buy expensive ass iphones aint go...,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,robinhood peeps gonna severely disappointed tu...,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,always dump dump dump,0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,turd going anywhere pathetic,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...,...
1942436,2021-05-11T12:36:16Z,january calls look really good totally buying ...,1,$TSLA $NIO $LI $XPEV \n\nJanuary 2023 calls lo...
1212986,2021-03-25T22:44:46Z,joe ohm raised price target,1,$TSLA joe ohm raised price target from $800 to...
1900463,2020-01-22T19:31:24Z,like people discovered electric cars first tim...,1,$TSLA It’s like people have just discovered el...
1213039,2021-03-25T21:05:38Z,yes baby yes,1,$TSLA Yes baby yes


In [35]:
df.to_csv('./data/balanced_untokenized_cleaned_stocktwits.csv', index_label=False)

### Lemmatization + Tokenization

In [466]:
# tokenization + lemmatization 

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

idx = 0

def lemmatize_text(text):
    global idx
    
    idx += 1
    
    if idx % 1e5 == 0: print(f'row {idx} complete')
        
    try: 
        result = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
    except:
        print(f'failed at {idx}')
        
    return result

df['body'] = df['body'].apply(lambda x: lemmatize_text(x))

row 100000 complete
row 200000 complete
row 300000 complete
row 400000 complete
row 500000 complete
row 600000 complete
row 700000 complete
row 800000 complete


In [467]:
df.to_csv('./data/balanced_lemmatized_cleaned_stocktwits.csv', index_label=False)

### Numericalize

In [57]:
tokenizer = Tokenizer()
corpus = df['body'].values
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)+1

In [58]:
# create an int-mapping dictionary
vocab_to_int = tokenizer.word_index

In [59]:
sequences = tokenizer.texts_to_sequences(corpus)
padded_sequences = pad_sequences(sequences, max_words, padding='post')

In [63]:
max_words = max(df['body'].str.len())
padded_sequences = pad_sequences(sequences, max_words, padding='post')

In [64]:
X = pd.DataFrame(padded_sequences)
y = df['sentiment']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=18)

In [66]:
X.to_csv('./data/padded_X.csv', index_label=False)
y.to_csv('./data/padded_y.csv', index_label=False)

### Create Preprocessing Prediction

In [None]:
def remove_substring(string, pattern, replacement=''):
    
    substrings_to_remove = re.findall(pattern, string)
    for substring in substrings_to_remove:
        string = string.replace(substring, replacement)
        
    return string

def reduce_repeated_chars(string):
    new_string = ''
    i = 0
    while i < len(string):
        j = i + 1
        while j < len(string) and string[j] == string[i]:
            j += 1
        new_string += string[i] + (string[i] if j - i >= 2 else '')
        i = j
    return new_string

In [77]:
from keras.utils import pad_sequences

websites = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
numbers = '\d+'
usernames = '@[^\s]+'
tickers = '\$[^\s]+'
extra_spaces = '  +'
hashtags = '\$[^\s]+'
next_lines = '\\n'

punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•" + '"'

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 
             'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
             'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
             'after', 'to', 'from', 'in', 'out', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
             'such', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 
             'don', 'dont', 'should', 'shouldve', 'now', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 
             'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 
             'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 
             'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt']




In [78]:
def preprocess(text):
    '''
    preprocess the text to input into model
    '''
    
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    # make texts lowercase
    text = text.lower()
    
    # remove websites and usernames, if exist
    text = re.sub(websites, '', text)
    text = re.sub(usernames, '', text)
    text = re.sub(numbers, '', text)
    text = re.sub(tickers, '', text)
    text = re.sub(hashtags, '', text)
    text = re.sub(next_lines, '', text)
    
    # remove punctuation
    text = ''.join([x for x in text if x not in punctuation])
    
    # remove stop words
    text = ' '.join(text.lower() for text in text.split() if text not in stopwords)
    
    # remove additional spaces
    text = re.sub(extra_spaces, '', text)
    
    # lemmatize & tokenize
    text = [lemmatizer.lemmatize(x) for x in w_tokenizer.tokenize(text)]
    
    # numericalize
    text_int = []
    text_int.append([dct[word] for word in text])
    
    return text_int

In [79]:
text = 'hello'
pad_sequences(preprocess(text), 31, padding='post')

NameError: name 'dct' is not defined

In [80]:
vocab_to_int

{'up': 1,
 'today': 2,
 'down': 3,
 'buy': 4,
 'going': 5,
 'go': 6,
 'tomorrow': 7,
 'like': 8,
 'get': 9,
 'tesla': 10,
 'day': 11,
 'market': 12,
 'back': 13,
 'bears': 14,
 'puts': 15,
 'next': 16,
 'stock': 17,
 'see': 18,
 'short': 19,
 'bulls': 20,
 'sell': 21,
 'week': 22,
 'time': 23,
 'no': 24,
 'calls': 25,
 'lol': 26,
 'good': 27,
 'lets': 28,
 'over': 29,
 'money': 30,
 'im': 31,
 'elon': 32,
 'coming': 33,
 'still': 34,
 'big': 35,
 'buying': 36,
 'price': 37,
 'long': 38,
 'green': 39,
 'close': 40,
 'one': 41,
 'red': 42,
 'think': 43,
 'gonna': 44,
 'soon': 45,
 'dip': 46,
 'apple': 47,
 'bought': 48,
 'shares': 49,
 'drop': 50,
 'new': 51,
 'last': 52,
 'news': 53,
 'people': 54,
 'come': 55,
 'earnings': 56,
 'right': 57,
 'would': 58,
 'hold': 59,
 'selling': 60,
 'way': 61,
 'run': 62,
 'open': 63,
 'shorts': 64,
 'take': 65,
 'make': 66,
 'even': 67,
 'got': 68,
 'know': 69,
 'looking': 70,
 'cant': 71,
 'company': 72,
 'keep': 73,
 'another': 74,
 'end': 75,
 'br