In [2]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# numericalization
from collections import Counter

# preprocessing
import re
import nltk
from nltk.corpus import stopwords # will give an altered version later cuz the default isn't great
from string import punctuation
# from textblob import TextBlob
from collections import Counter
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# modeling
from sklearn.model_selection import train_test_split

# neural nets
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential, Input, optimizers
from keras.optimizers import Adam

pd.set_option('display.max_columns', 500)
title_fontsize = 15

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/setone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/setone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/setone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/setone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# df = pd.read_csv('./data/StockTwits_cleaned.csv')
df = pd.read_csv('./data/large_datafiles/Tweet.csv')

In [4]:
df

Unnamed: 0,tweet_id,writer,post_date,body,comment_num,retweet_num,like_num
0,550441509175443456,VisualStockRSRC,1420070457,"lx21 made $10,008 on $AAPL -Check it out! htt...",0,0,1
1,550441672312512512,KeralaGuy77,1420070496,Insanity of today weirdo massive selling. $aap...,0,0,0
2,550441732014223360,DozenStocks,1420070510,S&P100 #Stocks Performance $HD $LOW $SBUX $TGT...,0,0,0
3,550442977802207232,ShowDreamCar,1420070807,$GM $TSLA: Volkswagen Pushes 2014 Record Recal...,0,0,1
4,550443807834402816,i_Know_First,1420071005,Swing Trading: Up To 8.91% Return In 14 Days h...,0,0,1
...,...,...,...,...,...,...,...
3717959,1212159765914079234,TEEELAZER,1577836383,That $SPY $SPX puuump in the last hour was the...,1,0,6
3717960,1212159838882533376,ShortingIsFun,1577836401,In 2020 I may start Tweeting out positive news...,0,0,1
3717961,1212160015332728833,Commuternyc,1577836443,Patiently Waiting for the no twitter sitter tw...,0,0,5
3717962,1212160410692046849,MoriaCrypto,1577836537,I don't discriminate. I own both $aapl and $ms...,1,0,1


### Cleaning up the Tweets

In [3]:
df['body'] = df['raw_content']

In [4]:
def remove_substring(string, pattern, replacement=''):
    
    substrings_to_remove = re.findall(pattern, string)
    for substring in substrings_to_remove:
        string = string.replace(substring, replacement)
        
    return string

def reduce_repeated_chars(string):
    new_string = ''
    i = 0
    while i < len(string):
        j = i + 1
        while j < len(string) and string[j] == string[i]:
            j += 1
        new_string += string[i] + (string[i] if j - i >= 2 else '')
        i = j
    return new_string

In [5]:
# making everything lowercase
df['body'] = df['body'].apply(lambda x: x.lower())

In [None]:
# removing patterns

website_pattern = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
numbers = '\d+'
usernames = '@[^\s]+'
tickers = '\$[^\s]+'
extra_spaces = '  +'
hashtags = '\$[^\s]+'
next_lines = '\\n'


df['body'] = df['body'].apply(lambda x: remove_substring(x, website_pattern))
df['body'] = df['body'].apply(lambda x: x.encode('ascii', 'ignore').decode()) #emojis
df['body'] = df['body'].apply(lambda x: remove_substring(x, usernames))
df['body'] = df['body'].apply(lambda x: remove_substring(x, tickers))
df['body'] = df['body'].apply(lambda x: reduce_repeated_chars(x))
df['body'] = df['body'].apply(lambda x: remove_substring(x, numbers))
df['body'] = df['body'].apply(lambda x: remove_substring(x, hashtags))
df['body'] = df['body'].apply(lambda x: remove_substring(x, next_lines, ' '))

In [153]:
# removing all punctuation

punct = punctuation + '‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•'
print(punct)
df['body'] = df['body'].apply(lambda x: ''.join([c for c in x if c not in punct]))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•


In [89]:
# removing stop words 

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 
             'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
             'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
             'after', 'to', 'from', 'in', 'out', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
             'such', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 
             'don', 'dont', 'should', 'shouldve', 'now', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 
             'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 
             'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 
             'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt', 'v', 'rn', 'lt', 'y', 'g', 'w', 
             'wk', 'sp', 'em', 'r', 'vs', 'd', 'ai', 't', 'mm', 'st', 'gt', 'n', 'id', 'p', 'f', 'm', 'b', 'c', 
             'pe', 'th', 'q', 'x', 'fb', 'ah', 'ill', 'u', 'oh', 'er', 'k', 's', 'im']

df['body'] = df['body'].apply(lambda x: ' '.join(x for x in x.split() if x not in stopwords))

In [154]:
# finishing up by removing extra spaces
df['body'] = df['body'].apply(lambda x: remove_substring(x, extra_spaces, ''))

In [155]:
# removing rows with just white spaces
df = df[~(df['body'].str.contains('^\s$', regex=True))]

In [156]:
# any rows with just one word wont provide enough context, so we'll remove them as well
df = df[(df['body'].str.contains(' '))]

In [157]:
df.head()

Unnamed: 0,created_at,body,sentiment,raw_content
0,2020-12-15T14:40:00Z,moving fast early,1,$AAPL Moving fast early!
1,2020-12-15T14:39:57Z,if confirms daily can rip,1,$AAPL if confirms daily can rip
2,2020-12-15T14:39:42Z,word got outdont miss your chance,1,$AAPL word got out .... don’t miss your chance
3,2020-12-15T14:39:40Z,who sold there weeklies to early lol,1,$AAPL who sold there weeklies to early lol
4,2020-12-15T14:39:36Z,another walk atlets see if it gets gobbled up...,1,$AAPL another walk at 126 let’s see if it gets...


In [158]:
# this will take ~24 hours
# # correct spellings

# def spelling_check(text):
#     global idx
    
#     idx += 1
    
#     if idx % 1e3 == 0: print(idx)
        
#     try:
#         result = str(TextBlob(text).correct())
#     except:
#         print(f'failed at {idx}')
        
#     return result

# idx = 0
# df['body'] = df['body'].apply(lambda x: spelling_check(x))

In [159]:
# subsetting a random sample of positive and negative to create a balanced dataset

df_negative = df[df['sentiment'] == 0]
df_positive = df.query("sentiment == 1").sample(n=len(df_negative))

df = pd.concat([df_negative, df_positive])
df.shape

(928816, 4)

In [160]:
# lemmatizing words

lemmatizer = nltk.stem.WordNetLemmatizer()
df['body'] = df['body'].apply(lambda x: lemmatizer.lemmatize(x))
df

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,going right throughsupport as if it isnt even...,0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,nobody gonna buy expensive ass iphones when t...,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,robinhood peeps gonna be severely disappointe...,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,always dump dump dump,0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,why is this turd not going anywhere this is p...,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...,...
1593006,2022-01-28T15:12:17Z,soar baby soar,1,$TSLA soar baby soar
1116754,2021-11-09T15:28:11Z,evs getting decimated did brandon shit his pan...,1,$TSLA $LCID EV&#39;s getting decimated. Did Br...
1911649,2020-02-26T13:57:07Z,apparently bears have short term memory,1,$TSLA Apparently bears have short term memory
1899134,2022-02-23T18:38:07Z,holy shit i bought more callsmins ago and im ...,1,$SPY holy shit I bought more calls 5 mins ago ...


In [1]:
import pandas as pd

In [7]:
pd.read_csv('./data/preprocessing/balanced_untokenized_cleaned_stocktwits.csv')

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,going right throughsupport as if it isnt even...,0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,nobody gonna buy expensive ass iphones when t...,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,robinhood peeps gonna be severely disappointe...,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,always dump dump dump,0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,why is this turd not going anywhere this is p...,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...,...
1593006,2022-01-28T15:12:17Z,soar baby soar,1,$TSLA soar baby soar
1116754,2021-11-09T15:28:11Z,evs getting decimated did brandon shit his pan...,1,$TSLA $LCID EV&#39;s getting decimated. Did Br...
1911649,2020-02-26T13:57:07Z,apparently bears have short term memory,1,$TSLA Apparently bears have short term memory
1899134,2022-02-23T18:38:07Z,holy shit i bought more callsmins ago and im ...,1,$SPY holy shit I bought more calls 5 mins ago ...


In [161]:
df.to_csv('./data/balanced_untokenized_cleaned_stocktwits.csv', index_label=False)

### Lemmatization + Tokenization

In [162]:
# tokenize words

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
df['body'] = df['body'].apply(lambda x: w_tokenizer.tokenize(x))

In [163]:
df.to_csv('./data/balanced_tokenized_cleaned_stocktwits.csv', index_label=False)

### Numericalize

In [164]:
tokenizer = Tokenizer()
corpus = df['body'].values
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index)+1

In [165]:
# create an int-mapping dictionary
vocab_to_int = tokenizer.word_index

In [166]:
sequences = tokenizer.texts_to_sequences(corpus)
padded_sequences = pad_sequences(sequences, 31, padding='post')

In [167]:
X = pd.DataFrame(padded_sequences)
y = df['sentiment']

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=18)

In [169]:
X.to_csv('./data/padded_X.csv', index_label=False)
y.to_csv('./data/padded_y.csv', index_label=False)
pd.DataFrame([vocab_to_int]).to_csv('./data/vocab_words.csv', index_label=False)

### Create Preprocessing Prediction

In [144]:
import re
from keras.utils import pad_sequences

dct = pd.read_csv('./data/vocab_words.csv').to_dict(orient='records')[0]

websites = r'(https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}[-a-zA-Z0-9()@:%_+.~#?&/=]*)'
numbers = '\d+'
usernames = '@[^\s]+'
tickers = '\$[^\s]+'
extra_spaces = '  +'
hashtags = '\$[^\s]+'
next_lines = '\\n'

punctuation = "!#$%&'()*+,-./:;<=>?@[\]^_`{|}~‘’“”1234567890…ðŸ‘‰ðŸ‘ŒðŸ’¦âœ¨✰♡*•˛❤•" + '"'

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 
             'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 
             'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 
             'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 
             'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 
             'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 
             'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 
             'after', 'to', 'from', 'in', 'out', 'on', 'off', 'again', 'further', 'then', 'once', 'here', 'there', 
             'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 
             'such', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'can', 'will', 'just', 
             'don', 'dont', 'should', 'shouldve', 'now', 'ain', 'aren', 'arent', 'couldn', 'couldnt', 'didn', 
             'didnt', 'doesn', 'doesnt', 'hadn', 'hadnt', 'hasn', 'hasnt', 'haven', 'havent', 'isn', 'isnt', 'ma', 
             'mightn', 'mightnt', 'mustn', 'mustnt', 'needn', 'neednt', 'shan', 'shant', 'shouldn', 'shouldnt', 
             'wasn', 'wasnt', 'weren', 'werent', 'won', 'wont', 'wouldn', 'wouldnt', 'v', 'rn', 'lt', 'y', 'g', 'w', 
             'wk', 'sp', 'em', 'r', 'vs', 'd', 'ai', 't', 'mm', 'st', 'gt', 'n', 'id', 'p', 'f', 'm', 'b', 'c', 
             'pe', 'th', 'q', 'x', 'fb', 'ah', 'ill', 'u', 'oh', 'er', 'k', 's', 'im']




In [145]:
def preprocess(text):
    '''
    preprocess the text to input into model
    '''
    
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    
    # make texts lowercase
    text = text.lower()
    
    # remove websites and usernames, if exist
    text = re.sub(websites, '', text)
    text = re.sub(usernames, '', text)
    text = re.sub(numbers, '', text)
    text = re.sub(tickers, '', text)
    text = re.sub(hashtags, '', text)
    text = re.sub(next_lines, '', text)
    
    # remove punctuation
    text = ''.join([x for x in text if x not in punctuation])
    
    # remove additional characters down to 2
    text = re.sub(re.compile(r'(\w)\1+'), r'\1\1', text)
    
    # remove stop words
    text = ' '.join(text.lower() for text in text.split() if text not in stopwords)
    
    # remove additional spaces
    text = re.sub(extra_spaces, '', text)
    
    # lemmatize & tokenize
    text = [lemmatizer.lemmatize(x) for x in w_tokenizer.tokenize(text)]
    
    # numericalize
    text_int = []
    text_int.append([dct[word] for word in text])
    
    return text_int

In [146]:
text = 'hello'
pad_sequences(preprocess(text), 31, padding='post')

array([[1119,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int32)