In [2]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# numericalization
from collections import Counter

# preprocessing
from string import punctuation
import re

# modeling
from sklearn.model_selection import train_test_split


# neural nets
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential, Input, optimizers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

pd.set_option('display.max_columns', 500)
title_fontsize = 15

In [3]:
df = pd.read_csv('./data/balanced_tokenized_cleaned_stocktwits.csv', 
                 parse_dates=['created_at']).drop('body', axis=1)

In [10]:
df2 = df['raw_content'].str.upper().str.extractall(r'\$(\w+)')[0].reset_index()

In [11]:
df2

Unnamed: 0,level_0,match,0
0,14,0,MSFT
1,14,1,AAPL
2,49,0,AAPL
3,61,0,AAPL
4,103,0,AAPL
...,...,...,...
1422269,1116754,1,LCID
1422270,1911649,0,TSLA
1422271,1899134,0,SPY
1422272,1899134,1,TSLA


In [28]:
df['raw_content'].str.apply(lambda x: x.fin)

Unnamed: 0,created_at,sentiment,raw_content
14,2020-12-15 14:38:18+00:00,0,$MSFT Going right through 214 support as if it...
49,2020-12-15 14:23:04+00:00,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15 14:12:10+00:00,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15 13:33:52+00:00,0,$AAPL always dump dump dump.
106,2020-12-15 13:30:10+00:00,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...
1593006,2022-01-28 15:12:17+00:00,1,$TSLA soar baby soar
1116754,2021-11-09 15:28:11+00:00,1,$TSLA $LCID EV&#39;s getting decimated. Did Br...
1911649,2020-02-26 13:57:07+00:00,1,$TSLA Apparently bears have short term memory
1899134,2022-02-23 18:38:07+00:00,1,$SPY holy shit I bought more calls 5 mins ago ...


In [128]:
df.loc[df2[df2['match'] != 0]['level_0'].unique().tolist()].loc['238']

created_at                    2020-12-15 00:50:14+00:00
sentiment                                             0
raw_content    $PTON\nJust got 3 months free from $AAPL
Name: 238, dtype: object

In [91]:
df2['number_of_tickers_in_comment'] = df2['match'] +1
df2['number_of_tickers_in_comment'].value_counts()

1     928816
2     250177
3     118207
4      72411
5      41023
6       5527
7       2130
8       1207
9        789
10       544
11       391
12       292
13       219
14       153
15       117
16        75
17        62
18        43
19        35
20        22
21        13
22         7
23         3
24         2
25         2
26         2
27         1
28         1
29         1
30         1
31         1
Name: number_of_tickers_in_comment, dtype: int64

In [92]:
df

Unnamed: 0,created_at,sentiment,raw_content
14,2020-12-15 14:38:18+00:00,0,$MSFT Going right through 214 support as if it...
49,2020-12-15 14:23:04+00:00,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15 14:12:10+00:00,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15 13:33:52+00:00,0,$AAPL always dump dump dump.
106,2020-12-15 13:30:10+00:00,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...
1593006,2022-01-28 15:12:17+00:00,1,$TSLA soar baby soar
1116754,2021-11-09 15:28:11+00:00,1,$TSLA $LCID EV&#39;s getting decimated. Did Br...
1911649,2020-02-26 13:57:07+00:00,1,$TSLA Apparently bears have short term memory
1899134,2022-02-23 18:38:07+00:00,1,$SPY holy shit I bought more calls 5 mins ago ...
