In [100]:
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D


# numericalization
from collections import Counter

# preprocessing
import re
import nltk
from nltk.corpus import stopwords # will give an altered version later cuz the default isn't great
from string import punctuation
# from textblob import TextBlob
from collections import Counter
# Make sure every NLTK resource this notebook asks for has been downloaded.
for resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(resource)

# modeling
from sklearn.model_selection import train_test_split

# neural nets
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import Sequential, Input, optimizers
from keras.optimizers import Adam
from transformers import (
    RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, 
    BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW
)

# Notebook-wide settings: let pandas render up to 500 columns before truncating,
# and keep one shared font size for plot titles.
pd.options.display.max_columns = 500
title_fontsize = 15

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/setone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/setone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/setone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/setone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [131]:
# Read both CSV files. `.iloc[:, 1:]` discards the first column of each file
# (presumably an index column written out on save -- TODO confirm).
df = pd.read_csv('./data/all_stock_sentiment.csv')
df = df.iloc[:, 1:].drop_duplicates()  # remove fully identical rows

stocks = pd.read_csv('./data/scraped_stocks/scraped_stock_2015_2022.csv')
stocks = stocks.iloc[:, 1:]

In [132]:
# Parse the Date column and keep only the calendar date (time of day dropped),
# so rows can later be grouped per day.
df['Date'] = pd.to_datetime(df['Date']).dt.date

In [134]:
# Number of rows per sentiment label in the loaded frame.
df['sentiment'].value_counts()

Bullish    2488895
Bearish    1054403
Name: sentiment, dtype: int64

In [103]:
# Collapse to one row per (Date, ticker, sentiment); every remaining column
# then holds the number of non-null entries in that group.
# NOTE(review): `df` is rebound to the aggregate, so running this cell a second
# time would count the already-aggregated rows instead of the raw ones.
df = (
    df
    .groupby(['Date', 'ticker', 'sentiment'], as_index=False)
    .count()
)
df

Unnamed: 0,Date,ticker,sentiment,Content
0,2014-12-31,BX,Bearish,1
1,2014-12-31,COST,Bullish,1
2,2014-12-31,CRM,Bullish,1
3,2014-12-31,DIS,Bullish,2
4,2014-12-31,KO,Bearish,5
...,...,...,...,...
52792,2022-09-29,PYPL,Bearish,1
52793,2022-09-29,TSLA,Bearish,33
52794,2022-09-29,TSLA,Bullish,79
52795,2022-09-29,TSM,Bearish,8


In [104]:
# Work on an independent copy of the rows belonging to a single ticker.
company = 'AAPL'

data = df.loc[df['ticker'] == company].copy()
data

Unnamed: 0,Date,ticker,sentiment,Content
10,2015-01-01,AAPL,Bearish,55
11,2015-01-01,AAPL,Bullish,225
27,2015-01-02,AAPL,Bearish,248
28,2015-01-02,AAPL,Bullish,561
43,2015-01-03,AAPL,Bearish,66
...,...,...,...,...
52740,2022-09-27,AAPL,Bullish,3
52756,2022-09-28,AAPL,Bearish,37
52757,2022-09-28,AAPL,Bullish,24
52777,2022-09-29,AAPL,Bearish,10


In [115]:
# Sum the per-sentiment counts into one total per date ...
volume = data.groupby('Date')['Content'].sum().reset_index()
# ... and attach that total, renamed to `Volume`, to every row of the same date.
df = pd.merge(volume.rename(columns={'Content': 'Volume'}), data, on='Date')
df.head()

Unnamed: 0,Date,Volume,ticker,sentiment,Content
0,2015-01-01,280,AAPL,Bearish,55
1,2015-01-01,280,AAPL,Bullish,225
2,2015-01-02,809,AAPL,Bearish,248
3,2015-01-02,809,AAPL,Bullish,561
4,2015-01-03,231,AAPL,Bearish,66


In [118]:
# Fraction of the date's total count that belongs to this row's sentiment.
df['Value'] = df['Content'].div(df['Volume'])
df

Unnamed: 0,Date,Volume,ticker,sentiment,Content,Value
0,2015-01-01,280,AAPL,Bearish,55,0.196429
1,2015-01-01,280,AAPL,Bullish,225,0.803571
2,2015-01-02,809,AAPL,Bearish,248,0.306551
3,2015-01-02,809,AAPL,Bullish,561,0.693449
4,2015-01-03,231,AAPL,Bearish,66,0.285714
...,...,...,...,...,...,...
4587,2022-09-27,4,AAPL,Bullish,3,0.750000
4588,2022-09-28,61,AAPL,Bearish,37,0.606557
4589,2022-09-28,61,AAPL,Bullish,24,0.393443
4590,2022-09-29,20,AAPL,Bearish,10,0.500000


In [82]:
# Same enrichment applied to `data`: attach the per-date total as `Volume`,
# then express each row's count as a fraction of that total.
# No `on=` is given, so the join uses whatever columns both frames share.
data = volume.rename(columns={'Content': 'Volume'}).merge(data)
data['Value'] = data['Content'].div(data['Volume'])
data

Unnamed: 0,Date,Volume,ticker,sentiment,Content,Value
0,2015-01-01,280,AAPL,Bearish,55,0.196429
1,2015-01-01,280,AAPL,Bullish,225,0.803571
2,2015-01-02,809,AAPL,Bearish,248,0.306551
3,2015-01-02,809,AAPL,Bullish,561,0.693449
4,2015-01-03,231,AAPL,Bearish,66,0.285714
...,...,...,...,...,...,...
4587,2022-09-27,4,AAPL,Bullish,3,0.750000
4588,2022-09-28,61,AAPL,Bearish,37,0.606557
4589,2022-09-28,61,AAPL,Bullish,24,0.393443
4590,2022-09-29,20,AAPL,Bearish,10,0.500000


In [130]:
# Display the current contents of `df`.
# NOTE(review): the saved output has `Value` right after `Date` and far fewer
# rows than the earlier share table, which matches the result of the per-date
# max/merge cell -- presumably this cell was executed after it; confirm with
# Restart & Run All.
df

Unnamed: 0,Date,Value,Volume,ticker,sentiment,Content
0,2015-01-01,0.803571,280,AAPL,Bullish,225
1,2015-01-02,0.693449,809,AAPL,Bullish,561
2,2015-01-03,0.714286,231,AAPL,Bullish,165
3,2015-01-04,0.737037,270,AAPL,Bullish,199
4,2015-01-05,0.656566,891,AAPL,Bullish,585
...,...,...,...,...,...,...
2508,2022-09-26,0.727273,11,AAPL,Bullish,8
2509,2022-09-27,0.750000,4,AAPL,Bullish,3
2510,2022-09-28,0.606557,61,AAPL,Bearish,37
2511,2022-09-29,0.500000,20,AAPL,Bearish,10


In [127]:
# For each date find the largest Value, then keep only the rows of `df` whose
# (Value, Date) pair matches it.
# NOTE(review): when several rows of a date share the maximum (e.g. a 0.5/0.5
# split) all of them survive the inner join, so that date appears more than once.
df2 = df.groupby('Date')['Value'].max().reset_index()
df = pd.merge(df2, df, how='inner', on=['Value', 'Date'])

In [129]:
# Number of rows per sentiment label in the current `df`.
df['sentiment'].value_counts()

Bullish    2336
Bearish     177
Name: sentiment, dtype: int64