In [1]:
# upload sentiment_preprocessing.py if using colab
from sentiment_preprocessing import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
future_days = 1
num_classes = 2
threshold = 0

In [3]:
# cutoff data with no future date
cutoff_date = (pd.to_datetime("2022-09-28", utc=True) - pd.Timedelta(days=future_days)).date()

# convert date column from string to datetime
# convert times to just a date to simplify closing price query
tweets_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_tweets.csv")
tweets_df['Date'] = pd.to_datetime(tweets_df['Date'], utc=True).dt.date
tweets_df = tweets_df[tweets_df['Date'] < cutoff_date]

stocks_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_yfinance_data.csv")
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'], utc=True).dt.date

Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_tweets.csv...


100%|██████████| 6.44M/6.44M [00:01<00:00, 4.77MB/s]

Extracting zip of stock_tweets.csv...





Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_yfinance_data.csv...


100%|██████████| 696k/696k [00:01<00:00, 586kB/s]


In [4]:
print(tweets_df.head())

           Date                                              Tweet Stock Name  \
272  2022-09-26  $TSLA Since 2019, legacy auto has made up for ...       TSLA   
273  2022-09-26  @RealDanODowd @elonmusk @Tesla Don’t misrepres...       TSLA   
274  2022-09-26  @EvasTeslaSPlaid I sold my Porsche Cayenne Tur...       TSLA   
275  2022-09-26  Investing is like swimming, sometimes the tide...       TSLA   
276  2022-09-26  HOW TO TRADE $SPY , $SPX , $QQQ , $TSLA (70+ P...       TSLA   

    Company Name  
272  Tesla, Inc.  
273  Tesla, Inc.  
274  Tesla, Inc.  
275  Tesla, Inc.  
276  Tesla, Inc.  


In [5]:
print(tweets_df.iloc[0]['Tweet'])

$TSLA Since 2019, legacy auto has made up for sinking unit sales by increasing ASPs by $10,000 and lowering incentives by $5,000. That’s a 40% margin trick (!), but it’s temporary: ICE sales will continue to sink, and legacy auto can no longer increase prices or lower incentives: https://t.co/E1mQIftbiL


In [6]:
# clean the tweets
tweets_df['Tweet'] = tweets_df['Tweet'].apply(clean_text)

In [7]:
# add original index column for future df manipulation
tweets_df['index'] = tweets_df.index

In [8]:
tweets_df

Unnamed: 0,Date,Tweet,Stock Name,Company Name,index
272,2022-09-26,tsla since 2019 legacy auto made sinking unit ...,TSLA,"Tesla, Inc.",272
273,2022-09-26,dont misrepresent position tesla full selfdriv...,TSLA,"Tesla, Inc.",273
274,2022-09-26,sold porsche cayenne turbo bought tsla model p...,TSLA,"Tesla, Inc.",274
275,2022-09-26,investing like swimming sometimes tide sometim...,TSLA,"Tesla, Inc.",275
276,2022-09-26,trade spy spx qqq tsla 70 pages free moneymoti...,TSLA,"Tesla, Inc.",276
...,...,...,...,...,...
80788,2021-10-07,fastest growing tech stocks market mix fintech...,XPEV,XPeng Inc.,80788
80789,2021-10-04,earnings horizon quick snapshot largest increa...,XPEV,XPeng Inc.,80789
80790,2021-10-01,record delivery results testimony unwavering p...,XPEV,XPeng Inc.,80790
80791,2021-10-01,delivered 10412 smart evs sep 2021 reaching mi...,XPEV,XPeng Inc.,80791


In [9]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
elif USE_GPU and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [10]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn.functional as F

In [11]:
import numpy as np

In [12]:
def get_probs(model, texts, indices, num_classes):
    # test
    model.eval()

    all_probs = []
    all_indices = []

    # dont need grad calculation when in eval mode
    with torch.no_grad():
        for i, text in enumerate(texts):
            input_ids = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")['input_ids'].to(device)
            attention_mask = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")['attention_mask'].to(device)

            # compute logits
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # compute probs
            probs = F.softmax(logits, dim=1).cpu().tolist()
            all_probs.append(probs[0])

            # get indices
            all_indices.append(indices[i])

    return all_probs, all_indices

In [13]:
# all samples
texts = list(tweets_df['Tweet'])
indices = list(tweets_df['index'])

In [18]:
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

model = BertForSequenceClassification.from_pretrained("ProsusAI/finbert")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
all_probs, all_indices = get_probs(model, texts, indices, num_classes)

In [20]:
print(all_probs)
print(all_indices)

[[0.25087010860443115, 0.5323874950408936, 0.2167424112558365], [0.028666112571954727, 0.4729212820529938, 0.4984126687049866], [0.1641818881034851, 0.012600106187164783, 0.8232179880142212], [0.03557007387280464, 0.04949159175157547, 0.9149383902549744], [0.034378185868263245, 0.02129896730184555, 0.9443228840827942], [0.041038092225790024, 0.023469019681215286, 0.9354929327964783], [0.200310617685318, 0.014259003102779388, 0.7854304313659668], [0.03429893031716347, 0.03832057863473892, 0.9273804426193237], [0.06522220373153687, 0.01865687035024166, 0.9161208271980286], [0.03743717074394226, 0.022908639162778854, 0.9396542310714722], [0.14418843388557434, 0.018168583512306213, 0.837643027305603], [0.13341541588306427, 0.009738355875015259, 0.856846272945404], [0.04896577075123787, 0.5886403918266296, 0.36239388585090637], [0.03798256069421768, 0.0749419778585434, 0.8870754837989807], [0.04273148998618126, 0.03408564627170563, 0.9231828451156616], [0.07286927849054337, 0.15823709964752

In [21]:
tweets_df2 = tweets_df.copy()

In [22]:
for idx, probs in zip(all_indices, all_probs):
    for i in range(num_classes):
        tweets_df2.at[idx, f'probs_{i}'] = probs[i]

In [23]:
print(tweets_df2)

             Date                                              Tweet  \
272    2022-09-26  tsla since 2019 legacy auto made sinking unit ...   
273    2022-09-26  dont misrepresent position tesla full selfdriv...   
274    2022-09-26  sold porsche cayenne turbo bought tsla model p...   
275    2022-09-26  investing like swimming sometimes tide sometim...   
276    2022-09-26  trade spy spx qqq tsla 70 pages free moneymoti...   
...           ...                                                ...   
80788  2021-10-07  fastest growing tech stocks market mix fintech...   
80789  2021-10-04  earnings horizon quick snapshot largest increa...   
80790  2021-10-01  record delivery results testimony unwavering p...   
80791  2021-10-01  delivered 10412 smart evs sep 2021 reaching mi...   
80792  2021-09-30  xpeng p5 deliver outstanding performance extre...   

      Stock Name Company Name  index   probs_0   probs_1  
272         TSLA  Tesla, Inc.    272  0.250870  0.532387  
273         TSLA 

In [24]:
from google.colab import files

tweets_df2.to_csv('tweets_with_probs.csv', index=False)
files.download('tweets_with_probs.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>