In [1]:
# upload sentiment_preprocessing.py if using colab
from sentiment_preprocessing import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load datasets and parse dates

# convert date column from string to datetime
# convert stock time back to just a date to simplify closing price query
tweet_df1 = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_tweets.csv")
tweet_df1['Date'] = pd.to_datetime(tweet_df1['Date'], utc=True)

stocks_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_yfinance_data.csv")
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'], utc=True).dt.date

tweet_df2 = download_dataset_to_df("ryanchan911/selective-stock-headlines-sentiment", "Project6500.csv")
tweet_df2['datetime'] = pd.to_datetime(tweet_df2['datetime'], utc=True, format='mixed', dayfirst=False).dt.date

Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_tweets.csv...


100%|██████████| 6.44M/6.44M [00:00<00:00, 29.8MB/s]

Extracting zip of stock_tweets.csv...





Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_yfinance_data.csv...


100%|██████████| 696k/696k [00:00<00:00, 2.30MB/s]


Downloading from https://www.kaggle.com/api/v1/datasets/download/ryanchan911/selective-stock-headlines-sentiment?dataset_version_number=7&file_name=Project6500.csv...


100%|██████████| 0.98M/0.98M [00:00<00:00, 3.01MB/s]




In [3]:
# append column for the next day return
tweet_df1['return'] = tweet_df1.apply(lambda row: get_next_day_return(row['Date'], row['Stock Name'], stocks_df), axis=1)

# append a column for the sd of returns
# (very unoptimized but it only takes a few minutes to run so good enough)
tweet_df1['sd_of_returns'] = tweet_df1.apply(lambda row: get_sd_of_returns(row['Stock Name'], stocks_df), axis=1)

# clean the tweets
tweet_df1['Tweet'] = tweet_df1['Tweet'].apply(clean_text)
tweet_df1

Unnamed: 0,Date,Tweet,Stock Name,Company Name,return,sd_of_returns
0,2022-09-29 23:41:16+00:00,mainstream media done amazing job brainwashing...,TSLA,"Tesla, Inc.",,0.040606
1,2022-09-29 23:24:43+00:00,tesla delivery estimates around 364k analysts ...,TSLA,"Tesla, Inc.",,0.040606
2,2022-09-29 23:18:08+00:00,3 even include 630m unvested rsus 630 addition...,TSLA,"Tesla, Inc.",,0.040606
3,2022-09-29 22:40:07+00:00,hahaha still trying stop tesla fsd bro get shi...,TSLA,"Tesla, Inc.",,0.040606
4,2022-09-29 22:27:05+00:00,stop trying kill kids sad deranged old man,TSLA,"Tesla, Inc.",,0.040606
...,...,...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,fastest growing tech stocks market mix fintech...,XPEV,XPeng Inc.,-0.015682,0.053934
80789,2021-10-04 17:05:59+00:00,earnings horizon quick snapshot largest increa...,XPEV,XPeng Inc.,0.041337,0.053934
80790,2021-10-01 04:43:41+00:00,record delivery results testimony unwavering p...,XPEV,XPeng Inc.,,0.053934
80791,2021-10-01 00:03:32+00:00,delivered 10412 smart evs sep 2021 reaching mi...,XPEV,XPeng Inc.,,0.053934


In [4]:
# assign labels from [-1, 0, 1]
# Returns are normalized based on the standard deviation of returns for that specific ticker.
tweet_df1['label'] = tweet_df1.apply(lambda row: assign_labels(row['return'], row['sd_of_returns']), axis=1)
tweet_df1

Unnamed: 0,Date,Tweet,Stock Name,Company Name,return,sd_of_returns,label
0,2022-09-29 23:41:16+00:00,mainstream media done amazing job brainwashing...,TSLA,"Tesla, Inc.",,0.040606,0
1,2022-09-29 23:24:43+00:00,tesla delivery estimates around 364k analysts ...,TSLA,"Tesla, Inc.",,0.040606,0
2,2022-09-29 23:18:08+00:00,3 even include 630m unvested rsus 630 addition...,TSLA,"Tesla, Inc.",,0.040606,0
3,2022-09-29 22:40:07+00:00,hahaha still trying stop tesla fsd bro get shi...,TSLA,"Tesla, Inc.",,0.040606,0
4,2022-09-29 22:27:05+00:00,stop trying kill kids sad deranged old man,TSLA,"Tesla, Inc.",,0.040606,0
...,...,...,...,...,...,...,...
80788,2021-10-07 17:11:57+00:00,fastest growing tech stocks market mix fintech...,XPEV,XPeng Inc.,-0.015682,0.053934,0
80789,2021-10-04 17:05:59+00:00,earnings horizon quick snapshot largest increa...,XPEV,XPeng Inc.,0.041337,0.053934,1
80790,2021-10-01 04:43:41+00:00,record delivery results testimony unwavering p...,XPEV,XPeng Inc.,,0.053934,0
80791,2021-10-01 00:03:32+00:00,delivered 10412 smart evs sep 2021 reaching mi...,XPEV,XPeng Inc.,,0.053934,0


In [5]:
tweet_df1['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,45787
-1,17696
1,17310


In [6]:
# train/val | test split
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    list(tweet_df1['Tweet']), list(tweet_df1['label']), test_size=0.2, random_state=1
)

# train | val split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.2, random_state=1
)

# tokenize data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
print(train_encodings)
print("_______________________________________________________________________")
print(val_encodings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  3449,  2239,  ...,     0,     0,     0],
        [  101,  2002,  2015,  ...,     0,     0,     0],
        [  101,  1050,  6199,  ...,     0,     0,     0],
        ...,
        [  101,  2572,  2480,  ...,     0,     0,     0],
        [  101,  5291, 22649,  ...,     0,     0,     0],
        [  101,  2092, 24529,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
_______________________________________________________________________
{'input_ids': tensor([[  101, 24529, 12458,  ...,     0,     0,     0],
     