In [1]:
# upload sentiment_preprocessing.py if using colab
from sentiment_preprocessing import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load datasets and parse dates
# set cutoff date for end of stocks dataset
cutoff_date = pd.to_datetime("2022-09-28 21:00:00", utc=True)

# convert date column from string to datetime
# convert stock time back to just a date to simplify closing price query
tweet_df1 = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_tweets.csv")
tweet_df1['Date'] = pd.to_datetime(tweet_df1['Date'], utc=True)
tweet_df1 = tweet_df1[tweet_df1['Date'] < cutoff_date]

stocks_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_yfinance_data.csv")
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'], utc=True).dt.date

tweet_df2 = download_dataset_to_df("ryanchan911/selective-stock-headlines-sentiment", "Project6500.csv")
tweet_df2['datetime'] = pd.to_datetime(tweet_df2['datetime'], utc=True, format='mixed', dayfirst=False).dt.date

Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_tweets.csv...


100%|██████████| 6.44M/6.44M [00:00<00:00, 105MB/s]

Extracting zip of stock_tweets.csv...





Downloading from https://www.kaggle.com/api/v1/datasets/download/equinxx/stock-tweets-for-sentiment-analysis-and-prediction?dataset_version_number=1&file_name=stock_yfinance_data.csv...


100%|██████████| 696k/696k [00:00<00:00, 83.3MB/s]


Downloading from https://www.kaggle.com/api/v1/datasets/download/ryanchan911/selective-stock-headlines-sentiment?dataset_version_number=7&file_name=Project6500.csv...


100%|██████████| 0.98M/0.98M [00:00<00:00, 87.2MB/s]


In [4]:
# append column for the next day return
tweet_df1['return'] = tweet_df1.apply(lambda row: get_next_day_return(row['Date'], row['Stock Name'], stocks_df), axis=1)

# append a column for the sd of returns
# (very unoptimized but it only takes a few minutes to run so good enough)
tweet_df1['sd_of_returns'] = tweet_df1.apply(lambda row: get_sd_of_returns(row['Stock Name'], stocks_df), axis=1)

# clean the tweets
tweet_df1['Tweet'] = tweet_df1['Tweet'].apply(clean_text)
tweet_df1

Unnamed: 0,Date,Tweet,Stock Name,Company Name,return,sd_of_returns
120,2022-09-28 20:59:45+00:00,news truist securities analyst jordan levy ass...,TSLA,"Tesla, Inc.",-0.068101,0.040606
121,2022-09-28 20:47:35+00:00,know nothing airbnb cofounder except today joi...,TSLA,"Tesla, Inc.",-0.068101,0.040606
122,2022-09-28 20:45:04+00:00,editing new tsla video,TSLA,"Tesla, Inc.",-0.068101,0.040606
123,2022-09-28 20:27:05+00:00,breaking announced joe gebbia cofounder airbnb...,TSLA,"Tesla, Inc.",-0.068101,0.040606
124,2022-09-28 20:23:43+00:00,joe gebbia joined tsla board,TSLA,"Tesla, Inc.",-0.068101,0.040606
...,...,...,...,...,...,...
1115,2022-09-14 23:03:22+00:00,knows next stage tsla,TSLA,"Tesla, Inc.",-0.001317,0.040606
1116,2022-09-14 22:56:43+00:00,wrote 4tweet piece super followers sellside wo...,TSLA,"Tesla, Inc.",-0.001317,0.040606
1117,2022-09-14 22:28:53+00:00,ev stocks fire qtr rivn 55 tsla 35 f 32 vs qqq...,TSLA,"Tesla, Inc.",-0.001317,0.040606
1118,2022-09-14 22:09:47+00:00,manufacturing cars australia robyn denholm cha...,TSLA,"Tesla, Inc.",-0.001317,0.040606


In [5]:
# assign labels from [-1, 0, 1]
# Returns are normalized based on the standard deviation of returns for that specific ticker.
tweet_df1['label'] = tweet_df1.apply(lambda row: assign_labels(row['return'], row['sd_of_returns']), axis=1)
tweet_df1

Unnamed: 0,Date,Tweet,Stock Name,Company Name,return,sd_of_returns,label
120,2022-09-28 20:59:45+00:00,news truist securities analyst jordan levy ass...,TSLA,"Tesla, Inc.",-0.068101,0.040606,-1
121,2022-09-28 20:47:35+00:00,know nothing airbnb cofounder except today joi...,TSLA,"Tesla, Inc.",-0.068101,0.040606,-1
122,2022-09-28 20:45:04+00:00,editing new tsla video,TSLA,"Tesla, Inc.",-0.068101,0.040606,-1
123,2022-09-28 20:27:05+00:00,breaking announced joe gebbia cofounder airbnb...,TSLA,"Tesla, Inc.",-0.068101,0.040606,-1
124,2022-09-28 20:23:43+00:00,joe gebbia joined tsla board,TSLA,"Tesla, Inc.",-0.068101,0.040606,-1
...,...,...,...,...,...,...,...
1115,2022-09-14 23:03:22+00:00,knows next stage tsla,TSLA,"Tesla, Inc.",-0.001317,0.040606,0
1116,2022-09-14 22:56:43+00:00,wrote 4tweet piece super followers sellside wo...,TSLA,"Tesla, Inc.",-0.001317,0.040606,0
1117,2022-09-14 22:28:53+00:00,ev stocks fire qtr rivn 55 tsla 35 f 32 vs qqq...,TSLA,"Tesla, Inc.",-0.001317,0.040606,0
1118,2022-09-14 22:09:47+00:00,manufacturing cars australia robyn denholm cha...,TSLA,"Tesla, Inc.",-0.001317,0.040606,0


In [6]:
# shift labels from [-1, 0, 1] to [0, 1, 2]
tweet_df1['label'] += 1

In [7]:
tweet_df1['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,370
2,332
0,298


In [8]:
# train/val | test split
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    list(tweet_df1['Tweet']), list(tweet_df1['label']), test_size=0.2, random_state=1
)

# train | val split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.2, random_state=1
)

# tokenize data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=512)

print(train_encodings)
print(val_encodings)
print(test_encodings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': [[101, 19354, 4002, 2575, 2028, 2034, 2944, 3765, 2412, 2081, 3728, 7282, 7756, 4156, 2484, 2243, 2585, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 6205, 25746, 2475, 7047, 6202, 3422, 9863, 5490, 6163, 2575, 23352, 8318, 6352, 2629, 18804, 14168, 2278, 14181, 5018, 2581, 24529, 2721, 24841, 24594, 2692, 2361, 8318, 24622, 2509, 2502, 12882, 2015, 5799, 5285, 10450, 18605, 9033, 18915, 4867, 2159, 5841, 3319, 4834, 2951, 3081, 5866, 17967, 2566, 16012, 7047, 6494, 4667, 15768, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 24529, 2721, 24529, 2721, 16760, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 4937, 4048, 2063, 3536, 3331, 27166

In [9]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]

        item["labels"] = torch.tensor(self.labels[idx])

        return item

In [10]:
train_inputs = {}
for key, val in train_encodings.items():
    train_inputs[key] = torch.tensor(val)
train_dataset = CustomDataset(train_inputs, train_labels)

val_inputs = {}
for key, val in val_encodings.items():
    val_inputs[key] = torch.tensor(val)
val_dataset = CustomDataset(val_inputs, val_labels)

test_inputs = {}
for key, val in test_encodings.items():
    test_inputs[key] = torch.tensor(val)
test_dataset = CustomDataset(test_inputs, test_labels)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [12]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
elif USE_GPU and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [13]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput

class BertWithFC(nn.Module):
    def __init__(self, num_labels=3, hidden_size=768, fc_hidden_dim=256, dropout_rate=0.3):
        super(BertWithFC, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc1 = nn.Linear(hidden_size, fc_hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(fc_hidden_dim, num_labels)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # token representation

        x = self.fc1(cls_output)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        # match format of BertForSequenceClassification
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [14]:
# 3 heads for 3 sentiment labels [0, 1, 2]
model = BertWithFC(num_labels=3)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertWithFC(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [15]:
learning_rate = 1e-5
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [16]:
def train(model, train_dataloader, val_dataloader, epochs, optimizer):
    for epoch in range(epochs):
        # train
        model.train()
        train_loss = 0

        for data in train_dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            # reset gradients
            optimizer.zero_grad()

            # compute loss with logits
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # update loss and optimizer
            loss.backward()
            train_loss += loss.item()
            optimizer.step()

        print(f'Epoch {epoch + 1}, Training Loss: {train_loss}')

        # validation
        model.eval()

        val_loss = 0
        correct = 0
        total = 0

        # dont need grad calculation when in eval mode
        with torch.no_grad():
            for data in val_dataloader:
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                labels = data['labels'].to(device)

                # compute loss
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

                # select class with highest logit as our prediction
                max_logit, pred = torch.max(outputs.logits, 1)

                # correct += sum of samples with pred == label
                correct += torch.sum(pred == labels).item()

                # total += batch size
                total += labels.size(0)

        val_accuracy = correct / total * 100

        print(f'Epoch {epoch + 1}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    return val_accuracy

In [None]:
train(model, train_dataloader, val_dataloader, epochs=8, optimizer=optimizer)

Epoch 1, Training Loss: 867.6475857496262
Epoch 1, Validation Loss: 218.70258593559265, Validation Accuracy: 39.109179793590435
Epoch 2, Training Loss: 841.7898364067078
Epoch 2, Validation Loss: 222.52385872602463, Validation Accuracy: 40.35074105687902
Epoch 3, Training Loss: 791.8756531476974
Epoch 3, Validation Loss: 230.26153135299683, Validation Accuracy: 41.087918056956624
Epoch 4, Training Loss: 721.7405557632446
Epoch 4, Validation Loss: 238.99643951654434, Validation Accuracy: 39.94723364631024
Epoch 5, Training Loss: 648.0061279535294
Epoch 5, Validation Loss: 257.59280574321747, Validation Accuracy: 41.18103515170326
Epoch 6, Training Loss: 569.2658978700638
Epoch 6, Validation Loss: 288.3735664486885, Validation Accuracy: 41.204314425389924
Epoch 7, Training Loss: 503.1385385990143
Epoch 7, Validation Loss: 304.6983035802841, Validation Accuracy: 42.050128036005276
Epoch 8, Training Loss: 441.8937372267246
Epoch 8, Validation Loss: 326.5218335390091, Validation Accuracy: 4

In [None]:
def test(model, test_dataloader):
    # test
    model.eval()

    test_loss = 0
    correct = 0
    total = 0

    # dont need grad calculation when in eval mode
    with torch.no_grad():
        for data in test_dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            # compute loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            test_loss += loss.item()

            # select class with highest logit as our prediction
            max_logit, pred = torch.max(outputs.logits, 1)

            # correct += sum of samples with pred == label
            correct += torch.sum(pred == labels).item()

            # total += batch size
            total += labels.size(0)

    print(f'Test Loss: {test_loss}, Test Accuracy: {correct / total * 100}')

#test(model, test_dataloader)

In [None]:
test(model, test_dataloader)

Test Loss: 404.362056016922, Test Accuracy: 42.34899745483891


In [None]:
# save and download model
from google.colab import files
torch.save(model.state_dict(), f"bert_with_fc_model.pth")
files.download("bert_with_fc_model.pth")

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [None]:
# reload model from saved
saved_model = BertWithFC(num_labels=3)  # Make sure hyperparameters match
saved_model.load_state_dict(torch.load("bert_with_fc_model.pth"))
saved_model.to(device)

  saved_model.load_state_dict(torch.load("bert_with_fc_model.pth"))


Model and tokenizer loaded successfully!


In [None]:
test(saved_model, test_dataloader)

Test Loss: 404.33094322681427, Test Accuracy: 42.34899745483891


In [17]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [18]:
import optuna
from transformers import BertForSequenceClassification, BertTokenizer, AdamW

# define obj function for optuna
def objective(trial):
    # tune hyperparameters
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 5e-5)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    num_epochs = trial.suggest_int("num_epochs", 2, 5)

    model = BertWithFC(num_labels=3)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    return train(model, train_dataloader, val_dataloader, num_epochs, optimizer)

# run hyperparameter tuning
study = optuna.create_study(direction="maximize")  # maximize accuracy
study.optimize(objective, n_trials=10)

# Print best hyperparameters
print("Best Hyperparameters:", study.best_params)

[I 2025-03-04 02:12:31,551] A new study created in memory with name: no-name-4067a24e-da89-4591-8159-230852b8b2fc
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 5e-5)


Epoch 1, Training Loss: 10.997800469398499


[W 2025-03-04 02:20:01,428] Trial 0 failed with parameters: {'learning_rate': 6.102534154195938e-06, 'batch_size': 32, 'num_epochs': 3} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-18-85e39b73f305>", line 15, in objective
    return train(model, train_dataloader, val_dataloader, num_epochs, optimizer)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-16-941b9e1730ff>", line 41, in train
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^

KeyboardInterrupt: 

In [None]:
# Train final model using best hyperparameters
best_params = study.best_params
print(f"Training final model with best params: {best_params}")

# Final Model
final_model = BertWithFC(num_labels=3)
final_model.to(device)
final_optimizer = AdamW(final_model.parameters(), lr=best_params["learning_rate"])

train(final_model, train_dataloader, val_dataloader, best_params["num_epochs"], final_optimizer, device)

In [None]:
test(final_model, test_dataloader)

In [None]:
# save and download model
from google.colab import files
torch.save(final_model.state_dict(), f"bert_with_fc_tuned_model.pth")
files.download("bert_with_fc_tuned_model.pth")