In [130]:
# upload sentiment_preprocessing.py if using colab
from sentiment_preprocessing import *

In [131]:
future_days = 10
num_classes = 2
threshold = 0

In [132]:
# cutoff data with no future date
cutoff_date = (pd.to_datetime("2022-09-28", utc=True) - pd.Timedelta(days=future_days)).date()

# convert date column from string to datetime
# convert times to just a date to simplify closing price query
tweets_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_tweets.csv")
tweets_df['Date'] = pd.to_datetime(tweets_df['Date'], utc=True).dt.date
tweets_df = tweets_df[tweets_df['Date'] < cutoff_date]

stocks_df = download_dataset_to_df("equinxx/stock-tweets-for-sentiment-analysis-and-prediction", "stock_yfinance_data.csv")
stocks_df['Date'] = pd.to_datetime(stocks_df['Date'], utc=True).dt.date

In [133]:
# only look at TSLA
ticker = 'TSLA'
tweets_df = tweets_df[tweets_df['Stock Name'] == ticker]
stocks_df = stocks_df[stocks_df['Stock Name'] == ticker]

In [134]:
# concat all text by date
tweets_df = tweets_df.groupby(['Date', 'Stock Name'], as_index=False).agg({'Tweet': ' '.join})

In [135]:
print(tweets_df.head())

         Date Stock Name                                              Tweet
0  2021-09-30       TSLA  #LottoFriday Watchlist: short &amp; sweet\n\n$...
1  2021-10-01       TSLA  My Neighbors took delivery, 9/29!  He stopped ...
2  2021-10-02       TSLA  One week from today the top 1,000 button pushe...
3  2021-10-03       TSLA  Bitches , being a $tsla stonk holder in 2013 w...
4  2021-10-04       TSLA  $TSLA / $NDX\n\nIt's probably not nothing. htt...


In [136]:
print(tweets_df.iloc[0]['Tweet'])

#LottoFriday Watchlist: short &amp; sweet

$AMD over 104.5, 105c
$ROKU over 317, 320c
$NVDA below 207, 205p
$TSLA below 774, 770p

I will be guiding my entries &amp; exits in real time via voice chat at @GAAoptionsVIP 👊

Good luck, traders! CORRECTION UPDATE

UPDATE on Q3 Delivery Estimates:

* FactSet - 204k
* Wall Street - 221k
* Gary - 226k
* Troy - 235k (today upgrade)
* Twitter Bulls - 230k/240k
* Wilson FrunkPuppy -240k
* Goldie - 246k
* Rob/Tesla Daily - 247k
* Giddy - 250k
* Umbisam-   &gt;250k
$TSLA https://t.co/BmrwYUJeQB FREE #OPTIONS Ideas 🤯

Scale out when above 25% Profit

$TSLA   790C &gt;788.87 | 775P &lt;775.09
$NIO     36C &gt;35.96 | 35P &lt;35.32
$MU      73C &gt;72.56 | 70P &lt;70.86
$SNAP  76C &gt;75.21 | 72P &lt;72.26

199 ❤️ for MORE SECRET Bonus Picks

$TSLA VERY RARE Double IB last seen in 2020 https://t.co/UU6fvXOrw3 California DMV today issued autonomous vehicle permits to Cruise and Waymo to charge the public for autonomous rides in select Bay Area cities. 

In [137]:
def get_future_return(tweet_date, stock, stocks_df, future_days=1):
    # filter stock data by stock then sort by date
    stock_data = stocks_df[stocks_df['Stock Name'] == stock]
    stock_data = stock_data.sort_values(by = 'Date')

    # get closing prices for tweet date or closest day after
    stock_day_data = stock_data.loc[stock_data['Date'] >= tweet_date]
    # if we don't have data for that stock and date, return none
    if stock_day_data.empty:
        return None
    stock_day_close = stock_day_data.iloc[0]['Adj Close']
    stock_day = stock_day_data.iloc[0]['Date']

    # set next day to day after stock date
    next_day = stock_day + pd.Timedelta(days=future_days)

    # get closing prices for next day or closest day after
    next_day_data = stock_data.loc[stock_data['Date'] >= next_day]
    # if we don't have data for that stock and date, return none
    if next_day_data.empty:
        return None
    next_day_close = next_day_data.iloc[0]['Adj Close']

    # compute return
    return (next_day_close - stock_day_close) / stock_day_close

In [138]:
# append column for future return
tweets_df['return'] = tweets_df.apply(lambda row: get_future_return(row['Date'], row['Stock Name'], stocks_df, future_days), axis=1)

# clean the tweets
tweets_df['Tweet'] = tweets_df['Tweet'].apply(clean_text)

In [139]:
# add original index column for future df manipulation
tweets_df['index'] = tweets_df.index

In [140]:
tweets_df

Unnamed: 0,Date,Stock Name,Tweet,return,index
0,2021-09-30,TSLA,lottofriday watchlist short amp sweet amd 1045...,0.021226,0
1,2021-10-01,TSLA,neighbors took delivery 929 stopped could give...,0.021568,1
2,2021-10-02,TSLA,one week today top 1000 button pushers trying ...,0.047074,2
3,2021-10-03,TSLA,bitches tsla stonk holder 2013 hard waking mid...,0.047074,3
4,2021-10-04,TSLA,tsla ndx probably nothing amazing story earl t...,0.047074,4
...,...,...,...,...,...
348,2022-09-13,TSLA,40 trailers west side giga texas todays video ...,-0.057509,348
349,2022-09-14,TSLA,tesla tsla cfo zach kirkhorn filed sale 3750 s...,-0.087902,349
350,2022-09-15,TSLA,tsla prediction q322 r n g e q u r e r e v e r...,-0.091325,350
351,2022-09-16,TSLA,one thinks super cool doesnt even usd prices w...,-0.090127,351


In [141]:
def assign_labels(future_return, threshold=0.01):
    """
    Assign a label from [-1, 0, 1] based on the next day return. Returns are normalized based on the standard deviation
    of returns for that specific ticker, such that a return that is within (threshold) standard deviations of 0 will be
    labeled 0.
    """
    if threshold == 0:
        if (future_return > 0):
            return 1
        elif (future_return < 0):
            return 0
    else:
      if future_return >= threshold:
          return 2
      elif future_return <= -threshold:
          return 0
      else:
        return 1

In [142]:
# assign labels
tweets_df['label'] = tweets_df.apply(lambda row: assign_labels(row['return'], threshold=threshold), axis=1)

In [143]:
tweets_df

Unnamed: 0,Date,Stock Name,Tweet,return,index,label
0,2021-09-30,TSLA,lottofriday watchlist short amp sweet amd 1045...,0.021226,0,1
1,2021-10-01,TSLA,neighbors took delivery 929 stopped could give...,0.021568,1,1
2,2021-10-02,TSLA,one week today top 1000 button pushers trying ...,0.047074,2,1
3,2021-10-03,TSLA,bitches tsla stonk holder 2013 hard waking mid...,0.047074,3,1
4,2021-10-04,TSLA,tsla ndx probably nothing amazing story earl t...,0.047074,4,1
...,...,...,...,...,...,...
348,2022-09-13,TSLA,40 trailers west side giga texas todays video ...,-0.057509,348,0
349,2022-09-14,TSLA,tesla tsla cfo zach kirkhorn filed sale 3750 s...,-0.087902,349,0
350,2022-09-15,TSLA,tsla prediction q322 r n g e q u r e r e v e r...,-0.091325,350,0
351,2022-09-16,TSLA,one thinks super cool doesnt even usd prices w...,-0.090127,351,0


In [144]:
tweets_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,194
1,159


In [145]:
# train | test split
train_val_texts, test_texts, train_val_labels, test_labels, train_val_indices, test_indices = train_test_split(
    list(tweets_df['Tweet']), list(tweets_df['label']), list(tweets_df['index']), test_size=0.2, random_state=1
)

# train | val split
train_texts, val_texts, train_labels, val_labels, train_indices, val_indices = train_test_split(
    train_val_texts, train_val_labels, train_val_indices, test_size=0.2, random_state=1
)

In [146]:
# tokenize data
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

train_encodings = tokenizer(train_texts, padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val_texts, padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_texts, padding=True, truncation=True, max_length=512)

print(train_encodings)
print(val_encodings)
print(test_encodings)

{'input_ids': [[101, 24529, 2721, 4566, 4278, 8667, 3959, 2265, 2987, 2102, 2203, 2871, 2753, 2149, 12012, 3863, 3222, 11538, 3251, 3522, 4518, 4341, 26060, 2708, 3237, 3449, 2239, 14163, 6711, 2567, 5035, 10264, 14163, 6711, 14424, 25297, 6202, 3513, 2813, 2395, 3485, 2988, 24529, 2721, 2116, 4191, 4074, 20228, 16344, 3825, 7016, 4149, 2751, 3857, 15742, 12677, 16150, 4710, 2903, 3056, 6698, 2109, 14412, 4630, 4313, 2156, 2112, 6230, 2651, 2812, 3875, 24529, 2721, 2194, 3825, 7016, 2667, 7532, 14981, 2115, 2063, 4011, 2811, 13428, 15749, 2951, 2741, 2067, 26060, 2440, 4518, 6089, 23776, 12155, 2860, 7858, 2184, 2781, 8694, 10047, 2183, 3331, 7167, 2367, 6695, 26060, 24529, 2721, 2092, 2058, 20041, 11538, 3522, 4518, 4341, 5766, 2567, 1038, 20051, 4630, 25297, 10216, 15653, 14279, 4518, 3006, 2381, 2813, 2395, 3485, 4311, 24529, 2721, 24529, 2721, 4160, 20228, 4805, 2509, 2243, 2123, 2102, 2130, 2113, 4088, 4863, 2651, 6314, 5294, 4132, 3314, 2330, 2481, 2102, 2130, 4607, 4449, 2335, 4

In [147]:
token_lengths = []
for tweet in tweets_df["Tweet"]:
    tokens = tokenizer(tweet)["input_ids"]
    token_lengths.append(len(tokens))

print(f"max token length: {max(token_lengths)}")

Token indices sequence length is longer than the specified maximum sequence length for this model (2108 > 512). Running this sequence through the model will result in indexing errors


max token length: 7810


In [148]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, indices, labels):
        self.encodings = encodings
        self.indices = indices
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx]

        item["indices"] = torch.tensor(self.indices[idx])
        item["labels"] = torch.tensor(self.labels[idx])

        return item

In [149]:
train_inputs = {}
for key, val in train_encodings.items():
    train_inputs[key] = torch.tensor(val)
train_dataset = CustomDataset(train_inputs, train_indices, train_labels)

val_inputs = {}
for key, val in val_encodings.items():
    val_inputs[key] = torch.tensor(val)
val_dataset = CustomDataset(val_inputs, val_indices, val_labels)

test_inputs = {}
for key, val in test_encodings.items():
    test_inputs[key] = torch.tensor(val)
test_dataset = CustomDataset(test_inputs, test_indices, test_labels)

In [150]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True)

In [151]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
elif USE_GPU and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [152]:
import torch
import torch.nn as nn
from transformers import BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn.functional as F

class BertWithFC(nn.Module):
    def __init__(self, num_labels=3, hidden_size=768, fc_hidden_dim=256, dropout_rate=0.3):
        super(BertWithFC, self).__init__()

        self.bert = BertModel.from_pretrained("ProsusAI/finbert")

        self.fc1 = nn.Linear(hidden_size, fc_hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(fc_hidden_dim, num_labels)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # token representation

        x = self.fc1(cls_output)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        # match format of BertForSequenceClassification
        return SequenceClassifierOutput(loss=loss, logits=logits)

In [153]:
# 3 heads for 3 sentiment labels [0, 1, 2]
model = BertWithFC(num_labels=num_classes)
model.to(device)

BertWithFC(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

In [154]:
learning_rate = 1e-5
weight_decay = 1e-2
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

epochs = 8
T_max = len(train_dataloader) * epochs
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_max)

In [155]:
def train(model, train_dataloader, val_dataloader, epochs, optimizer, scheduler):
    for epoch in range(epochs):
        # train
        model.train()
        train_loss = 0

        for data in train_dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            # reset gradients
            optimizer.zero_grad()

            # compute loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # update loss, optimizer, and scheduler
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
            scheduler.step()

        print(f'Epoch {epoch + 1}, Training Loss: {train_loss}')

        # validation
        model.eval()

        val_loss = 0
        correct = 0
        total = 0

        # dont need grad calculation when in eval mode
        with torch.no_grad():
            for data in val_dataloader:
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                labels = data['labels'].to(device)

                # compute loss
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

                val_loss += loss.item()

                # select class with highest logit as our prediction
                max_logit, pred = torch.max(outputs.logits, 1)

                # correct += sum of samples with pred == label
                correct += torch.sum(pred == labels).item()

                # total += batch size
                total += labels.size(0)

        val_accuracy = correct / total * 100

        print(f'Epoch {epoch + 1}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')
    return val_accuracy

In [156]:
train(model, train_dataloader, val_dataloader, epochs=3, optimizer=optimizer, scheduler=scheduler)

Epoch 1, Training Loss: 20.142666935920715
Epoch 1, Validation Loss: 5.445497751235962, Validation Accuracy: 56.14035087719298
Epoch 2, Training Loss: 20.187946498394012
Epoch 2, Validation Loss: 5.464637100696564, Validation Accuracy: 56.14035087719298
Epoch 3, Training Loss: 20.238917768001556
Epoch 3, Validation Loss: 5.555096387863159, Validation Accuracy: 57.89473684210527


57.89473684210527

In [157]:
def test(model, test_dataloader):
    # test
    model.eval()

    test_loss = 0
    correct = 0
    total = 0

    # dont need grad calculation when in eval mode
    with torch.no_grad():
        for data in test_dataloader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            # compute loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            test_loss += loss.item()

            # select class with highest logit as our prediction
            max_logit, pred = torch.max(outputs.logits, 1)

            # correct += sum of samples with pred == label
            correct += torch.sum(pred == labels).item()

            # total += batch size
            total += labels.size(0)

    print(f'Test Loss: {test_loss}, Test Accuracy: {correct / total * 100}')

In [158]:
test(model, test_dataloader)

Test Loss: 6.161779403686523, Test Accuracy: 56.33802816901409


In [159]:
import numpy as np

In [160]:
def get_probs(model, texts, indices, num_classes):
    # test
    model.eval()

    all_probs = []
    all_indices = []

    # dont need grad calculation when in eval mode
    with torch.no_grad():
        for i, text in enumerate(texts):
            input_ids = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")['input_ids'].to(device)
            attention_mask = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")['attention_mask'].to(device)

            # compute logits
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            # compute probs
            probs = F.softmax(logits, dim=1).cpu().tolist()
            all_probs.append(probs[0])

            # get indices
            all_indices.append(indices[i])

    return all_probs, all_indices

In [161]:
# all samples
texts = list(tweets_df['Tweet'])
indices = list(tweets_df['index'])

In [162]:
all_probs, all_indices = get_probs(model, texts, indices, num_classes)

In [163]:
print(all_probs)
print(all_indices)

[[0.538622260093689, 0.46137773990631104], [0.5476044416427612, 0.45239558815956116], [0.5394543409347534, 0.46054568886756897], [0.5366625189781189, 0.4633375406265259], [0.5504727363586426, 0.4495272934436798], [0.5490614175796509, 0.4509385824203491], [0.5500780940055847, 0.44992193579673767], [0.5446404814720154, 0.455359548330307], [0.549224853515625, 0.4507751166820526], [0.5447338223457336, 0.45526617765426636], [0.5472422242164612, 0.4527578353881836], [0.5536251068115234, 0.44637489318847656], [0.5481562614440918, 0.4518437385559082], [0.5466299057006836, 0.45337003469467163], [0.5444440841674805, 0.45555588603019714], [0.5472993850708008, 0.4527006149291992], [0.5413299202919006, 0.45867007970809937], [0.5416334867477417, 0.4583665132522583], [0.5439808964729309, 0.4560191035270691], [0.5448591113090515, 0.4551409184932709], [0.5408047437667847, 0.45919525623321533], [0.543517529964447, 0.4564824402332306], [0.5500335097312927, 0.44996654987335205], [0.5434265732765198, 0.456

In [164]:
tweets_df2 = tweets_df.copy()

In [165]:
for idx, probs in zip(all_indices, all_probs):
    for i in range(num_classes):
        tweets_df2.at[idx, f'probs_{i}'] = probs[i]

In [166]:
print(tweets_df2)

           Date Stock Name                                              Tweet  \
0    2021-09-30       TSLA  lottofriday watchlist short amp sweet amd 1045...   
1    2021-10-01       TSLA  neighbors took delivery 929 stopped could give...   
2    2021-10-02       TSLA  one week today top 1000 button pushers trying ...   
3    2021-10-03       TSLA  bitches tsla stonk holder 2013 hard waking mid...   
4    2021-10-04       TSLA  tsla ndx probably nothing amazing story earl t...   
..          ...        ...                                                ...   
348  2022-09-13       TSLA  40 trailers west side giga texas todays video ...   
349  2022-09-14       TSLA  tesla tsla cfo zach kirkhorn filed sale 3750 s...   
350  2022-09-15       TSLA  tsla prediction q322 r n g e q u r e r e v e r...   
351  2022-09-16       TSLA  one thinks super cool doesnt even usd prices w...   
352  2022-09-17       TSLA  fsdbeta ready robotaxis soon tsla incentive me...   

       return  index  label

In [168]:
from google.colab import files

tweets_df2.to_csv('tweets_with_probs.csv', index=False)
files.download('tweets_with_probs.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>