In [None]:
# Install the Hugging Face `transformers` library and the `emoji` package into
# the notebook runtime. No versions are pinned; the run recorded below
# resolved transformers 4.26.1.
!pip install transformers
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import emoji
import re
import nltk
nltk.download('punkt')
import random
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize
from transformers import RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from transformers import get_linear_schedule_with_warmup, AutoModelForSequenceClassification
from torch.utils.data import DataLoader,SequentialSampler,RandomSampler,TensorDataset,random_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#### install dependencies, import libraries, and mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### read dataset and replace text labels with binary labels

In [None]:
df = pd.read_csv('/content/drive/MyDrive/balanced_untokenized_cleaned_stocktwits.csv')
df

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,going right support even superior growth trade,0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,nobody gonna buy expensive ass iphones aint go...,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,robinhood peeps gonna severely disappointed tu...,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,always dump dump dump,0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,turd going anywhere pathetic,0,$AAPL why is this turd not going anywhere. Thi...
...,...,...,...,...
1942436,2021-05-11T12:36:16Z,january calls look really good totally buying ...,1,$TSLA $NIO $LI $XPEV \n\nJanuary 2023 calls lo...
1212986,2021-03-25T22:44:46Z,joe ohm raised price target,1,$TSLA joe ohm raised price target from $800 to...
1900463,2020-01-22T19:31:24Z,like people discovered electric cars first tim...,1,$TSLA It’s like people have just discovered el...
1213039,2021-03-25T21:05:38Z,yes baby yes,1,$TSLA Yes baby yes


In [None]:
# Replace the `body` values loaded from the CSV with the unprocessed
# `raw_content` text, so the preprocessing applied to `body` later in the
# notebook starts from the raw message.
df['body'] = df['raw_content']

#### preprocessing

In [None]:
import emoji
import re

def preprocess(texts):
  """Normalize one raw StockTwits message before tokenization.

  Steps, in order: lowercase; drop URLs; decode the ``&#39;`` HTML entity to
  an apostrophe; rewrite ``#tag``, ``$ticker`` and ``@user`` markers as
  ``hashtag_tag``, ``cashtag_ticker`` and ``mention_user``; replace emoji
  with their textual names.

  Args:
    texts: A single message as a ``str`` (despite the plural name).

  Returns:
    The cleaned message with leading/trailing whitespace removed.
  """
  # Lowercase to collapse casing variants. RoBERTa's tokenizer is
  # case-sensitive, so this is a normalization choice, not a requirement.
  texts = texts.lower()
  # Remove URLs. The dot after "www" is escaped so ordinary words that merely
  # begin with "www" (e.g. "wwwow") are no longer deleted.
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www\.\S+', "", texts)
  # Decode the HTML apostrophe entity. This must run before the hashtag
  # rewrite, otherwise "#39;" would be treated as a hashtag.
  texts = texts.replace('&#39;', "'")
  # Keep hashtags and cashtags, but mark them with an explicit word prefix.
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # Same treatment for @mentions.
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # Spell out emoji: no leading delimiter, a single space as trailing delimiter.
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()
    

In [None]:
df['body'] = df.body.apply(preprocess)
df.head()

Unnamed: 0,created_at,body,sentiment,raw_content
14,2020-12-15T14:38:18Z,going right through support as if it isnt eve...,0,$MSFT Going right through 214 support as if it...
49,2020-12-15T14:23:04Z,nobody gonna buy expensive ass iphones when th...,0,$AAPL nobody gonna buy expensive ass iPhones w...
61,2020-12-15T14:12:10Z,robinhood peeps gonna be severely disappointed...,0,$AAPL Robinhood peeps gonna be severely disapp...
103,2020-12-15T13:33:52Z,always dump dump dump.,0,$AAPL always dump dump dump.
106,2020-12-15T13:30:10Z,why is this turd not going anywhere. this is p...,0,$AAPL why is this turd not going anywhere. Thi...


In [None]:
labels = df.sentiment.values
text = df.body.values

In [None]:
# X = pd.read_csv('/content/drive/MyDrive/padded_X.csv').values
# y = pd.read_csv('/content/drive/MyDrive/padded_y.csv').values

#### check the token-count distribution across all text entries

In [None]:
labels

array([0, 0, 0, ..., 1, 1, 1])

In [None]:
def helper(text):
  """Return the number of NLTK word tokens in one message."""
  return len(word_tokenize(text))

length = df.body.apply(helper)
_total_entries = len(length)

def _pct_at_most(limit):
  """Percentage of messages whose word-token count is at most ``limit``."""
  if _total_entries == 0:
    return 0.0
  # Divide by the real number of rows; the previous hard-coded divisor
  # (32000) only yields a percentage for a 3.2M-row frame.
  return 100 * (length <= limit).sum() / _total_entries

# NOTE: these are NLTK word counts, not the subword counts produced by the
# RoBERTa tokenizer used for encoding, so they are only a rough guide when
# choosing the padding length.
print(f'there are {_total_entries} entries in the dataframe')
print(f'{_pct_at_most(32)}% of them have a length smaller than 32 tokens')
print(f'{_pct_at_most(64)}% of them have a length smaller than 64 tokens')
print(f'{_pct_at_most(128)}% of them have a length smaller than 128 tokens')
print(f'So we just use 64 as RoBERTa tokenizer max padding length')

there are 862570 entries in the dataframe
99.63990625 of them have a length smaller than 32 tokens
99.99984375% of them have a length smaller than 64 tokens
100.0% of them have a length smaller than 128 tokens
So we just use 64 as RoBERTa tokenizer max padding length


#### load tokenizer and encode the text data

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
len(tokenizer)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

50265

In [None]:
# Encode every message to exactly 64 token ids (truncating or padding as
# needed). Each call yields one row of ids and one row of attention mask.
encoded_rows = [
    tokenizer.encode_plus(
        message,
        add_special_tokens=True,
        truncation=True,
        max_length=64,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt')
    for message in text
]

# Stack the per-message rows into single tensors along dimension 0.
input_ids = torch.cat([row['input_ids'] for row in encoded_rows], dim=0)
attention_mask = torch.cat([row['attention_mask'] for row in encoded_rows], dim=0)
labels = torch.tensor(labels)

In [None]:
attention_mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [None]:
input_ids

tensor([[    0, 12891,   235,  ...,     1,     1,     1],
        [    0, 33212,  9956,  ...,     1,     1,     1],
        [    0,  1001,  9413,  ...,     1,     1,     1],
        ...,
        [    0,  2629,   101,  ...,     1,     1,     1],
        [    0, 10932,  1928,  ...,     1,     1,     1],
        [    0,   337, 16402,  ...,     1,     1,     1]])

In [None]:
input_ids[0]

tensor([    0, 12891,   235,   149,  1437,   323,    25,   114,    24,    16,
         3999,   190,    89, 12846,  1437,    16,     5, 10295,   434,   721,
        12846,     2,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])

#### fix random seed and split train/validation/test dataset to dataloader

In [None]:
# Seed Python's `random`, NumPy and PyTorch (CPU plus every CUDA device)
# with the same value.
seed_val = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
from torch.utils.data import DataLoader,SequentialSampler,RandomSampler,TensorDataset,random_split

# 98% of the rows go to training; the remainder is split between validation
# and test.
dataset = TensorDataset(input_ids,attention_mask,labels)
train_size = int(0.98*len(dataset))
holdout_size = len(dataset) - train_size
val_size = holdout_size // 2
# Give the test split whatever is left, so the sizes always add up to
# len(dataset). With test_size == val_size an odd remainder made the sizes
# fall one short and random_split raised.
test_size = holdout_size - val_size

train_dataset,holdout_dataset = random_split(dataset,[train_size,holdout_size])
val_dataset, test_dataset = random_split(holdout_dataset,[val_size,test_size])
print('Training Size - ',train_size)
print('Validation Size - ',val_size)
print('Test Size - ',test_size)

Training Size -  845318
Validation Size -  8626
Test Size -  8626


In [None]:
# One loader per split, all with the same batch size. Training batches are
# drawn in random order; validation and test are read sequentially.
loader_batch_size = 128
train_dl = DataLoader(
    train_dataset,
    batch_size=loader_batch_size,
    sampler=RandomSampler(train_dataset),
)
val_dl = DataLoader(
    val_dataset,
    batch_size=loader_batch_size,
    sampler=SequentialSampler(val_dataset),
)
test_dl = DataLoader(
    test_dataset,
    batch_size=loader_batch_size,
    sampler=SequentialSampler(test_dataset),
)
# Number of batches in each loader.
len(train_dl),len(val_dl),len(test_dl)

(6605, 68, 68)

#### load model and put to cuda and setup optimizer and scheduler

In [None]:
model = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')

Downloading (…)lve/main/config.json:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [None]:
# Use PyTorch's AdamW rather than the deprecated transformers.AdamW (whose
# deprecation warning was previously being silenced). weight_decay is set to
# 0.0 explicitly because torch's default is 0.01, whereas the transformers
# implementation defaulted to no weight decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-8, weight_decay=0.0)

In [None]:
epochs = 5
total_steps = len(train_dl)*epochs
scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

#### define evaluation functions

In [None]:
def accuracy(preds,labels):
    """Fraction of rows whose highest-scoring class equals the reference label.

    ``preds`` is indexed as (row, class) scores; ``labels`` is flattened
    before comparison, so any shape with one entry per row works.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    reference = labels.flatten()
    hits = predicted_classes == reference
    return np.sum(hits) / len(reference)

In [None]:
from sklearn.metrics import classification_report
def report(preds,labels):
  """Print a 4-digit classification report for Bearish/Bullish predictions.

  The predicted class for each row is the arg-max over axis 1 of ``preds``;
  ``labels`` is flattened and used as the ground truth.
  """
  class_names = ['Bearish','Bullish']
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  summary = classification_report(expected, predicted, target_names=class_names, digits=4)
  print(summary)

In [None]:
def evaluate(dataloader_test):
    """Run the global ``model`` over a loader without gradient tracking.

    Each batch is expected to hold (input ids, attention mask, labels) in
    that order. Returns a tuple of: the loss averaged over the number of
    batches, the concatenated logits as a NumPy array, and the concatenated
    labels as a NumPy array.
    """
    model.eval()
    running_loss = 0
    logit_chunks, label_chunks = [], []
    for batch in dataloader_test:
        on_device = tuple(tensor.to(device) for tensor in batch)
        batch_labels = on_device[2]
        with torch.no_grad():
            outputs = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=batch_labels,
            )
        # With labels supplied, position 0 is the loss and position 1 the logits.
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(batch_labels.cpu().numpy())
    mean_loss = running_loss / len(dataloader_test)
    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)
    return mean_loss, all_logits, all_labels

#### actual training loop

In [None]:

# Fine-tune for `epochs` passes over `train_dl`, reporting validation metrics
# after each pass.
# NOTE(review): in the recorded run the validation loss rises every epoch
# while accuracy changes by less than one point; consider keeping the best
# checkpoint instead of the last one.
torch.cuda.empty_cache()
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(train_dl, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the scalar batch loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the current batch loss. The f-prefix used to sit on the key, so
        # the bar displayed the template text verbatim; it also divided by
        # len(batch), which is the number of tensors in the tuple (3), not the
        # number of examples.
        progress_bar.set_postfix({'training_loss': f'{batch_loss:.3f}'})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(train_dl)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(val_dl)
    val_acc = accuracy(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'Validation Accuracy: {val_acc}')
    report(predictions,true_vals)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/6605 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.25910834427316654
Validation loss: 0.27259070557706494
Validation Accuracy: 0.882796197542314
              precision    recall  f1-score   support

     Bearish     0.8654    0.9059    0.8852      4302
     Bullish     0.9018    0.8599    0.8803      4324

    accuracy                         0.8828      8626
   macro avg     0.8836    0.8829    0.8827      8626
weighted avg     0.8836    0.8828    0.8827      8626



Epoch 2:   0%|          | 0/6605 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.21528994809218557
Validation loss: 0.286576088737039
Validation Accuracy: 0.8832599118942731
              precision    recall  f1-score   support

     Bearish     0.8822    0.8840    0.8831      4302
     Bullish     0.8844    0.8825    0.8834      4324

    accuracy                         0.8833      8626
   macro avg     0.8833    0.8833    0.8833      8626
weighted avg     0.8833    0.8833    0.8833      8626



Epoch 3:   0%|          | 0/6605 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.17680961509995913
Validation loss: 0.3015723425675841
Validation Accuracy: 0.8859262694180385
              precision    recall  f1-score   support

     Bearish     0.8752    0.8996    0.8872      4302
     Bullish     0.8972    0.8723    0.8846      4324

    accuracy                         0.8859      8626
   macro avg     0.8862    0.8860    0.8859      8626
weighted avg     0.8862    0.8859    0.8859      8626



Epoch 4:   0%|          | 0/6605 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.14623653816283427
Validation loss: 0.3159809068721883
Validation Accuracy: 0.8880129840018549
              precision    recall  f1-score   support

     Bearish     0.8879    0.8875    0.8877      4302
     Bullish     0.8881    0.8885    0.8883      4324

    accuracy                         0.8880      8626
   macro avg     0.8880    0.8880    0.8880      8626
weighted avg     0.8880    0.8880    0.8880      8626



Epoch 5:   0%|          | 0/6605 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.12363110546764328
Validation loss: 0.34341063197044763
Validation Accuracy: 0.8892881984697426
              precision    recall  f1-score   support

     Bearish     0.8859    0.8931    0.8895      4302
     Bullish     0.8927    0.8855    0.8891      4324

    accuracy                         0.8893      8626
   macro avg     0.8893    0.8893    0.8893      8626
weighted avg     0.8893    0.8893    0.8893      8626



#### holdout testset evaluation

In [None]:
test_loss,preds_test,true_test = evaluate(test_dl)
report(preds_test,true_test)

              precision    recall  f1-score   support

     Bearish     0.8957    0.8957    0.8957      4334
     Bullish     0.8947    0.8947    0.8947      4292

    accuracy                         0.8952      8626
   macro avg     0.8952    0.8952    0.8952      8626
weighted avg     0.8952    0.8952    0.8952      8626



#### save output model and tokenizer

In [None]:
# output_dir = 'sentimentEngine4/'
# The model and tokenizer files are written straight into the Drive root;
# the reload cell further down reads from this same directory.
output_dir = '/content/drive/MyDrive/'
# If the model has been wrapped by something that exposes the real network
# as `.module`, save the inner model rather than the wrapper.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('/content/drive/MyDrive/tokenizer_config.json',
 '/content/drive/MyDrive/special_tokens_map.json',
 '/content/drive/MyDrive/vocab.json',
 '/content/drive/MyDrive/merges.txt',
 '/content/drive/MyDrive/added_tokens.json')

#### result for all epochs

In [None]:
!pip install transformers
!pip install emoji

In [None]:
import transformers
import torch
import math
import pandas as pd
import numpy as np
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer, AutoModelForSequenceClassification, AutoTokenizer, AdamW
import random
import time


seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# tokenizer_loaded = RobertaTokenizer.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')
tokenizer_loaded = RobertaTokenizer.from_pretrained('/content/drive/MyDrive/')
model_loaded = RobertaForSequenceClassification.from_pretrained('/content/drive/MyDrive')
# model_loaded = RobertaForSequenceClassification.from_pretrained('zhayunduo/roberta-base-stocktwits-finetuned')

checkSenti() classifies a single sentence

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def Sentiment(sent,model=model_loaded,tokenizer=tokenizer_loaded):
  """Classify one already-preprocessed sentence.

  Args:
    sent: The sentence to classify.
    model: Sequence-classification model to use (defaults to the reloaded one).
    tokenizer: Tokenizer matching ``model``.

  Returns:
    ``(index, logits)``: ``index`` is the arg-max over the logits as a 0-dim
    tensor, and ``logits`` is the raw model output for the single sentence.
  """
  encoded_dict = tokenizer.encode_plus(
                      sent,
                      add_special_tokens = True,
                      truncation=True,
                      max_length = 64,
                      padding='max_length',
                      return_attention_mask = True,
                      return_tensors = 'pt')

  # The tokenizer already returns tensors (return_tensors='pt'), so move them
  # directly instead of re-wrapping them with the legacy torch.LongTensor
  # constructor.
  input_id = encoded_dict['input_ids'].to(device)
  attention_mask = encoded_dict['attention_mask'].to(device)
  model = model.to(device)

  # Force inference mode so dropout cannot make predictions non-deterministic
  # when a model still in training mode is passed in; restore the caller's
  # mode afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)
  finally:
    if was_training:
      model.train()

  # No labels are passed, so the first output holds the logits.
  logits = outputs[0]
  index = logits.argmax()
  return index,logits

!pip install emoji
import emoji
import re

def process_text(texts, lowercase=True):
  """Normalize one raw message before it is passed to the classifier.

  Applies the same steps as the training-time preprocessing in this
  notebook: lowercase; drop URLs; decode ``&#39;``; rewrite ``#tag``,
  ``$ticker`` and ``@user`` as ``hashtag_``/``cashtag_``/``mention_``
  prefixed words; replace emoji with their textual names.

  Args:
    texts: A single message as a ``str`` (despite the plural name).
    lowercase: Lowercase the message first. Defaults to True because the
      training data in this notebook was lowercased; pass False for a model
      trained on cased text.

  Returns:
    The cleaned message with leading/trailing whitespace removed.
  """
  if lowercase:
    texts = texts.lower()
  # Remove URLs. The dot after "www" is escaped so ordinary words that merely
  # begin with "www" are not deleted.
  texts = re.sub(r'https?://\S+', "", texts)
  texts = re.sub(r'www\.\S+', "", texts)
  # Decode the HTML apostrophe entity (before the hashtag rewrite, so "#39;"
  # is not mistaken for a hashtag).
  texts = texts.replace('&#39;', "'")
  # Keep hashtags and cashtags, but mark them with an explicit word prefix.
  texts = re.sub(r'(\#)(\S+)', r'hashtag_\2', texts)
  texts = re.sub(r'(\$)([A-Za-z]+)', r'cashtag_\2', texts)
  # Same treatment for @mentions.
  texts = re.sub(r'(\@)(\S+)', r'mention_\2', texts)
  # Spell out emoji: no leading delimiter, a single space as trailing delimiter.
  texts = emoji.demojize(texts, delimiters=("", " "))

  return texts.strip()

def checkSenti(sent,return_logits=False):
  """Label one raw sentence as 'Bearish' or 'Bullish'.

  Args:
    sent: Raw sentence; it is cleaned with ``process_text`` first.
    return_logits: When True, also return the two class probabilities.
      Despite the name, these are softmax probabilities, not raw logits.

  Returns:
    The label string, or ``(label, [p_bearish, p_bullish])`` when
    ``return_logits`` is True.
  """
  labels = ['Bearish','Bullish']
  sent_processed = process_text(sent)
  index,logits = Sentiment(sent_processed)
  label = labels[int(index)]
  if return_logits:
    raw0 = float(logits[0][0])
    raw1 = float(logits[0][1])
    # Subtract the larger logit before exponentiating so math.exp cannot
    # overflow; the resulting probabilities are mathematically unchanged.
    shift = max(raw0, raw1)
    exp0 = math.exp(raw0 - shift)
    exp1 = math.exp(raw1 - shift)
    total = exp0 + exp1
    return label,[exp0/total, exp1/total]
  return label


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


batch_checkSenti() uses the GPU (when available) to perform batch inference

In [None]:
from torch.utils.data import DataLoader,SequentialSampler,TensorDataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batch_checkSenti(texts,model=model_loaded,tokenizer=tokenizer_loaded,return_logits=False):
  """Classify many raw messages in batches of 128.

  This definition is restored from commented-out code because a later cell
  calls it.

  Args:
    texts: Iterable of raw message strings; each is cleaned with
      ``process_text`` before tokenization.
    model: Sequence-classification model to use (defaults to the reloaded one).
    tokenizer: Tokenizer matching ``model``.
    return_logits: When True, also return the raw logits.

  Returns:
    A NumPy array of predicted class indices (arg-max over axis 1), or
    ``(indices, logits)`` when ``return_logits`` is True.
  """
  start = time.time()
  input_ids = []
  attention_masks = []
  for text in texts:
      cleaned = process_text(text)
      encoded_data = tokenizer.encode_plus(
          cleaned,
          add_special_tokens=True,
          truncation=True,
          max_length=64,
          padding='max_length',
          return_attention_mask= True,
          return_tensors='pt')
      input_ids.append(encoded_data['input_ids'])
      attention_masks.append(encoded_data['attention_mask'])
  input_ids = torch.cat(input_ids,dim=0)
  attention_masks = torch.cat(attention_masks,dim=0)
  model.to(device)
  print('It takes {}s to tokenize'.format(time.time()-start))
  checkpointtime = time.time()

  testset = TensorDataset(input_ids,attention_masks)
  test_dl = DataLoader(testset,sampler = SequentialSampler(testset),batch_size = 128)

  predictions = []
  for batch in test_dl:
      batch = tuple(b.to(device) for b in batch)
      inputs = {
          'input_ids':batch[0],
          'attention_mask': batch[1],
      }

      with torch.no_grad():
          outputs = model(**inputs)
      # No labels are passed, so the first output holds the logits.
      logits = outputs[0]
      logits = logits.detach().cpu().numpy()
      predictions.append(logits)

  predictions = np.concatenate(predictions,axis=0)
  index = predictions.argmax(axis=1)
  print('It takes {}s to do predictions'.format(time.time()-checkpointtime))

  return (index,predictions) if return_logits else index

Use checkSenti()

In [None]:
samples = ['im feeling bullish about this stock',
           'im feeling bearish about this stock',
           'fuck me i just lost it all',
           'shoot to the moon',
           'short this stock']
for sample in samples:
  print(checkSenti(sample))

Bullish
Bearish
Bearish
Bullish
Bearish


Use batch_checkSenti()

In [None]:
# Classify the first `samples_num` rows of `df.body` and build the matching
# 0/1 ground-truth labels for comparison.
samples_num = 20000
testing = batch_checkSenti(df.body.iloc[:samples_num])
# NOTE(review): `df` is not loaded anywhere in this inference section, and the
# frame displayed at the top of the notebook lists the columns created_at,
# body, sentiment and raw_content — no `entities`. Confirm which dataset this
# cell expects before re-running.
true_labels = df.entities.iloc[:samples_num].replace('Bullish',1).replace('Bearish',0)
# len(['same' for i in range(samples_num) if testing[i]==true_labels[i]])/samples_num
testing

It takes 13.196947813034058s to tokenize
It takes 144.93988299369812s to do predictions


array([1, 1, 1, ..., 0, 0, 0])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(true_labels,testing))

              precision    recall  f1-score   support

           0       0.88      0.85      0.86     10000
           1       0.85      0.89      0.87     10000

    accuracy                           0.87     20000
   macro avg       0.87      0.87      0.87     20000
weighted avg       0.87      0.87      0.87     20000

