In [1]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import re
import string
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
nltk.download('vader_lexicon')
from sklearn.model_selection import train_test_split
#import spacy as sp
#nlps = sp.load('en')
import random
import datasets

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/timgu/nltk_data...


### First Dataset

In [2]:
data_index = 1
#r_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/sentiment.csv', delimiter=";")
# r_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/sentiment.csv')
# r_data = r_data.rename({'Stock Ticker': 'stock_ticker', 'Tweet Text': 'tweet_text', 'Tweet URL': 'tweet_URL'}, axis=1)

### Second Dataset

In [6]:
data_index = 2
bearish_data = pd.read_csv('./sentiment_data/bearish.txt',sep='|', header=None, names=['tweet_text'])
bullish_data = pd.read_csv('./sentiment_data/bullish.txt',sep='|', header=None, names=['tweet_text'], encoding='cp1252')

In [7]:
positive_labels = np.ones([len(bullish_data.index),1])
negative_labels = np.zeros([len(bearish_data.index),1])
bullish_data['Sentiment'] = positive_labels
bearish_data['Sentiment'] = negative_labels

In [8]:
bullish_data.Sentiment = bullish_data.Sentiment.astype('int')
bearish_data.Sentiment = bearish_data.Sentiment.astype('int')
r_data = pd.concat([bullish_data,bearish_data])
r_data = r_data.reset_index()

### Raw dataset

In [9]:
r_data

Unnamed: 0,index,tweet_text,Sentiment
0,0,adding,1
1,1,We like to be EARLY before explosions..,1
2,2,"Added, now just need patience",1
3,3,Funny how these Biotech washouts always come b...,1
4,4,looks like it was a nice dip buy yesterday,1
...,...,...,...
1302,595,$AAPL back at lows.,0
1303,596,Only people w/ balls of steel are shorting $AA...,0
1304,597,$FB goes red,0
1305,598,$AAPL down because its moved,0


In [10]:
text_data = r_data[['tweet_text','Sentiment']].copy()
text_data = text_data.dropna()
# Convert categorical labels to numerical
if data_index == 1:
  labels = {'Positive': 1, 'Negative': 0 }
  text_data.Sentiment = text_data.Sentiment.map(labels)

text_data.tweet_text = text_data.tweet_text.str.lower()

#Remove handlers
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub('@[^\s]+','',x))

# Remove URLS
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r"http\S+", "", x))

# Remove all the special characters
text_data.tweet_text = text_data.tweet_text.apply(lambda x:' '.join(re.findall(r'\w+', x)))


#remove all single characters
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

# Substituting multiple spaces with single space
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))


In [11]:
text_data

Unnamed: 0,tweet_text,Sentiment
0,adding,1
1,we like to be early before explosions,1
2,added now just need patience,1
3,funny how these biotech washouts always come b...,1
4,looks like it wasnice dip buy yesterday,1
...,...,...
1302,aapl back at lows,0
1303,only peopleballs of steel are shorting aapl here,0
1304,fb goes red,0
1305,aapl down because its moved,0


In [12]:
# Split training and validation dataset
training, validation = train_test_split(text_data,test_size=0.3)
# Form a datasetdic
train_dataset = datasets.Dataset.from_dict(training)
val_dataset = datasets.Dataset.from_dict(validation)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":val_dataset})

In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Model: siebert

In [14]:
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")


Downloading: 100%|██████████| 256/256 [00:00<00:00, 47.0kB/s]
Downloading: 100%|██████████| 687/687 [00:00<00:00, 346kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 2.31MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 1.73MB/s]
Downloading: 100%|██████████| 150/150 [00:00<00:00, 44.2kB/s]
Downloading: 100%|██████████| 1.42G/1.42G [01:19<00:00, 17.9MB/s]


In [15]:
def tokenize_function(examples):
    return tokenizer(examples["tweet_text"], padding=True,truncation=True)
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["tweet_text"])
tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")

100%|██████████| 1/1 [00:00<00:00,  2.97ba/s]
100%|██████████| 1/1 [00:00<00:00, 69.63ba/s]


In [16]:
tokenized_datasets.set_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)

In [17]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [18]:
from transformers import get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [19]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [20]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
torch.manual_seed(43)
np.random.seed(43)
train_loss = np.zeros(num_epochs)
model.train()
for epoch in range(num_epochs):
    total_train_loss = 0.0
    counter = 0
    for batch in train_dataloader:
        counter+=1
        batch = {k: v.to(device) for k, v in batch.items()}
        #print(batch)
        outputs = model(**batch)
        loss = outputs.loss
        #print(loss)
        loss.backward()
        total_train_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    train_loss[epoch] = float(total_train_loss) / counter
    print(("Epoch {}: Train loss: {} |").format(
                    epoch + 1,
                    train_loss[epoch]))

  1%|          | 7/575 [00:32<38:47,  4.10s/it]  

KeyboardInterrupt: 

In [19]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import evaluate

predictions_list = []
softmax = torch.nn.Softmax(dim=1)
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
total_valid_loss = 0
counter = 0
model.eval()
for batch in eval_dataloader:
    counter+= 1
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    loss = outputs.loss
    #print(loss)
    total_valid_loss += loss.item()
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    prediction_prob = softmax(logits)
    predictions_list.append(prediction_prob)
    metric1.add_batch(predictions=predictions, references=batch["labels"])
    metric2.add_batch(predictions=predictions, references=batch["labels"])
print('The accuracy for model siebert/sentiment-roberta-large-english is', metric1.compute())
print('The F1 score for model siebert/sentiment-roberta-large-english is', metric2.compute())
print(total_valid_loss/counter)

The accuracy for model siebert/sentiment-roberta-large-english is {'accuracy': 0.8498727735368957}
The F1 score for model siebert/sentiment-roberta-large-english is {'f1': 0.8637413394919168}
0.3627231700718403


In [21]:
predictions_list

[tensor([[0.5459, 0.4541],
         [0.8692, 0.1308],
         [0.0295, 0.9705],
         [0.0297, 0.9703],
         [0.8284, 0.1716],
         [0.9462, 0.0538],
         [0.9498, 0.0502],
         [0.9475, 0.0525]], device='cuda:0'), tensor([[0.0319, 0.9681],
         [0.0298, 0.9702],
         [0.9109, 0.0891],
         [0.0345, 0.9655],
         [0.0304, 0.9696],
         [0.0331, 0.9669],
         [0.0294, 0.9706],
         [0.0319, 0.9681]], device='cuda:0'), tensor([[0.0298, 0.9702],
         [0.9385, 0.0615],
         [0.9479, 0.0521],
         [0.9353, 0.0647],
         [0.0316, 0.9684],
         [0.9504, 0.0496],
         [0.0319, 0.9681],
         [0.9498, 0.0502]], device='cuda:0'), tensor([[0.9388, 0.0612],
         [0.9291, 0.0709],
         [0.0349, 0.9651],
         [0.5061, 0.4939],
         [0.9437, 0.0563],
         [0.0301, 0.9699],
         [0.0295, 0.9705],
         [0.9460, 0.0540]], device='cuda:0'), tensor([[0.4961, 0.5039],
         [0.9493, 0.0507],
         [

### Model: juliensimon

In [22]:
tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")

model = AutoModelForSequenceClassification.from_pretrained("juliensimon/reviews-sentiment-analysis")


In [23]:
def tokenize_function(examples):
    return tokenizer(examples["tweet_text"], padding=True,truncation=True)
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["tweet_text"])
tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 914
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 393
    })
})

In [25]:
tokenized_datasets.set_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)

In [26]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [28]:
from transformers import get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [29]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [30]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
torch.manual_seed(43)
np.random.seed(43)
train_loss = np.zeros(num_epochs)
model.train()
for epoch in range(num_epochs):
    total_train_loss = 0.0
    counter = 0
    for batch in train_dataloader:
        counter+=1
        batch = {k: v.to(device) for k, v in batch.items()}
        #print(batch)
        outputs = model(**batch)
        loss = outputs.loss
        #print(loss)
        loss.backward()
        total_train_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    train_loss[epoch] = float(total_train_loss) / counter
    print(("Epoch {}: Train loss: {} |").format(
                    epoch + 1,
                    train_loss[epoch]))

  0%|          | 0/575 [00:00<?, ?it/s]

Epoch 1: Train loss: 0.5039203902949457 |
Epoch 2: Train loss: 0.22314853328887535 |
Epoch 3: Train loss: 0.07128103381103795 |
Epoch 4: Train loss: 0.01910919233771932 |
Epoch 5: Train loss: 0.005838887704252873 |


In [31]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import evaluate

predictions_list = []
softmax = torch.nn.Softmax(dim=1)
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
total_valid_loss = 0
counter = 0
model.eval()
for batch in eval_dataloader:
    counter+= 1
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    loss = outputs.loss
    #print(loss)
    total_valid_loss += loss.item()
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    prediction_prob = softmax(logits)
    predictions_list.append(prediction_prob)
    metric1.add_batch(predictions=predictions, references=batch["labels"])
    metric2.add_batch(predictions=predictions, references=batch["labels"])
print('The accuracy for model juliensimon/reviews-sentiment-analysis is', metric1.compute())
print('The F1 score for model juliensimon/reviews-sentiment-analysis is', metric2.compute())
print(total_valid_loss/counter)

The accuracy for model juliensimon/reviews-sentiment-analysis is {'accuracy': 0.8422391857506362}
The F1 score for model juliensimon/reviews-sentiment-analysis is {'f1': 0.8544600938967136}
0.5738562382943928


In [33]:
predictions_list

[tensor([[9.8264e-03, 9.9017e-01],
         [9.6436e-02, 9.0356e-01],
         [1.1384e-03, 9.9886e-01],
         [1.0833e-03, 9.9892e-01],
         [9.9169e-01, 8.3096e-03],
         [9.7305e-01, 2.6954e-02],
         [9.9919e-01, 8.1400e-04],
         [9.9873e-01, 1.2685e-03]], device='cuda:0'),
 tensor([[9.9197e-03, 9.9008e-01],
         [9.1891e-04, 9.9908e-01],
         [5.4688e-02, 9.4531e-01],
         [2.2373e-02, 9.7763e-01],
         [1.3634e-03, 9.9864e-01],
         [8.2737e-01, 1.7263e-01],
         [9.6225e-04, 9.9904e-01],
         [2.8729e-03, 9.9713e-01]], device='cuda:0'),
 tensor([[9.8300e-04, 9.9902e-01],
         [9.9907e-01, 9.2785e-04],
         [9.9724e-01, 2.7553e-03],
         [9.3257e-01, 6.7426e-02],
         [9.9796e-04, 9.9900e-01],
         [9.9864e-01, 1.3629e-03],
         [1.6332e-03, 9.9837e-01],
         [9.9862e-01, 1.3843e-03]], device='cuda:0'),
 tensor([[9.9918e-01, 8.1879e-04],
         [9.9099e-01, 9.0103e-03],
         [1.0205e-03, 9.9898e-01]