In [1]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import re
import string
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
nltk.download('vader_lexicon')
from sklearn.model_selection import train_test_split
#import spacy as sp
#nlps = sp.load('en')
import random
import datasets

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### First Dataset

In [3]:
# data_index = 1
# r_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/sentiment.csv')
# r_data = r_data.rename({'Stock Ticker': 'stock_ticker', 'Tweet Text': 'tweet_text', 'Tweet URL': 'tweet_URL'}, axis=1)

### Second Dataset

In [3]:
data_index = 2
bearish_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/bearish.txt',sep='\n', header=None, names=['tweet_text'])
bullish_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/bullish.txt',sep='\n', header=None, names=['tweet_text'], encoding='cp1252')

In [4]:
positive_labels = np.ones([len(bullish_data.index),1])
negative_labels = np.zeros([len(bearish_data.index),1])
bullish_data['Sentiment'] = positive_labels
bearish_data['Sentiment'] = negative_labels

In [5]:
bullish_data.Sentiment = bullish_data.Sentiment.astype('int')
bearish_data.Sentiment = bearish_data.Sentiment.astype('int')
r_data = pd.concat([bullish_data,bearish_data])
r_data = r_data.reset_index()

### Raw dataset

In [7]:
r_data

Unnamed: 0,index,tweet_text,Sentiment
0,0,adding,1
1,1,We like to be EARLY before explosions..,1
2,2,"Added, now just need patience",1
3,3,Funny how these Biotech washouts always come b...,1
4,4,looks like it was a nice dip buy yesterday,1
...,...,...,...
1302,595,$AAPL back at lows.,0
1303,596,Only people w/ balls of steel are shorting $AA...,0
1304,597,$FB goes red,0
1305,598,$AAPL down because its moved,0


In [6]:
text_data = r_data[['tweet_text','Sentiment']].copy()
text_data = text_data.dropna()
# Convert categorical labels to numerical
if data_index == 1:
  labels = {'Positive': 1, 'Negative': 0 }
  text_data.Sentiment = text_data.Sentiment.map(labels)

text_data.tweet_text = text_data.tweet_text.str.lower()

#Remove handlers
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub('@[^\s]+','',x))

# Remove URLS
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r"http\S+", "", x))

# Remove all the special characters
text_data.tweet_text = text_data.tweet_text.apply(lambda x:' '.join(re.findall(r'\w+', x)))


#remove all single characters
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

# Substituting multiple spaces with single space
text_data.tweet_text = text_data.tweet_text.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))


In [7]:
text_data

Unnamed: 0,tweet_text,Sentiment
0,adding,1
1,we like to be early before explosions,1
2,added now just need patience,1
3,funny how these biotech washouts always come b...,1
4,looks like it wasnice dip buy yesterday,1
...,...,...
1302,aapl back at lows,0
1303,only peopleballs of steel are shorting aapl here,0
1304,fb goes red,0
1305,aapl down because its moved,0


In [8]:
# Split training and validation dataset
training, validation = train_test_split(text_data,test_size=0.3)
# Form a datasetdic
train_dataset = datasets.Dataset.from_dict(training)
val_dataset = datasets.Dataset.from_dict(validation)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"validation":val_dataset})

### Test data

In [49]:
test_r_data = pd.read_csv('/content/drive/MyDrive/ECE1786Project/Unlabeled_data_date/Apple_posts_clean.csv')
# r_data = r_data.rename({'Stock Ticker': 'stock_ticker', 'Tweet Text': 'tweet_text', 'Tweet URL': 'tweet_URL'}, axis=1)

### Raw dataset

In [50]:
test_r_data

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,Date,Time
0,0,"Apple is now worth more than Alphabet, Amazon ...",,ynfy82,3621,712,https://i.redd.it/dy5sa32voay91.png,1.667707e+09,2022-11-05 23:55:07
1,1,My god Apple is unstoppable,,yf4hi0,11776,648,https://i.redd.it/l7t4w0jr5fw91.jpg,1.666907e+09,2022-10-27 17:49:26
2,2,Paper Holding Apple since 2001. Slow iPhone 14...,,xqhze8,15640,990,https://i.redd.it/lraxkmcypmq91.jpg,1.664384e+09,2022-09-28 12:55:54
3,3,"Apple is now worth more than Google, Amazon an...",Does that make sense?\n\nIs Apple overpriced? ...,yl6nr4,680,246,https://www.reddit.com/r/wallstreetbets/commen...,1.667491e+09,2022-11-03 12:03:29
4,4,Warren Buffett/Berkshire Hathaway Portfolio - ...,,y5ic2v,2449,486,https://i.redd.it/f7sr38k7i6u91.png,1.665931e+09,2022-10-16 10:35:09
...,...,...,...,...,...,...,...,...,...
189,194,I SPY an AAPL in a TSLA - Scalpers Delight - 4/29,# [April 28 - Recap](https://www.reddit.com/r/...,ue9q9u,28,17,https://www.reddit.com/r/wallstreetbets/commen...,1.651193e+09,2022-04-28 20:46:42
190,195,"Alright so I'm up 183% on my AAPL calls, when ...",,rdvjd9,187,188,https://i.redd.it/3s72c2vhlv481.jpg,1.639212e+09,2021-12-11 03:40:47
191,196,AAPL to the rescue today,,r5zs4c,1845,80,https://i.redd.it/l1ofr4ijbt281.jpg,1.638313e+09,2021-11-30 17:53:28
192,197,After $MSFT er I’m thinking to go hard on $AAP...,,scow5t,36,96,https://i.redd.it/b6117pkcnwd81.jpg,1.643147e+09,2022-01-25 16:50:11


In [51]:
test_r_data['Post_Concat'] = test_r_data['Title'] + test_r_data['Post Text'].fillna('')
test_r_data

Unnamed: 0.1,Unnamed: 0,Title,Post Text,ID,Score,Total Comments,Post URL,Date,Time,Post_Concat
0,0,"Apple is now worth more than Alphabet, Amazon ...",,ynfy82,3621,712,https://i.redd.it/dy5sa32voay91.png,1.667707e+09,2022-11-05 23:55:07,"Apple is now worth more than Alphabet, Amazon ..."
1,1,My god Apple is unstoppable,,yf4hi0,11776,648,https://i.redd.it/l7t4w0jr5fw91.jpg,1.666907e+09,2022-10-27 17:49:26,My god Apple is unstoppable
2,2,Paper Holding Apple since 2001. Slow iPhone 14...,,xqhze8,15640,990,https://i.redd.it/lraxkmcypmq91.jpg,1.664384e+09,2022-09-28 12:55:54,Paper Holding Apple since 2001. Slow iPhone 14...
3,3,"Apple is now worth more than Google, Amazon an...",Does that make sense?\n\nIs Apple overpriced? ...,yl6nr4,680,246,https://www.reddit.com/r/wallstreetbets/commen...,1.667491e+09,2022-11-03 12:03:29,"Apple is now worth more than Google, Amazon an..."
4,4,Warren Buffett/Berkshire Hathaway Portfolio - ...,,y5ic2v,2449,486,https://i.redd.it/f7sr38k7i6u91.png,1.665931e+09,2022-10-16 10:35:09,Warren Buffett/Berkshire Hathaway Portfolio - ...
...,...,...,...,...,...,...,...,...,...,...
189,194,I SPY an AAPL in a TSLA - Scalpers Delight - 4/29,# [April 28 - Recap](https://www.reddit.com/r/...,ue9q9u,28,17,https://www.reddit.com/r/wallstreetbets/commen...,1.651193e+09,2022-04-28 20:46:42,I SPY an AAPL in a TSLA - Scalpers Delight - 4...
190,195,"Alright so I'm up 183% on my AAPL calls, when ...",,rdvjd9,187,188,https://i.redd.it/3s72c2vhlv481.jpg,1.639212e+09,2021-12-11 03:40:47,"Alright so I'm up 183% on my AAPL calls, when ..."
191,196,AAPL to the rescue today,,r5zs4c,1845,80,https://i.redd.it/l1ofr4ijbt281.jpg,1.638313e+09,2021-11-30 17:53:28,AAPL to the rescue today
192,197,After $MSFT er I’m thinking to go hard on $AAP...,,scow5t,36,96,https://i.redd.it/b6117pkcnwd81.jpg,1.643147e+09,2022-01-25 16:50:11,After $MSFT er I’m thinking to go hard on $AAP...


In [57]:
df = test_r_data[['Post_Concat', 'Date', 'Time', 'Score']].copy()
df = df.sort_values(by='Time',ascending=False)
start_date = '2022-11-13 00:00:00'
end_date = '2022-11-17 23:59:59'
mask = (df['Time'] > start_date) & (df['Time'] <= end_date)
df_date = df.loc[mask]
df_date

Unnamed: 0,Post_Concat,Date,Time,Score
9,Apple plans to source chips from Arizona plant...,1668567000.0,2022-11-15 21:46:21,139
56,"Apple ""Ask App Not To Track"" feature effect on...",1668464000.0,2022-11-14 17:21:09,15
20,"11-14-22 SPY/ ES Futures, Apple and Spy Daily ...",1668461000.0,2022-11-14 16:25:57,37
22,Apple XR plans get a little more daylightApple...,1668366000.0,2022-11-13 14:03:35,19


In [58]:
test_data = df_date[['Post_Concat']].copy()
test_data = test_data.dropna()
# Convert categorical labels to numerical

test_data.Post_Concat = test_data.Post_Concat.str.lower()

#Remove handlers
test_data.Post_Concat = test_data.Post_Concat.apply(lambda x:re.sub('@[^\s]+','',x))

# Remove URLS
test_data.Post_Concat = test_data.Post_Concat.apply(lambda x:re.sub(r"http\S+", "", x))

# Remove all the special characters
test_data.Post_Concat = test_data.Post_Concat.apply(lambda x:' '.join(re.findall(r'\w+', x)))


#remove all single characters
test_data.Post_Concat = test_data.Post_Concat.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

# Substituting multiple spaces with single space
test_data.Post_Concat = test_data.Post_Concat.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))


In [59]:
# Form a datasetdic
test_dataset = datasets.Dataset.from_dict(test_data)
test_dataset_dict = datasets.DatasetDict({"test":test_dataset})

In [60]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Model: siebert

In [61]:
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

model = AutoModelForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english")


In [62]:
def tokenize_function(examples):
    return tokenizer(examples["tweet_text"], padding=True,truncation=True)
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["tweet_text"])
tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")
def tokenize_function_test(examples):
    return tokenizer(examples["Post_Concat"], padding=True,truncation=True)
tokenized_datasets_test = test_dataset_dict.map(tokenize_function_test,batched=True)
tokenized_datasets_test = tokenized_datasets_test.remove_columns(["Post_Concat"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [63]:
tokenized_datasets.set_format("torch")
tokenized_datasets_test.set_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)
test_dataloader = DataLoader(tokenized_datasets_test['test'], batch_size=12)

In [42]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [43]:
from transformers import get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [64]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
         

In [21]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
torch.manual_seed(43)
np.random.seed(43)
train_loss = np.zeros(num_epochs)
model.train()
for epoch in range(num_epochs):
    total_train_loss = 0.0
    counter = 0
    for batch in train_dataloader:
        counter+=1
        batch = {k: v.to(device) for k, v in batch.items()}
        #print(batch)
        outputs = model(**batch)
        loss = outputs.loss
        #print(loss)
        loss.backward()
        total_train_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    train_loss[epoch] = float(total_train_loss) / counter
    print(("Epoch {}: Train loss: {} |").format(
                    epoch + 1,
                    train_loss[epoch]))

  0%|          | 0/575 [00:00<?, ?it/s]

Epoch 1: Train loss: 0.6391370607459027 |
Epoch 2: Train loss: 0.7124543182227923 |
Epoch 3: Train loss: 0.6962305281473242 |
Epoch 4: Train loss: 0.6934694831785948 |
Epoch 5: Train loss: 0.6956166936003644 |


In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import evaluate

predictions_list = []
softmax = torch.nn.Softmax(dim=1)
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
total_valid_loss = 0
counter = 0
model.eval()
for batch in eval_dataloader:
    counter+= 1
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    loss = outputs.loss
    #print(loss)
    total_valid_loss += loss.item()
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    prediction_prob = softmax(logits)
    predictions_list.append(prediction_prob)
    metric1.add_batch(predictions=predictions, references=batch["labels"])
    metric2.add_batch(predictions=predictions, references=batch["labels"])
print('The accuracy for model siebert/sentiment-roberta-large-english is', metric1.compute())
print('The F1 score for model siebert/sentiment-roberta-large-english is', metric2.compute())
print(total_valid_loss/counter)

The accuracy for model siebert/sentiment-roberta-large-english is {'accuracy': 0.8498727735368957}
The F1 score for model siebert/sentiment-roberta-large-english is {'f1': 0.8637413394919168}
0.3627231700718403


In [None]:
predictions_list

[tensor([[0.5459, 0.4541],
         [0.8692, 0.1308],
         [0.0295, 0.9705],
         [0.0297, 0.9703],
         [0.8284, 0.1716],
         [0.9462, 0.0538],
         [0.9498, 0.0502],
         [0.9475, 0.0525]], device='cuda:0'), tensor([[0.0319, 0.9681],
         [0.0298, 0.9702],
         [0.9109, 0.0891],
         [0.0345, 0.9655],
         [0.0304, 0.9696],
         [0.0331, 0.9669],
         [0.0294, 0.9706],
         [0.0319, 0.9681]], device='cuda:0'), tensor([[0.0298, 0.9702],
         [0.9385, 0.0615],
         [0.9479, 0.0521],
         [0.9353, 0.0647],
         [0.0316, 0.9684],
         [0.9504, 0.0496],
         [0.0319, 0.9681],
         [0.9498, 0.0502]], device='cuda:0'), tensor([[0.9388, 0.0612],
         [0.9291, 0.0709],
         [0.0349, 0.9651],
         [0.5061, 0.4939],
         [0.9437, 0.0563],
         [0.0301, 0.9699],
         [0.0295, 0.9705],
         [0.9460, 0.0540]], device='cuda:0'), tensor([[0.4961, 0.5039],
         [0.9493, 0.0507],
         [

### Test model with test data

In [65]:
predictions_list = []
softmax = torch.nn.Softmax(dim=1)
for batch in test_dataloader:
    #print(batch.items())
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # prediction_prob = softmax(logits)
    predictions_list.append(predictions)
print(predictions_list)

[tensor([1, 0, 0, 0], device='cuda:0')]


In [66]:
for i in predictions_list:
  results = i.cpu().numpy()
df_result = df_date[['Post_Concat', 'Time', 'Score']].copy()
df_result['Predictions'] = results
df_result['Predictions'] = df_result['Predictions'].replace([0], -1)
df_result

Unnamed: 0,Post_Concat,Time,Score,Predictions
9,Apple plans to source chips from Arizona plant...,2022-11-15 21:46:21,139,1
56,"Apple ""Ask App Not To Track"" feature effect on...",2022-11-14 17:21:09,15,-1
20,"11-14-22 SPY/ ES Futures, Apple and Spy Daily ...",2022-11-14 16:25:57,37,-1
22,Apple XR plans get a little more daylightApple...,2022-11-13 14:03:35,19,-1


In [68]:
df_result['Weighted_result'] = df_result['Predictions'] * df_result['Score']
if df_result['Weighted_result'].sum() > 0:
  print("The sentiment is positive, the tomorrow's stock price will rise.")
else:
  print("The sentiment is negative, the tomorrow's stock price will fall.")

The sentiment is positive, the tomorrow's stock price will rise.


### Model: juliensimon

In [69]:
tokenizer = AutoTokenizer.from_pretrained("juliensimon/reviews-sentiment-analysis")

model = AutoModelForSequenceClassification.from_pretrained("juliensimon/reviews-sentiment-analysis")


Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/586 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [70]:
def tokenize_function(examples):
    return tokenizer(examples["tweet_text"], padding=True,truncation=True)
tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["tweet_text"])
tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")
def tokenize_function_test(examples):
    return tokenizer(examples["Post_Concat"], padding=True,truncation=True)
tokenized_datasets_test = test_dataset_dict.map(tokenize_function_test,batched=True)
tokenized_datasets_test = tokenized_datasets_test.remove_columns(["Post_Concat"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [71]:
tokenized_datasets.set_format("torch")
tokenized_datasets_test.set_format("torch")
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['validation'], batch_size=8)
test_dataloader = DataLoader(tokenized_datasets_test['test'], batch_size=12)

In [72]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [73]:
from transformers import get_scheduler

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [74]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [75]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
torch.manual_seed(43)
np.random.seed(43)
train_loss = np.zeros(num_epochs)
model.train()
for epoch in range(num_epochs):
    total_train_loss = 0.0
    counter = 0
    for batch in train_dataloader:
        counter+=1
        batch = {k: v.to(device) for k, v in batch.items()}
        #print(batch)
        outputs = model(**batch)
        loss = outputs.loss
        #print(loss)
        loss.backward()
        total_train_loss += loss.item()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    train_loss[epoch] = float(total_train_loss) / counter
    print(("Epoch {}: Train loss: {} |").format(
                    epoch + 1,
                    train_loss[epoch]))

  0%|          | 0/575 [00:00<?, ?it/s]

Epoch 1: Train loss: 0.46289792967879256 |
Epoch 2: Train loss: 0.14754616419693617 |
Epoch 3: Train loss: 0.03772076907587926 |
Epoch 4: Train loss: 0.012563504921474858 |
Epoch 5: Train loss: 0.0025663319200722743 |


In [None]:
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import evaluate

predictions_list = []
softmax = torch.nn.Softmax(dim=1)
metric1 = evaluate.load("accuracy")
metric2 = evaluate.load("f1")
total_valid_loss = 0
counter = 0
model.eval()
for batch in eval_dataloader:
    counter+= 1
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    loss = outputs.loss
    #print(loss)
    total_valid_loss += loss.item()
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    prediction_prob = softmax(logits)
    predictions_list.append(prediction_prob)
    metric1.add_batch(predictions=predictions, references=batch["labels"])
    metric2.add_batch(predictions=predictions, references=batch["labels"])
print('The accuracy for model juliensimon/reviews-sentiment-analysis is', metric1.compute())
print('The F1 score for model juliensimon/reviews-sentiment-analysis is', metric2.compute())
print(total_valid_loss/counter)

The accuracy for model juliensimon/reviews-sentiment-analysis is {'accuracy': 0.8422391857506362}
The F1 score for model juliensimon/reviews-sentiment-analysis is {'f1': 0.8544600938967136}
0.5738562382943928


In [None]:
predictions_list

[tensor([[9.8264e-03, 9.9017e-01],
         [9.6436e-02, 9.0356e-01],
         [1.1384e-03, 9.9886e-01],
         [1.0833e-03, 9.9892e-01],
         [9.9169e-01, 8.3096e-03],
         [9.7305e-01, 2.6954e-02],
         [9.9919e-01, 8.1400e-04],
         [9.9873e-01, 1.2685e-03]], device='cuda:0'),
 tensor([[9.9197e-03, 9.9008e-01],
         [9.1891e-04, 9.9908e-01],
         [5.4688e-02, 9.4531e-01],
         [2.2373e-02, 9.7763e-01],
         [1.3634e-03, 9.9864e-01],
         [8.2737e-01, 1.7263e-01],
         [9.6225e-04, 9.9904e-01],
         [2.8729e-03, 9.9713e-01]], device='cuda:0'),
 tensor([[9.8300e-04, 9.9902e-01],
         [9.9907e-01, 9.2785e-04],
         [9.9724e-01, 2.7553e-03],
         [9.3257e-01, 6.7426e-02],
         [9.9796e-04, 9.9900e-01],
         [9.9864e-01, 1.3629e-03],
         [1.6332e-03, 9.9837e-01],
         [9.9862e-01, 1.3843e-03]], device='cuda:0'),
 tensor([[9.9918e-01, 8.1879e-04],
         [9.9099e-01, 9.0103e-03],
         [1.0205e-03, 9.9898e-01]

### Test model with test data

In [76]:
predictions_list = []
softmax = torch.nn.Softmax(dim=1)
for batch in test_dataloader:
    #print(batch.items())
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    # prediction_prob = softmax(logits)
    predictions_list.append(predictions)
print(predictions_list)

[tensor([0, 0, 0, 0], device='cuda:0')]


In [77]:
for i in predictions_list:
  results = i.cpu().numpy()
df_result = df_date[['Post_Concat', 'Time', 'Score']].copy()
df_result['Predictions'] = results
df_result['Predictions'] = df_result['Predictions'].replace([0], -1)
df_result

Unnamed: 0,Post_Concat,Time,Score,Predictions
9,Apple plans to source chips from Arizona plant...,2022-11-15 21:46:21,139,-1
56,"Apple ""Ask App Not To Track"" feature effect on...",2022-11-14 17:21:09,15,-1
20,"11-14-22 SPY/ ES Futures, Apple and Spy Daily ...",2022-11-14 16:25:57,37,-1
22,Apple XR plans get a little more daylightApple...,2022-11-13 14:03:35,19,-1


In [79]:
df_result['Weighted_result'] = df_result['Predictions'] * df_result['Score']
if df_result['Weighted_result'].sum() > 0:
  print("The sentiment is positive, the tomorrow's stock price will rise.")
else:
  print("The sentiment is negative, the tomorrow's stock price will fall.")

The sentiment is negative, the tomorrow's stock price will fall.
