In [34]:
# Install deps
# !python -m spacy download en_core_web_md
# !pip install torch torchvision
# !pip install transformers
# !pip3 install nltk emoji==0.6.0


In [1]:
#dataset
import pandas as pd
from datasets import Dataset 
# utils
from sklearn.utils import shuffle

#visualize
import matplotlib.pyplot as plt
%matplotlib inline

# modeling
import spacy
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification


Reader functions:

In [2]:
# We define a function to read our already pre-processed data
def reader_df(topic):
  path_train = "cleaned_df/stance_" + topic + "_train_cleaned.csv"
  path_test = "cleaned_df/stance_" + topic + "_test_cleaned.csv"
  path_val = "cleaned_df/stance_" + topic + "_validation_cleaned.csv"
  df_train = pd.read_csv(path_train)
  df_val = pd.read_csv(path_val)
  df_test = pd.read_csv(path_test)

  X_train = df_train.loc[:, 'text'].values
  y_train = df_train.loc[:, 'label'].values

  X_test = df_test.loc[:, 'text'].values
  y_test = df_test.loc[:, 'label'].values

  X_val = df_val.loc[:, 'text'].values
  y_val = df_val.loc[:, 'label'].values

  return X_train, X_test, y_train, y_test, X_val, y_val

In [13]:
# During finetuning, we might want to use the specific tokenizer functions of BERT models, so we define a function to read the raw data.
def raw_reader_and_tokenize(topic):
    path_X_train = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/train_text.txt"
    path_X_test = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/test_text.txt"
    path_X_val = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/val_text.txt"
    path_y_train = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/train_labels.txt"
    path_y_test = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/test_labels.txt"
    path_y_val = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/val_labels.txt"

    X_train = pd.read_table(path_X_train, header=None)
    X_test = pd.read_table(path_X_test, header=None)
    X_val = pd.read_table(path_X_val, header=None)
    y_train = pd.read_table(path_y_train, header=None)
    y_test = pd.read_table(path_y_test, header=None)
    y_val = pd.read_table(path_y_val, header=None)

    X_train = X_train.rename(columns={0: "text"})
    X_test = X_test.rename(columns={0: "text"})
    y_train = y_train.rename(columns={0: "label"})
    y_test = y_test.rename(columns={0: "label"})
    X_val = X_val.rename(columns={0: "text"})
    y_val = y_val.rename(columns={0: "label"})

    # Convert the DataFrame to a Hugging Face Dataset
    train_dataset = Dataset.from_dict({"text": X_train["text"].values, "label": y_train["label"].values})
    val_dataset = Dataset.from_dict({"text": X_val["text"].values, "label": y_val["label"].values})
    test_dataset = Dataset.from_dict({"text": X_test["text"].values, "label": y_test["label"].values})

    # Tokenize the training and validation datasets
    train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
    val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
    test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))

    # Set the format of the datasets to PyTorch tensors
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


    return train_dataset, val_dataset, test_dataset
    
    

+ **DistilBERT** model

As a first test, we can calculate how accurate the predictions are using a DistilBert model. The code below uses `DistilBertTokenizerFast` on our already pre-processed datasets. 
We apply the `DistilBertForSequenceClassification` model, which is the most appropiate for our current task of multi-class classification.

We will implement this first model in the `stances-feminist` dataset, just to test it.

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.metrics import f1_score, precision_score, accuracy_score
import torch

X_train, X_test, y_train, y_test, X_val, y_val = reader_df("feminist")



# Load the tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Prepare the data
train_texts = X_train.tolist() 
train_labels = y_train.tolist()
test_texts = X_test.tolist()
test_labels = y_test.tolist()

# Tokenize the data
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)
test_encodings = tokenizer(test_texts, return_tensors='pt', padding=True, truncation=True)

# Convert to torch Dataset
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TweetDataset(train_encodings, train_labels)
test_dataset = TweetDataset(test_encodings, test_labels)

# Train the model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # useful to improve computing time, depending on our pc characteristics
model.to(device)
model.train()

In the code below we use the PyTorch library to create a DataLoader object for the train_dataset with a batch size of 16. This means that the data in train_dataset will be divided into batches of 16 samples each and fed into the model for training.
We also create an Adam optimizer object with a learning rate of 5e-5. The optimizer is used to update the model’s parameters during training to minimize the loss function.

In [17]:

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

# Evaluate the model on the test set
model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        predictions.extend(logits.argmax(dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate metrics
f1 = f1_score(true_labels, predictions, average='macro')
precision = precision_score(true_labels, predictions, average='macro')
accuracy = accuracy_score(true_labels, predictions)

print(f'F1 score: {f1}')
print(f'Precision: {precision}')
print(f'Accuracy: {accuracy}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

F1 score: 0.5426585516619846
Precision: 0.6004088504088504
Accuracy: 0.5719298245614035


We have an F1 of 54 with this implementation. How can we improve this?

#### Fine-tuned BERTweet implementation

BERT is a pre-trained language model that has been shown to achieve state-of-the-art performance on a wide range of natural language processing tasks. However, BERT was pre-trained on a large corpus of general text, which may not be representative of the language used in tweets.

Tweets are known to have unique characteristics that can make them more challenging to classify compared to other types of text. For example, tweets are often shorter, contain a lot of noise (such as typos and slang), and can have complex grammatical structures that are not found in more formal writing.

Additionally, the use of hashtags, emojis, and other special characters in tweets can make it difficult for BERT to understand the context and sentiment of the tweet. Pre-processing and cleaning the tweets can help to mitigate some of these issues, but there is still a limit to the effectiveness of this approach.

To address these challenges, researchers have developed specialized versions of BERT for use with social media data. For example, BERTweet is a variant of BERT that has been trained specifically on tweets and has been shown to outperform generic BERT on tweet classification tasks.

+ **BERTweet**

First we define a `compute_metrics` function that calls precision, recall and F1, using a macro average.

In [4]:
precision = load_metric("precision")
recall = load_metric("recall")
f1 = load_metric("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "precision": precision.compute(predictions=predictions, references=labels, average="macro"),
        "recall": recall.compute(predictions=predictions, references=labels, average="macro"),
        "f1": f1.compute(predictions=predictions, references=labels, average="macro"),
    }

  precision = load_metric("precision")


We define the tokenizer from Bertweet, and implement the five models after that.

In [5]:
# Load the pre-trained BERTweet tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)

# Define a function to tokenize the dataset
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Stance: climate

In this case, instead of our pre-processed datasets, we will import the raw data.
(As a note, we tried an implementation using our pre-processed datasets, but the scores were slightly worse in all cases. For this alternative implementation, see code that is commented out below.)

In [14]:
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("climate")



Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/169 [00:00<?, ? examples/s]

In [49]:
### ALTERNATIVE IMPLEMENTATION WITH OUR PRE-PROCESSED DATA, INSTEAD OF RAW DATA

# Convert the DataFrame to a Hugging Face Dataset
# X_train, X_test, y_train, y_test, X_val, y_val = reader_df("climate")

# train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
# val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})
# test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

# # Tokenize the training and validation datasets
# train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
# val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

# # Set the format of the datasets to PyTorch tensors
# train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [8]:
# Load the pre-trained BERTweet model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3)


# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    logging_dir='./logs',
    load_best_model_at_end=True,
    seed=1
)


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model on the training dataset
trainer.train()

loading configuration file config.json from cache at C:\Users\danid/.cache\huggingface\hub\models--vinai--bertweet-base\snapshots\118ab1d567653bec16bbb081eafb6f8942f72108\config.json
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertweetTokenizer",
  "transforme

  0%|          | 0/60 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.5839727195225916}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6115779645191409}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5971479500891265}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-12
Configuration saved in ./results\checkpoint-12\config.json


{'eval_loss': 0.5253016352653503, 'eval_precision': {'precision': 0.5839727195225916}, 'eval_recall': {'recall': 0.6115779645191409}, 'eval_f1': {'f1': 0.5971479500891265}, 'eval_runtime': 1.7267, 'eval_samples_per_second': 23.166, 'eval_steps_per_second': 1.158, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-12\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.5444096133751306}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5294117647058824}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.518095238095238}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-24
Configuration saved in ./results\checkpoint-24\config.json


{'eval_loss': 0.6195958852767944, 'eval_precision': {'precision': 0.5444096133751306}, 'eval_recall': {'recall': 0.5294117647058824}, 'eval_f1': {'f1': 0.518095238095238}, 'eval_runtime': 1.9658, 'eval_samples_per_second': 20.348, 'eval_steps_per_second': 1.017, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-24\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.6041666666666666}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6274509803921569}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6141414141414141}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-36
Configuration saved in ./results\checkpoint-36\config.json


{'eval_loss': 0.4208868443965912, 'eval_precision': {'precision': 0.6041666666666666}, 'eval_recall': {'recall': 0.6274509803921569}, 'eval_f1': {'f1': 0.6141414141414141}, 'eval_runtime': 1.6987, 'eval_samples_per_second': 23.548, 'eval_steps_per_second': 1.177, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-36\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.6041666666666666}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6274509803921569}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6141414141414141}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-48
Configuration saved in ./results\checkpoint-48\config.json


{'eval_loss': 0.3590737581253052, 'eval_precision': {'precision': 0.6041666666666666}, 'eval_recall': {'recall': 0.6274509803921569}, 'eval_f1': {'f1': 0.6141414141414141}, 'eval_runtime': 1.6977, 'eval_samples_per_second': 23.561, 'eval_steps_per_second': 1.178, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-48\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.6041666666666666}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6274509803921569}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6141414141414141}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-60
Configuration saved in ./results\checkpoint-60\config.json


{'eval_loss': 0.37791839241981506, 'eval_precision': {'precision': 0.6041666666666666}, 'eval_recall': {'recall': 0.6274509803921569}, 'eval_f1': {'f1': 0.6141414141414141}, 'eval_runtime': 1.7367, 'eval_samples_per_second': 23.032, 'eval_steps_per_second': 1.152, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-60\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-48 (score: 0.3590737581253052).


{'train_runtime': 333.8925, 'train_samples_per_second': 5.316, 'train_steps_per_second': 0.18, 'train_loss': 0.48289686838785806, 'epoch': 5.0}


TrainOutput(global_step=60, training_loss=0.48289686838785806, metrics={'train_runtime': 333.8925, 'train_samples_per_second': 5.316, 'train_steps_per_second': 0.18, 'train_loss': 0.48289686838785806, 'epoch': 5.0})

In [15]:
# Evaluate the trained model in the test set

# trainer.compute_metrics=compute_metrics
# (if we want to use different metrics (e.g. weighted average), just update the compute_metrics function
#  and run the line above)
results = trainer.evaluate(test_dataset)




The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 169
  Batch size = 32


  0%|          | 0/6 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.5184569952011812}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5770034843205575}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5450520028833282}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


In [16]:
print(results["eval_precision"])
print(results["eval_recall"])
print(results["eval_f1"])

{'precision': 0.5184569952011812}
{'recall': 0.5770034843205575}
{'f1': 0.5450520028833282}


To assess possible biases, we can check how well we are predicting each class:

In [17]:
from sklearn.metrics import classification_report

# Generate predictions for the test set
test_predictions = trainer.predict(test_dataset).predictions
test_predictions = np.argmax(test_predictions, axis=-1)

# Compute the classification report
report = classification_report(test_dataset['label'], test_predictions, output_dict=True)

# Print the classification report
for label in report:
    if label != 'accuracy':
        print(f"Class: {label}")
        print(f"Precision: {report[label]['precision']}")
        print(f"Recall: {report[label]['recall']}")
        print(f"F1-score: {report[label]['f1-score']}\n")

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 169
  Batch size = 32


  0%|          | 0/6 [00:00<?, ?it/s]

Class: 0
Precision: 0.6744186046511628
Recall: 0.8285714285714286
F1-score: 0.7435897435897435

Class: 1
Precision: 0.0
Recall: 0.0
F1-score: 0.0

Class: 2
Precision: 0.8809523809523809
Recall: 0.9024390243902439
F1-score: 0.8915662650602411

Class: macro avg
Precision: 0.5184569952011812
Recall: 0.5770034843205575
F1-score: 0.5450520028833282

Class: weighted avg
Precision: 0.7808390178694293
Recall: 0.8284023668639053
F1-score: 0.8028892995742644



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


To compare with the original benchmarking methodology, we take the average of F1 scores for against and favor clases:

In [20]:
# Final evaluation
f1_against = report['1']['f1-score']
f1_favor = report['2']['f1-score']
tweeteval_result = (f1_against+f1_favor) / 2
print("Final F1 - Stance CLIMATE: " + str(tweeteval_result))

Final F1 - Stance CLIMATE: 0.44578313253012053


###  Stance: atheism

In [23]:
# Import dataset
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("atheism")


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model on the training dataset
trainer.train()

# Generate predictions for the test set
test_predictions = trainer.predict(test_dataset).predictions
test_predictions = np.argmax(test_predictions, axis=-1)

# Compute the classification report
report = classification_report(test_dataset['label'], test_predictions, output_dict=True)

# Print the classification report
for label in report:
    if label != 'accuracy':
        print(f"Class: {label}")
        print(f"Precision: {report[label]['precision']}")
        print(f"Recall: {report[label]['recall']}")
        print(f"F1-score: {report[label]['f1-score']}\n")

Map:   0%|          | 0/461 [00:00<?, ? examples/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 461
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 75
  Number of trainable parameters = 134902275


  0%|          | 0/75 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.4794326241134752}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.4336917562724014}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.41327300150829566}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-15
Configuration saved in ./results\checkpoint-15\config.json


{'eval_loss': 0.957466185092926, 'eval_precision': {'precision': 0.4794326241134752}, 'eval_recall': {'recall': 0.4336917562724014}, 'eval_f1': {'f1': 0.41327300150829566}, 'eval_runtime': 2.4984, 'eval_samples_per_second': 20.814, 'eval_steps_per_second': 0.801, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-15\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.646011396011396}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5433094384707288}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5493386243386243}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-30
Configuration saved in ./results\checkpoint-30\config.json


{'eval_loss': 0.744162917137146, 'eval_precision': {'precision': 0.646011396011396}, 'eval_recall': {'recall': 0.5433094384707288}, 'eval_f1': {'f1': 0.5493386243386243}, 'eval_runtime': 2.4655, 'eval_samples_per_second': 21.091, 'eval_steps_per_second': 0.811, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-30\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.6708333333333334}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6684587813620072}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6674767727399306}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-45
Configuration saved in ./results\checkpoint-45\config.json


{'eval_loss': 0.6782239079475403, 'eval_precision': {'precision': 0.6708333333333334}, 'eval_recall': {'recall': 0.6684587813620072}, 'eval_f1': {'f1': 0.6674767727399306}, 'eval_runtime': 2.4587, 'eval_samples_per_second': 21.149, 'eval_steps_per_second': 0.813, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-45\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7523809523809524}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6899641577060932}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7038809144072302}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-60
Configuration saved in ./results\checkpoint-60\config.json


{'eval_loss': 0.7633729577064514, 'eval_precision': {'precision': 0.7523809523809524}, 'eval_recall': {'recall': 0.6899641577060932}, 'eval_f1': {'f1': 0.7038809144072302}, 'eval_runtime': 2.4514, 'eval_samples_per_second': 21.213, 'eval_steps_per_second': 0.816, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-60\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 52
  Batch size = 32


  0%|          | 0/2 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7293650793650794}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7425328554360813}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7160370634354954}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-75
Configuration saved in ./results\checkpoint-75\config.json


{'eval_loss': 0.758226752281189, 'eval_precision': {'precision': 0.7293650793650794}, 'eval_recall': {'recall': 0.7425328554360813}, 'eval_f1': {'f1': 0.7160370634354954}, 'eval_runtime': 2.3626, 'eval_samples_per_second': 22.009, 'eval_steps_per_second': 0.847, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-75\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-45 (score: 0.6782239079475403).
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 220
  Batch size = 32


{'train_runtime': 411.1427, 'train_samples_per_second': 5.606, 'train_steps_per_second': 0.182, 'train_loss': 0.5718584187825521, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Class: 0
Precision: 0.5348837209302325
Recall: 0.8214285714285714
F1-score: 0.647887323943662

Class: 1
Precision: 0.9
Recall: 0.7875
F1-score: 0.84

Class: 2
Precision: 0.5405405405405406
Recall: 0.625
F1-score: 0.5797101449275363

Class: macro avg
Precision: 0.658474753823591
Recall: 0.744642857142857
F1-score: 0.6891991562903993

Class: weighted avg
Precision: 0.8012456431061082
Recall: 0.7681818181818182
F1-score: 0.7776889532186532



In [24]:
# Final evaluation
f1_against = report['1']['f1-score']
f1_favor = report['2']['f1-score']
tweeteval_result_atheism = (f1_against+f1_favor) / 2
print("Final F1 - Stance ATHEISM: " + str(tweeteval_result_atheism))

Final F1 - Stance ATHEISM: 0.7098550724637681


### Stance: feminist

In [25]:
# Import dataset
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("feminist")


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model on the training dataset
trainer.train()

# Generate predictions for the test set
test_predictions = trainer.predict(test_dataset).predictions
test_predictions = np.argmax(test_predictions, axis=-1)

# Compute the classification report
report = classification_report(test_dataset['label'], test_predictions, output_dict=True)

# Print the classification report
for label in report:
    if label != 'accuracy':
        print(f"Class: {label}")
        print(f"Precision: {report[label]['precision']}")
        print(f"Recall: {report[label]['recall']}")
        print(f"F1-score: {report[label]['f1-score']}\n")

Map:   0%|          | 0/597 [00:00<?, ? examples/s]

Map:   0%|          | 0/67 [00:00<?, ? examples/s]

Map:   0%|          | 0/285 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 597
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 95
  Number of trainable parameters = 134902275


  0%|          | 0/95 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 67
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.6662768031189084}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5153735153735154}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.47222222222222215}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-19
Configuration saved in ./results\checkpoint-19\config.json


{'eval_loss': 0.8846456408500671, 'eval_precision': {'precision': 0.6662768031189084}, 'eval_recall': {'recall': 0.5153735153735154}, 'eval_f1': {'f1': 0.47222222222222215}, 'eval_runtime': 4.9132, 'eval_samples_per_second': 13.637, 'eval_steps_per_second': 0.611, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-19\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 67
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.5913521176679072}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.558996558996559}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5519460851128275}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-38
Configuration saved in ./results\checkpoint-38\config.json


{'eval_loss': 0.9805386662483215, 'eval_precision': {'precision': 0.5913521176679072}, 'eval_recall': {'recall': 0.558996558996559}, 'eval_f1': {'f1': 0.5519460851128275}, 'eval_runtime': 4.5639, 'eval_samples_per_second': 14.68, 'eval_steps_per_second': 0.657, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-38\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 67
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.5841491841491842}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5275835275835276}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5294920133629811}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-57
Configuration saved in ./results\checkpoint-57\config.json


{'eval_loss': 1.03778076171875, 'eval_precision': {'precision': 0.5841491841491842}, 'eval_recall': {'recall': 0.5275835275835276}, 'eval_f1': {'f1': 0.5294920133629811}, 'eval_runtime': 4.4445, 'eval_samples_per_second': 15.075, 'eval_steps_per_second': 0.675, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-57\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 67
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.5459595959595959}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5308025308025308}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5369093908330388}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-76
Configuration saved in ./results\checkpoint-76\config.json


{'eval_loss': 0.9985182881355286, 'eval_precision': {'precision': 0.5459595959595959}, 'eval_recall': {'recall': 0.5308025308025308}, 'eval_f1': {'f1': 0.5369093908330388}, 'eval_runtime': 4.3904, 'eval_samples_per_second': 15.26, 'eval_steps_per_second': 0.683, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-76\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 67
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.5438228438228437}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.5481185481185481}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.5421310368118878}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-95
Configuration saved in ./results\checkpoint-95\config.json


{'eval_loss': 1.0821036100387573, 'eval_precision': {'precision': 0.5438228438228437}, 'eval_recall': {'recall': 0.5481185481185481}, 'eval_f1': {'f1': 0.5421310368118878}, 'eval_runtime': 4.415, 'eval_samples_per_second': 15.176, 'eval_steps_per_second': 0.68, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-95\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-19 (score: 0.8846456408500671).
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 285
  Batch size = 32


{'train_runtime': 735.8505, 'train_samples_per_second': 4.057, 'train_steps_per_second': 0.129, 'train_loss': 0.6242399115311472, 'epoch': 5.0}


  0%|          | 0/9 [00:00<?, ?it/s]

Class: 0
Precision: 0.49206349206349204
Recall: 0.7045454545454546
F1-score: 0.5794392523364486

Class: 1
Precision: 0.7156398104265402
Recall: 0.825136612021858
F1-score: 0.766497461928934

Class: 2
Precision: 0.36363636363636365
Recall: 0.06896551724137931
F1-score: 0.11594202898550723

Class: macro avg
Precision: 0.5237798887087987
Recall: 0.5328825279362306
F1-score: 0.48729291441696326

Class: weighted avg
Precision: 0.6094869756131915
Recall: 0.6526315789473685
F1-score: 0.6052245625156423



In [26]:
# Final evaluation
f1_against = report['1']['f1-score']
f1_favor = report['2']['f1-score']
tweeteval_result_feminist = (f1_against+f1_favor) / 2
print("Final F1 - Stance FEMINIST: " + str(tweeteval_result_feminist))

Final F1 - Stance FEMINIST: 0.4412197454572206


### Stance: Abortion

In [27]:
# Import dataset
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("abortion")


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model on the training dataset
trainer.train()

# Generate predictions for the test set
test_predictions = trainer.predict(test_dataset).predictions
test_predictions = np.argmax(test_predictions, axis=-1)

# Compute the classification report
report = classification_report(test_dataset['label'], test_predictions, output_dict=True)

# Print the classification report
for label in report:
    if label != 'accuracy':
        print(f"Class: {label}")
        print(f"Precision: {report[label]['precision']}")
        print(f"Recall: {report[label]['recall']}")
        print(f"F1-score: {report[label]['f1-score']}\n")

Map:   0%|          | 0/587 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 587
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 95
  Number of trainable parameters = 134902275


  0%|          | 0/95 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7359788359788361}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6851851851851851}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7034077034077034}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-19
Configuration saved in ./results\checkpoint-19\config.json


{'eval_loss': 0.6615263223648071, 'eval_precision': {'precision': 0.7359788359788361}, 'eval_recall': {'recall': 0.6851851851851851}, 'eval_f1': {'f1': 0.7034077034077034}, 'eval_runtime': 2.8533, 'eval_samples_per_second': 23.131, 'eval_steps_per_second': 1.051, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-19\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7171877760113055}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7129629629629629}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.714851054577082}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-38
Configuration saved in ./results\checkpoint-38\config.json


{'eval_loss': 0.6878447532653809, 'eval_precision': {'precision': 0.7171877760113055}, 'eval_recall': {'recall': 0.7129629629629629}, 'eval_f1': {'f1': 0.714851054577082}, 'eval_runtime': 3.2953, 'eval_samples_per_second': 20.029, 'eval_steps_per_second': 0.91, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-38\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.6828282828282828}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7037037037037037}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6902844873859366}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-57
Configuration saved in ./results\checkpoint-57\config.json


{'eval_loss': 0.6775017976760864, 'eval_precision': {'precision': 0.6828282828282828}, 'eval_recall': {'recall': 0.7037037037037037}, 'eval_f1': {'f1': 0.6902844873859366}, 'eval_runtime': 2.7448, 'eval_samples_per_second': 24.045, 'eval_steps_per_second': 1.093, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-57\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.6633986928104575}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.6759259259259259}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6683760683760683}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-76
Configuration saved in ./results\checkpoint-76\config.json


{'eval_loss': 0.6935228109359741, 'eval_precision': {'precision': 0.6633986928104575}, 'eval_recall': {'recall': 0.6759259259259259}, 'eval_f1': {'f1': 0.6683760683760683}, 'eval_runtime': 3.3371, 'eval_samples_per_second': 19.778, 'eval_steps_per_second': 0.899, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-76\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7282765737874097}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7777777777777777}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7378578410836475}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-95
Configuration saved in ./results\checkpoint-95\config.json


{'eval_loss': 0.7481910586357117, 'eval_precision': {'precision': 0.7282765737874097}, 'eval_recall': {'recall': 0.7777777777777777}, 'eval_f1': {'f1': 0.7378578410836475}, 'eval_runtime': 3.1779, 'eval_samples_per_second': 20.768, 'eval_steps_per_second': 0.944, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-95\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-19 (score: 0.6615263223648071).
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 280
  Batch size = 32


{'train_runtime': 899.8938, 'train_samples_per_second': 3.261, 'train_steps_per_second': 0.106, 'train_loss': 0.4689614145379317, 'epoch': 5.0}


  0%|          | 0/9 [00:00<?, ?it/s]

Class: 0
Precision: 0.45454545454545453
Recall: 0.7777777777777778
F1-score: 0.5737704918032788

Class: 1
Precision: 0.7722222222222223
Recall: 0.7354497354497355
F1-score: 0.7533875338753387

Class: 2
Precision: 0.4782608695652174
Recall: 0.2391304347826087
F1-score: 0.3188405797101449

Class: macro avg
Precision: 0.5683428487776313
Recall: 0.584119316003374
F1-score: 0.5486662017962541

Class: weighted avg
Precision: 0.6728733766233768
Recall: 0.6607142857142857
F1-score: 0.6531306525009043



In [28]:
# Final evaluation
f1_against = report['1']['f1-score']
f1_favor = report['2']['f1-score']
tweeteval_result_abortion = (f1_against+f1_favor) / 2
print("Final F1 - Stance ABORTION: " + str(tweeteval_result_abortion))

Final F1 - Stance ABORTION: 0.5361140567927418


### Stance: Hillary

In [29]:
# Import dataset
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("abortion")


# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Fine-tune the model on the training dataset
trainer.train()

# Generate predictions for the test set
test_predictions = trainer.predict(test_dataset).predictions
test_predictions = np.argmax(test_predictions, axis=-1)

# Compute the classification report
report = classification_report(test_dataset['label'], test_predictions, output_dict=True)

# Print the classification report
for label in report:
    if label != 'accuracy':
        print(f"Class: {label}")
        print(f"Precision: {report[label]['precision']}")
        print(f"Recall: {report[label]['recall']}")
        print(f"F1-score: {report[label]['f1-score']}\n")

Map:   0%|          | 0/587 [00:00<?, ? examples/s]

Map:   0%|          | 0/66 [00:00<?, ? examples/s]

Map:   0%|          | 0/280 [00:00<?, ? examples/s]

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 587
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 95
  Number of trainable parameters = 134902275


  0%|          | 0/95 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.6805555555555557}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7037037037037037}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6562289562289562}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-19
Configuration saved in ./results\checkpoint-19\config.json


{'eval_loss': 0.8476704955101013, 'eval_precision': {'precision': 0.6805555555555557}, 'eval_recall': {'recall': 0.7037037037037037}, 'eval_f1': {'f1': 0.6562289562289562}, 'eval_runtime': 2.7017, 'eval_samples_per_second': 24.43, 'eval_steps_per_second': 1.11, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-19\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7175925925925926}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.611111111111111}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.6331569664902998}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-38
Configuration saved in ./results\checkpoint-38\config.json


{'eval_loss': 0.8007002472877502, 'eval_precision': {'precision': 0.7175925925925926}, 'eval_recall': {'recall': 0.611111111111111}, 'eval_f1': {'f1': 0.6331569664902998}, 'eval_runtime': 2.9279, 'eval_samples_per_second': 22.542, 'eval_steps_per_second': 1.025, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-38\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7052287581699347}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7222222222222223}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7053092501368363}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-57
Configuration saved in ./results\checkpoint-57\config.json


{'eval_loss': 0.7921849489212036, 'eval_precision': {'precision': 0.7052287581699347}, 'eval_recall': {'recall': 0.7222222222222223}, 'eval_f1': {'f1': 0.7053092501368363}, 'eval_runtime': 2.7333, 'eval_samples_per_second': 24.147, 'eval_steps_per_second': 1.098, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-57\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.7357142857142858}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.75}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7392064807143514}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-76
Configuration saved in ./results\checkpoint-76\config.json


{'eval_loss': 0.8476834297180176, 'eval_precision': {'precision': 0.7357142857142858}, 'eval_recall': {'recall': 0.75}, 'eval_f1': {'f1': 0.7392064807143514}, 'eval_runtime': 3.0218, 'eval_samples_per_second': 21.841, 'eval_steps_per_second': 0.993, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-76\pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 66
  Batch size = 32


  0%|          | 0/3 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'precision': 0.734640522875817}" of type <class 'dict'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'recall': 0.7592592592592594}" of type <class 'dict'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'f1': 0.7378215654077723}" of type <class 'dict'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results\checkpoint-95
Configuration saved in ./results\checkpoint-95\config.json


{'eval_loss': 0.8488245606422424, 'eval_precision': {'precision': 0.734640522875817}, 'eval_recall': {'recall': 0.7592592592592594}, 'eval_f1': {'f1': 0.7378215654077723}, 'eval_runtime': 3.5002, 'eval_samples_per_second': 18.856, 'eval_steps_per_second': 0.857, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-95\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-57 (score: 0.7921849489212036).
The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 280
  Batch size = 32


{'train_runtime': 520.1074, 'train_samples_per_second': 5.643, 'train_steps_per_second': 0.183, 'train_loss': 0.3170971117521587, 'epoch': 5.0}


  0%|          | 0/9 [00:00<?, ?it/s]

Class: 0
Precision: 0.5076923076923077
Recall: 0.7333333333333333
F1-score: 0.6

Class: 1
Precision: 0.8544303797468354
Recall: 0.7142857142857143
F1-score: 0.7780979827089337

Class: 2
Precision: 0.5263157894736842
Recall: 0.6521739130434783
F1-score: 0.5825242718446602

Class: macro avg
Precision: 0.6294794923042758
Recall: 0.6999309868875087
F1-score: 0.6535407515178646

Class: weighted avg
Precision: 0.7448000783360543
Recall: 0.7071428571428572
F1-score: 0.7173451258458674



In [30]:
# Final evaluation
f1_against = report['1']['f1-score']
f1_favor = report['2']['f1-score']
tweeteval_result_hillary = (f1_against+f1_favor) / 2
print("Final F1 - Stance HILLARY: " + str(tweeteval_result_hillary))

Final F1 - Stance HILLARY: 0.680311127276797


The average F1 for all five datasets is:

In [44]:
f1_all = [tweeteval_result, tweeteval_result_abortion, tweeteval_result_atheism, tweeteval_result_feminist, tweeteval_result_hillary]
f1_all = np.array(f1_all)
f1_all_dic = {"Climate":tweeteval_result, "Abortion": tweeteval_result_abortion, "Atheism": tweeteval_result_atheism, "Feminist": tweeteval_result_feminist, "Hillary": tweeteval_result_hillary}
print("Global F1 mean: " + str(np.mean(f1_all)))
print(f1_all_dic)

Global F1 mean: 0.5626566269041297
{'Climate': 0.44578313253012053, 'Abortion': 0.5361140567927418, 'Atheism': 0.7098550724637681, 'Feminist': 0.4412197454572206, 'Hillary': 0.680311127276797}


Hyperparameter search: best learning rate.
### (work in progress)

In [17]:
train_dataset, val_dataset, test_dataset = raw_reader_and_tokenize("climate")



In [45]:
# # Define the range of learning rates to search over
# learning_rates = [1e-3, 1e-4, 1e-5]

# # Initialize variables to store the best learning rate and validation loss
# best_learning_rate = None
# best_val_loss = np.inf

# # Perform grid search over learning rates
# for lr in learning_rates:
#     # Define the training arguments
#     training_args = TrainingArguments(
#         output_dir='./results',
#         evaluation_strategy='epoch',
#         save_strategy='epoch',
#         learning_rate=lr,
#         per_device_train_batch_size=32,
#         per_device_eval_batch_size=32,
#         num_train_epochs=5,
#         logging_dir='./logs',
#         logging_steps=10,
#         load_best_model_at_end=True,
#         seed=1
#     )

#     # Create a Trainer instance
#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset
#     )

#     # Fine-tune the model on the training dataset
#     trainer.train()

#     # Evaluate the model on the validation dataset
#     val_loss = trainer.evaluate()['eval_loss']

#     # Update the best learning rate and validation loss if necessary
#     if val_loss < best_val_loss:
#         best_learning_rate = lr
#         best_val_loss = val_loss

# # Print the best learning rate
# print(f"Best learning rate: {best_learning_rate}")