# TweetEval - Avaliação Final (NLP)

### Ellen Shen

In [1]:
import pandas as pd

def load_data(text_path, labels_path):
    with open(text_path, 'r') as f:
        texts = f.readlines()
    with open(labels_path, 'r') as f:
        labels = [int(label.strip()) for label in f.readlines()]
    return pd.DataFrame({'text': texts, 'label': labels})

In [None]:
train_text_path = "../dataset/train_text.txt"
train_labels_path = "../dataset/train_labels.txt"
val_text_path = "../dataset/val_text.txt"
val_labels_path = "../dataset/val_labels.txt"

train_data = load_data(train_text_path, train_labels_path)
val_data = load_data(val_text_path, val_labels_path)

print(train_data.head())


                                                text  label
0  "QT @user In the original draft of the 7th boo...      2
1  "Ben Smith / Smith (concussion) remains out of...      1
2  Sorry bout the stream last night I crashed out...      1
3  Chase Headley's RBI double in the 8th inning o...      1
4  @user Alciato: Bee will invest 150 million in ...      2


In [None]:
test_text_path = "../dataset/test_text.txt"
test_labels_path = "../dataset/test_labels.txt"
test_data = load_data(test_text_path, test_labels_path)

print(test_data.head())

                                                text  label
0  @user @user what do these '1/2 naked pics' hav...      1
1  OH: “I had a blue penis while I was this” [pla...      1
2  @user @user That's coming, but I think the vic...      1
3  I think I may be finally in with the in crowd ...      2
4  @user Wow,first Hugo Chavez and now Fidel Cast...      0


## BERT Fine-Tuned Model

In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_data['text']), truncation=True, padding=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

train_labels = torch.tensor(train_data['label'].tolist())
val_labels = torch.tensor(val_data['label'].tolist())

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                      # Reduce epochs
    per_device_train_batch_size=8,           # Reduce batch size
    per_device_eval_batch_size=16,           # Evaluation batch size can stay higher
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=100,                       # Log less frequently
    learning_rate=5e-5,
    fp16=True,                               # Use mixed precision for faster GPU training
)


from torch.utils.data import Dataset

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = TweetDataset(
    {key: val[:1000] for key, val in train_encodings.items()},  # First 1000 samples
    train_labels[:1000]
)

val_dataset = TweetDataset(
    {key: val[:500] for key, val in val_encodings.items()},  # First 500 samples
    val_labels[:500]
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss
1,1.0013,0.906303


  item['labels'] = torch.tensor(self.labels[idx])


TrainOutput(global_step=125, training_loss=0.9785629577636719, metrics={'train_runtime': 1540.6444, 'train_samples_per_second': 0.649, 'train_steps_per_second': 0.081, 'total_flos': 59611633704000.0, 'train_loss': 0.9785629577636719, 'epoch': 1.0})

In [11]:
from sklearn.metrics import classification_report

test_text_path = "test_text.txt"
test_labels_path = "test_labels.txt"
test_data = load_data(test_text_path, test_labels_path)

test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=128)
test_labels = torch.tensor(test_data['label'].tolist())

preds = trainer.predict(TweetDataset(test_encodings, test_labels))
predictions = preds.predictions.argmax(axis=1)

print(classification_report(test_labels, predictions, target_names=['Negative', 'Neutral', 'Positive']))


  item['labels'] = torch.tensor(self.labels[idx])


              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      3972
     Neutral       0.50      0.70      0.58      5937
    Positive       0.44      0.75      0.56      2375

    accuracy                           0.48     12284
   macro avg       0.31      0.48      0.38     12284
weighted avg       0.33      0.48      0.39     12284



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## RoBERTa-Base Model

In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.utils import resample
import pandas as pd

train_encodings = tokenizer(list(train_data['text']), truncation=True, padding=True, max_length=128)
train_labels = train_data['label'].tolist()
val_encodings = tokenizer(list(val_data['text']), truncation=True, padding=True, max_length=128)
val_labels = val_data['label'].tolist()


In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                     # Increase epochs
    per_device_train_batch_size=16,         # Adjust batch size
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,                     # Lower learning rate
    fp16=True,                              # Enable mixed precision for GPUs
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True
)




In [14]:
from transformers import Trainer
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = TweetDataset(
    {key: val[:5000] for key, val in train_encodings.items()},
    train_labels[:5000]
)

val_dataset = TweetDataset(
    {key: val[:2000] for key, val in val_encodings.items()},
    val_labels[:2000]
)

# train_dataset = TweetDataset(train_encodings, train_labels)
# val_dataset = TweetDataset(val_encodings, val_labels)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,8.34088


TrainOutput(global_step=313, training_loss=0.004183435211547267, metrics={'train_runtime': 5085.9769, 'train_samples_per_second': 0.983, 'train_steps_per_second': 0.062, 'total_flos': 244099362150000.0, 'train_loss': 0.004183435211547267, 'epoch': 1.0})

In [16]:
from sklearn.metrics import recall_score, classification_report
import torch

test_dataset = TweetDataset(test_encodings, test_labels)
preds = trainer.predict(test_dataset)
predictions = preds.predictions.argmax(axis=1)

macro_recall = recall_score(test_labels, predictions, average='macro')
print(f"Macro-Averaged Recall: {macro_recall:.2f}")

print(classification_report(test_labels, predictions, target_names=['Negative', 'Neutral', 'Positive']))


Macro-Averaged Recall: 0.33
              precision    recall  f1-score   support

    Negative       0.32      1.00      0.49      3972
     Neutral       0.00      0.00      0.00      5937
    Positive       0.00      0.00      0.00      2375

    accuracy                           0.32     12284
   macro avg       0.11      0.33      0.16     12284
weighted avg       0.10      0.32      0.16     12284



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Smaller Sample

In [27]:
# Número desejado de exemplos por classe
samples_per_class = 500

# Reduzindo o conjunto de treinamento
train_data_reduced = train_data.groupby("label").apply(lambda x: x.sample(n=samples_per_class, random_state=42)).reset_index(drop=True)

# Reduzindo o conjunto de validação
val_data_reduced = val_data.groupby("label").apply(lambda x: x.sample(n=samples_per_class // 2, random_state=42)).reset_index(drop=True)


  train_data_reduced = train_data.groupby("label").apply(lambda x: x.sample(n=samples_per_class, random_state=42)).reset_index(drop=True)
  val_data_reduced = val_data.groupby("label").apply(lambda x: x.sample(n=samples_per_class // 2, random_state=42)).reset_index(drop=True)


In [28]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
train_encodings = tokenizer(list(train_data_reduced['text']), truncation=True, padding=True, max_length=128)
train_labels = train_data_reduced['label'].tolist()
val_encodings = tokenizer(list(val_data_reduced['text']), truncation=True, padding=True, max_length=128)
val_labels = val_data_reduced['label'].tolist()

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,                     # Increase epochs
    per_device_train_batch_size=16,         # Adjust batch size
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,                     # Lower learning rate
    fp16=True,                              # Enable mixed precision for GPUs
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True
)




In [32]:
from transformers import Trainer
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.893416


Epoch,Training Loss,Validation Loss
1,No log,0.893416
2,No log,0.735309
3,No log,0.771247


TrainOutput(global_step=282, training_loss=0.6721485219103225, metrics={'train_runtime': 3448.9548, 'train_samples_per_second': 1.305, 'train_steps_per_second': 0.082, 'total_flos': 143376256926000.0, 'train_loss': 0.6721485219103225, 'epoch': 3.0})

In [33]:
from sklearn.metrics import recall_score, classification_report
import torch


test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=128)
test_labels = test_data['label'].tolist()

test_dataset = TweetDataset(test_encodings, test_labels)
preds = trainer.predict(test_dataset)
predictions = preds.predictions.argmax(axis=1)

macro_recall = recall_score(test_labels, predictions, average='macro')
print(f"Macro-Averaged Recall: {macro_recall:.2f}")

print(classification_report(test_labels, predictions, target_names=['Negative', 'Neutral', 'Positive']))


Macro-Averaged Recall: 0.71
              precision    recall  f1-score   support

    Negative       0.61      0.86      0.72      3972
     Neutral       0.78      0.47      0.59      5937
    Positive       0.60      0.78      0.68      2375

    accuracy                           0.66     12284
   macro avg       0.66      0.71      0.66     12284
weighted avg       0.69      0.66      0.65     12284

