In [4]:
# Import necessary libraries for training and testing

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
import torch
from torch import tensor
import torch.nn.functional as F
from torch import nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from typing import Tuple
from datasets import load_dataset, Dataset

# For this approach we use a pre-trained model which is based on RoBERTa
model_name = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

output_dir = "./workspace/data/"

# load the preprocessed data
data = pd.read_csv("data_cleaned.csv")
data_sample = data.sample(frac=1, random_state=42)


if model_name.__contains__("cardiffnlp"):
    data_sample["text"] = data_sample["text"].apply(lambda x: x.replace("@Alex", "@user").replace("@Sam", "@user").replace("@Taylor", "@user").replace("<url>", "http"))

In [5]:
# Load the pretrained model as well as the tokenizer

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = model.config.max_position_embeddings
print("model parameters:" + str(sum(p.numel() for p in model.parameters())))

config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

model parameters:124647939


In [6]:
# Convert the labels into one-hot encoding


labels = data_sample["label"].tolist()
labels = [0 if x == 0 else 2 for x in labels]
labels = np.eye(3)[labels]
labels

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [7]:
# Prepare the training and validation sets

train_data,val_data, train_labels, val_labels = train_test_split(data_sample["text"], labels, test_size=5000/len(data_sample), random_state=42)
dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(train_data, train_labels)])
val_dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(val_data, val_labels)])

def tokenize_function(examples):
    return tokenizer(examples['text'])

dataset = dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
dataset

Map:   0%|          | 0/2261322 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 2261322
})

In [8]:
# Transfer learning part - define the training arguments and the trainer

from transformers import TrainerCallback, TrainerState, TrainerControl,training_args

# custom callback class for accuracy evaluation during the training
class CustomCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        # Assuming the evaluation dataset has 'labels' and 'predictions' fields
        eval_dataloader = kwargs['eval_dataloader']
        model = kwargs['model']
        tokenizer = kwargs['tokenizer']
        
        model.eval()
        correct = 0
        total = 0
        
        for batch in eval_dataloader:
            inputs = batch['input_ids'].to(args.device)
            labels = batch['labels'].to(args.device)
            
            with torch.no_grad():
                outputs = model(inputs)
                predictions = torch.argmax(outputs.logits, dim=-1)
            
            labels = torch.argmax(labels, dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
        
        accuracy = correct / total
        print(f"Evaluation Accuracy: {accuracy:.4f}")


# initialization of the trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=256,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim=training_args.OptimizerNames.ADAMW_TORCH, learning_rate=1e-4,
        # optim=training_args.OptimizerNames.LION,learning_rate=1e-5,
        weight_decay=0.001,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir="outputs",
        num_train_epochs=1,
        report_to="wandb",
        # report_to="none",
        group_by_length=True,
        evaluation_strategy="steps",
        eval_steps=200,
    ),
    callbacks=[CustomCallback()],
)

# Run the training

trainer_stats = trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
200,0.2001,0.214203
400,0.2072,0.201095
600,0.2417,0.188866
800,0.2474,0.185
1000,0.1872,0.190119
1200,0.192,0.185359
1400,0.262,0.176505
1600,0.2555,0.180291
1800,0.2116,0.182714
2000,0.2094,0.176638


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Evaluation Accuracy: 0.8616
Evaluation Accuracy: 0.8714
Evaluation Accuracy: 0.8734
Evaluation Accuracy: 0.8764
Evaluation Accuracy: 0.8718
Evaluation Accuracy: 0.8842
Evaluation Accuracy: 0.8870
Evaluation Accuracy: 0.8802
Evaluation Accuracy: 0.8644
Evaluation Accuracy: 0.8868
Evaluation Accuracy: 0.8638
Evaluation Accuracy: 0.8812
Evaluation Accuracy: 0.8862
Evaluation Accuracy: 0.8890
Evaluation Accuracy: 0.8846
Evaluation Accuracy: 0.8696
Evaluation Accuracy: 0.8574
Evaluation Accuracy: 0.8868
Evaluation Accuracy: 0.8880
Evaluation Accuracy: 0.8846
Evaluation Accuracy: 0.8946
Evaluation Accuracy: 0.8826
Evaluation Accuracy: 0.8636
Evaluation Accuracy: 0.8932
Evaluation Accuracy: 0.8920
Evaluation Accuracy: 0.8894
Evaluation Accuracy: 0.8898
Evaluation Accuracy: 0.8778
Evaluation Accuracy: 0.8968
Evaluation Accuracy: 0.8962
Evaluation Accuracy: 0.8934
Evaluation Accuracy: 0.8940
Evaluation Accuracy: 0.8978
Evaluation Accuracy: 0.8966
Evaluation Accuracy: 0.8940
Evaluation Accuracy:

In [10]:
# save the model and the tokenizer

torch.save(model, "bert.pt")
torch.save(tokenizer, "bert_tokenizer.pt")

In [None]:
# Run the model on the submission test data and save the predictions

from tqdm import tqdm
test_df = pd.read_csv("test_data_cleaned.csv")
model = model.cuda()
model = model.eval()

batch_size=50
all_preds = []
with torch.no_grad():
    for i in tqdm(range(0, len(test_df), batch_size)):
        batch = test_df["text"][i:i+batch_size].tolist()
        tokens = tokenizer(batch, padding=True, return_tensors="pt")
        tokens = {k: v.cuda() for k, v in tokens.items()}
        output = model(**tokens)
        logits = output[0].cpu()
        scores = F.softmax(logits, dim=1)[:,2] # 0 -> Negative; 1 -> Neutral; 2 -> Positive
        all_preds.extend(scores.tolist())

binary_predictions = [1 if prob >= 0.5 else -1 for prob in all_preds]
submission_df = pd.DataFrame({"Prediction": binary_predictions})
submission_df["Id"] = submission_df.index + 1
submission_df = submission_df[["Id", "Prediction"]]
submission_df.to_csv("submission_bert.csv", index=False)

test_probs_df = pd.DataFrame({'Probability': all_preds})
test_probs_df.to_csv("test_probs_bert.csv", index=False)