In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import wandb
import spacy
import random
import logging
import pytextrank
from tqdm import tqdm
from torch.optim import Adam
from torch.nn import DataParallel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import load_dataset,concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

In [12]:
# Run configuration: everything a reader might want to tune lives in this cell.
# TEXT_RANK_LENGTH is the number of tokens the input is summarized to; it
# defaults to 512 for BERT use. Longformer accepts 4096 tokens, so TextRank
# could be skipped when using it.
seed_val = 42
ENABLE_TEXT_RANK = False
TEXT_RANK_LENGTH = 512
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4

MODEL = "T5base" # Choose from "T5base", "roberta", "Distilbert", "FlanT5small"

# Batch size is 8 for the listed models and 4 for every other choice.
NEW_BATCH_SIZE = 8 if MODEL in ["roberta", "Distilbert", "FlanT5small"] else 4

# Never commit a Weights & Biases API key to the notebook. Read it from the
# WANDB_API_KEY environment variable instead; when it is unset, key=None lets
# wandb fall back to its own credential lookup.
# NOTE(review): a key was previously hard-coded here and should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init()



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670070933317523, max=1.0…

wandb: Network error (ConnectionError), entering retry loop.


In [3]:
# Download (or reuse the cached copy of) the banking sentiment dataset and
# merge its two splits into a single dataset named `test`.
dataset = load_dataset("argilla/banking_sentiment_setfit")
splits_to_merge = [dataset[split_name] for split_name in ('train', 'test')]
test = concatenate_datasets(splits_to_merge)

# Peek at the first five examples.
test[:5]

Found cached dataset parquet (/home/fangkangmi/.cache/huggingface/datasets/argilla___parquet/argilla--banking_sentiment_setfit-4a60f83f113675bf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

{'text': ['are you sending me my card?',
  "Two days ago I did a transfer to another account within the country.  It doesn't appear the transfer went through.  I have verified the account number several times.  Could you please check on this for me?",
  "Why didn't I receive the right amount of cash?",
  "Is there a reason why my virtual card won't work?",
  'Why is my balance the same after a transfer?'],
 'label': [1, 1, 0, 0, 1]}

In [4]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
# If a GPU is not available, training will cost SEVERAL DAYS; running on CPU
# is not recommended.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # Fall back to the CPU and warn the reader.
    print('No GPU available, using the CPU instead (not recommended).')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [5]:
# Seed every random number generator used in this notebook (Python's random,
# NumPy, and PyTorch on CPU and all CUDA devices) so runs are reproducible.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed_val)

In [6]:

# Hugging Face checkpoint name for each supported MODEL choice.
model_options = {
    "T5base": "michelecafagna26/t5-base-finetuned-sst2-sentiment",
    "roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "Distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "FlanT5small": "cardiffnlp/flan-t5-small-tweet-sentiment",
}

# One tokenizer per choice, loaded eagerly from the matching checkpoint.
tokenizer_options = {
    choice: AutoTokenizer.from_pretrained(checkpoint)
    for choice, checkpoint in model_options.items()
}
    


In [7]:
# Load the tokenizer and model for the configured MODEL.

# Fail early with a clear message rather than handing None to from_pretrained
# when MODEL is not one of the known choices.
if MODEL not in model_options:
    raise ValueError(
        f"Unknown MODEL {MODEL!r}; choose from {sorted(model_options)}"
    )
checkpoint = model_options[MODEL]

# The two T5 choices are loaded as seq2seq models; every other choice is
# loaded as a sequence-classification model.
model_class = (
    AutoModelForSeq2SeqLM
    if MODEL in ["T5base", "FlanT5small"]
    else AutoModelForSequenceClassification
)

# Suppress transformers warnings while loading
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

print("Loading models...")
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint)
finally:
    # Restore default logging level for transformers, even if loading failed.
    transformers_logger.setLevel(logging.WARNING)

print('done')

Loading models...
done


# Tokenize

By now the test dataset, model and tokenizer have been loaded. The next step is to tokenize the dataset.

In [8]:
def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch with truncation and padding enabled."""
    return tokenizer(batch['text'], truncation=True, padding=True)


# Apply the tokenizer batch-wise; the result replaces `test`.
test = test.map(_tokenize_batch, batched=True)

Loading cached processed dataset at /home/fangkangmi/.cache/huggingface/datasets/argilla___parquet/argilla--banking_sentiment_setfit-4a60f83f113675bf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-90158929d9e14cb8.arrow


In [9]:
# Show the first five tokenized rows as a DataFrame for a quick sanity check.
tokenized_preview = pd.DataFrame(test)
tokenized_preview.head(5)

Unnamed: 0,text,label,input_ids,attention_mask
0,are you sending me my card?,1,"[33, 25, 5657, 140, 82, 895, 58, 1, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,Two days ago I did a transfer to another accou...,1,"[2759, 477, 977, 27, 410, 3, 9, 2025, 12, 430,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,Why didn't I receive the right amount of cash?,0,"[1615, 737, 31, 17, 27, 911, 8, 269, 866, 13, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ..."
3,Is there a reason why my virtual card won't work?,0,"[27, 7, 132, 3, 9, 1053, 572, 82, 4291, 895, 7...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,Why is my balance the same after a transfer?,1,"[1615, 19, 82, 2109, 8, 337, 227, 3, 9, 2025, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ..."


Trainer

In [10]:
# Build the Hugging Face Trainer for the selected model.

# Release cached GPU memory, then move the model to the device chosen in the
# device-selection cell instead of hard-coding 'cuda', so this cell also runs
# on a CPU-only machine.
torch.cuda.empty_cache()
model = model.to(device)

training_args = TrainingArguments(
    output_dir='./results',  # Directory to save checkpoints and final model
    num_train_epochs=NUM_EPOCHS,  # Number of training epochs
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=NEW_BATCH_SIZE,
    save_strategy='epoch',  # Save a checkpoint at the end of every epoch
    save_total_limit=1,  # Limit on the number of checkpoints kept
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=200,
    evaluation_strategy='no',  # No evaluation during training
)

# NOTE(review): `train_dataset` is not defined in any cell visible above (the
# tokenized dataset there is named `test`); confirm it is created before this
# cell runs, otherwise a fresh Restart & Run All raises NameError here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)