In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import wandb
import spacy
import random
import logging
import pytextrank
from tqdm import tqdm
from torch.optim import Adam
from torch.nn import DataParallel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import load_dataset,concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Run configuration.
# TEXT_RANK_LENGTH is the number of tokens the input is summarized down to
# (512 by default, for BERT use). Longformer accepts 4096 tokens, so TextRank
# can be skipped (ENABLE_TEXT_RANK = False) when such a model is used.
seed_val = 42
ENABLE_TEXT_RANK = False
TEXT_RANK_LENGTH = 512
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4

MODEL = "roberta" # Choose from "T5base", "roberta", "Distilbert", "FlanT5small"

# The larger T5base checkpoint gets a smaller batch size than the other choices.
NEW_BATCH_SIZE = 8 if MODEL in ["roberta", "Distilbert", "FlanT5small"] else 4

# Never commit API keys to the notebook (the key that used to be hardcoded here
# should be revoked). To enable Weights & Biases logging, export WANDB_API_KEY
# in the environment and uncomment:
#wandb.login(key=os.environ["WANDB_API_KEY"])
#wandb.init()

In [4]:
# Load the banking sentiment dataset and merge its train and test splits into
# a single evaluation set.
dataset = load_dataset("argilla/banking_sentiment_setfit")
test = concatenate_datasets([dataset[split] for split in ("train", "test")])

# Peek at the first five examples.
test[:5]

Found cached dataset parquet (/home/fangkangmi/.cache/huggingface/datasets/argilla___parquet/argilla--banking_sentiment_setfit-4a60f83f113675bf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

{'text': ['are you sending me my card?',
  "Two days ago I did a transfer to another account within the country.  It doesn't appear the transfer went through.  I have verified the account number several times.  Could you please check on this for me?",
  "Why didn't I receive the right amount of cash?",
  "Is there a reason why my virtual card won't work?",
  'Why is my balance the same after a transfer?'],
 'label': [1, 1, 0, 0, 1]}

In [5]:
# Pick the compute device: a CUDA GPU when one is available, otherwise the CPU.
# Without a GPU, training will cost SEVERAL DAYS; running on CPU is not recommended.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No GPU: fall back to the CPU.
    print('No GPU available, using the CPU instead (not recommended).')
    device = torch.device("cpu")

No GPU available, using the CPU insteadp(not recommended).


In [6]:
# Seed every random number generator in use (Python, NumPy, PyTorch and all
# CUDA devices) with the same value to make this reproducible.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(seed_val)

In [7]:

# Hugging Face checkpoint name for each supported MODEL choice.
model_options = {
    "T5base": "michelecafagna26/t5-base-finetuned-sst2-sentiment",
    "roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest",
    "Distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "FlanT5small": "cardiffnlp/flan-t5-small-tweet-sentiment",
}

# One tokenizer per MODEL choice, loaded from that choice's checkpoint.
# NOTE(review): every tokenizer is loaded up front here; confirm whether only
# the one for the selected MODEL is actually needed.
tokenizer_options = {
    choice: AutoTokenizer.from_pretrained(checkpoint)
    for choice, checkpoint in model_options.items()
}
    


In [8]:
# Load the tokenizer and model for the configured MODEL.

# Fail early with a readable message instead of passing None to from_pretrained.
if MODEL not in model_options:
    raise ValueError(f"Unknown MODEL {MODEL!r}; choose one of {sorted(model_options)}")
checkpoint = model_options[MODEL]

# The T5-family choices are loaded as seq2seq models, the rest as sequence classifiers.
model_class = AutoModelForSeq2SeqLM if MODEL in ["T5base", "FlanT5small"] \
else AutoModelForSequenceClassification

# Suppress transformers warnings while loading
logging.getLogger("transformers").setLevel(logging.ERROR)

print("Loading models...")
try:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_class.from_pretrained(checkpoint)
finally:
    # Restore default logging level for transformers, even if loading failed
    logging.getLogger("transformers").setLevel(logging.WARNING)

print('done')

Loading models...
done


# Tokenize

By now the test dataset, model, and tokenizer have been loaded. The next step is to tokenize the dataset.

In [9]:
# Rebuild the evaluation set from the train and test splits.
# NOTE(review): this repeats the load done in the earlier data-loading cell.
dataset = load_dataset("argilla/banking_sentiment_setfit")
test = concatenate_datasets([dataset[split] for split in ("train", "test")])

Found cached dataset parquet (/home/fangkangmi/.cache/huggingface/datasets/argilla___parquet/argilla--banking_sentiment_setfit-4a60f83f113675bf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
def T5_tokenization(dataset):
    """Tokenize a batch of texts for the T5 sentiment model.

    Every entry of ``dataset['text']`` is prefixed with ``'sentiment '`` and
    the resulting prompts are encoded with the module-level ``tokenizer``.

    Args:
        dataset: Mapping-like batch whose ``'text'`` entry is an iterable of
            strings. It is left unmodified.

    Returns:
        The tokenizer output as PyTorch tensors, padded to the longest prompt
        in the batch and truncated to at most 128 tokens.
    """
    # Build the prompts in a new list rather than overwriting the caller's batch.
    prompts = ['sentiment ' + text for text in dataset['text']]
    # truncation=True is needed for max_length to actually cap the sequence length.
    return tokenizer(prompts, max_length=128, padding=True, truncation=True, return_tensors="pt")

def T5_get_sentiment(tensor):
    """Generate sentiment strings for a batch of tokenized inputs.

    Args:
        tensor: Input passed straight to the module-level ``model.generate``.

    Returns:
        The generated sequences decoded by the module-level ``tokenizer``
        with special tokens removed.
    """
    generated_ids = model.generate(tensor)
    return tokenizer.batch_decode(sequences=generated_ids, skip_special_tokens=True)
def RoBERTa_tokenization(dataset):
    """Tokenize texts with the module-level ``tokenizer``.

    Args:
        dataset: The text input handed to the tokenizer (the evaluation cell
            passes the list of test sentences).

    Returns:
        The tokenizer output as PyTorch tensors, padded to the longest
        sequence and truncated to at most 128 tokens.
    """
    # truncation=True is needed for max_length to actually cap the sequence length.
    return tokenizer(dataset, max_length=128, padding=True, truncation=True, return_tensors="pt")
class RoBERTa_Dataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            (a tensor or a list); ``encodings[key][idx]`` is example ``idx``.
        labels: Indexable of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return a fresh tensor holding ``value``.

        Existing tensors are copied with ``clone().detach()``; calling
        ``torch.tensor`` on a tensor would emit a copy-construct warning.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)





In [11]:
if MODEL == 'T5base':
    # Tokenize all test sentences in one call (the helpers are named
    # T5_tokenization / T5_get_sentiment, with an underscore) and generate
    # a sentiment string for each of them.
    T5test = T5_tokenization({'text': list(test['text'])})['input_ids']
    T5predict = T5_get_sentiment(T5test)
    # A decoded output of 'p' counts as 1; anything else counts as 0.
    T5predict_digit = [1 if i == 'p' else 0 for i in T5predict]
    # Print the classification report
    # 1 means positive 0 means negative
    # classification_report expects the ground truth first, then the predictions.
    report = classification_report(list(test['label']), T5predict_digit)
    print(report)


In [12]:
if MODEL == 'roberta':
    # Tokenize the whole evaluation set and wrap it for batched inference.
    tokenized_RoBERTa_test = RoBERTa_tokenization(list(test['text']))
    RoBERTa_dataset_test = RoBERTa_Dataset(tokenized_RoBERTa_test, list(test['label']))
    # Evaluation gains nothing from shuffling; keep the dataset order.
    RoBERTa_dataloader = DataLoader(RoBERTa_dataset_test, batch_size=8, shuffle=False)

    # Run on the device selected earlier instead of assuming a GPU is present.
    model.eval()
    model.to(device)
    predictions = []
    true_labels = []
    for batch in RoBERTa_dataloader:
        # Labels are only needed for scoring, so keep them out of the forward
        # pass and collect them in the same order as the predictions.
        true_labels.extend(batch.pop('labels').tolist())
        batch = {k: v.to(device) for k, v in batch.items()}

        # Disable gradient calculation
        with torch.no_grad():
            # Perform inference
            outputs = model(**batch)

        # Convert logits to predicted class ids
        predictions.extend(outputs.logits.argmax(dim=-1).tolist())

    # Generate classification report from labels and predictions gathered in
    # the same order.
    # NOTE(review): argmax indices are compared to the dataset label ids as-is;
    # confirm the checkpoint's class ids line up with the dataset's labels.
    report = classification_report(true_labels, predictions)
    print(report)




In [None]:
# `predictions` is only created by the RoBERTa evaluation branch, so guard the
# print to avoid a NameError when another MODEL is selected.
if MODEL == 'roberta':
    print(predictions)