In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import torch
import wandb
import spacy
import random
import logging
import pytextrank
from tqdm import tqdm
from torch.optim import Adam
from torch.nn import DataParallel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import load_dataset,concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification

In [2]:
# Summarize the input to how many tokens, default to 512 for BERT use.
# Since LongFormer could accept 4096 tokens, we could skip TextRank if LongFormer
seed_val = 42
ENABLE_TEXT_RANK = False
TEXT_RANK_LENGTH = 512
LEARNING_RATE = 2e-5
NUM_EPOCHS = 4

MODEL = ["T5base","roberta","Distilbert","FlanT5small","FlanT5base"] # Choose from "T5base", "roberta", "Distilbert", "FlanT5small"

NEW_BATCH_SIZE = 8 if MODEL in ["roberta", "Distilbert", "FlanT5small"] else 4 

#wandb.login(key='bf24a38a046a0448057459477a5d48fbc6eb2f6a')
#wandb.init()

In [3]:
# If there's a GPU available...
# If GPU not available, training will cost SEVERAL DAYS, not recommended running on CPU
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU insteadp(not recommended).')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [17]:
dataset = load_dataset("argilla/banking_sentiment_setfit")
test = concatenate_datasets([dataset['train'], dataset['test']])
test_text = test['text']
test[:5]

Found cached dataset parquet (/home/fangkangmi/.cache/huggingface/datasets/argilla___parquet/argilla--banking_sentiment_setfit-4a60f83f113675bf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

{'text': ['are you sending me my card?',
  "Two days ago I did a transfer to another account within the country.  It doesn't appear the transfer went through.  I have verified the account number several times.  Could you please check on this for me?",
  "Why didn't I receive the right amount of cash?",
  "Is there a reason why my virtual card won't work?",
  'Why is my balance the same after a transfer?'],
 'label': [1, 1, 0, 0, 1]}

In [5]:
# Set the seed value all over the place to make this reproducible.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [6]:
model_options = {
    "T5base": "michelecafagna26/t5-base-finetuned-sst2-sentiment",
    "roberta":"cardiffnlp/twitter-roberta-base-sentiment-latest",
    "Distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "FlanT5small": "cardiffnlp/flan-t5-small-tweet-sentiment",
    "FlanT5base": "cardiffnlp/flan-t5-base-tweet-sentiment"
}

In [7]:
def set_model_and_tokenizer(model):
    return AutoModelForSeq2SeqLM.from_pretrained(model_options.get(model)) if model in ["T5base", "FlanT5small", "FlanT5base"] \
           else AutoModelForSequenceClassification.from_pretrained(model_options.get(model))\
           ,AutoTokenizer.from_pretrained(model_options.get(model))

# Tokenize

By now the test dataset, model and tokenizer has been loaded. The next step is to tokenize the dataset.

In [24]:
def T5_get_sentiment(tensor):
    preds = model.generate(tensor)
    decoded_preds = tokenizer.batch_decode(sequences=preds, skip_special_tokens=True)
    return decoded_preds

if('T5base' in MODEL):
    model, tokenizer = set_model_and_tokenizer('T5base')
    
    
    #Add prompt
    T5_test_text = ["sentiment: " + item for item in test_text]
    #Tokenizer
    T5_tokenized_text = tokenizer(T5_test_text, max_length=128, padding=True, return_tensors="pt")['input_ids']
    
    #Create the prediction
    T5_predict = T5_get_sentiment(torch.tensor(T5_tokenized_text))
    T5_predict_digit = [1 if i == 'p' else 0 for i in T5_predict]
    
    # Print the classification report
    # 1 means positive 0 means negative
    report = classification_report(test['label'],T5_predict_digit)
    print(report)


  T5_predict = T5_get_sentiment(torch.tensor(T5_tokenized_text))


              precision    recall  f1-score   support

           0       0.38      0.94      0.54        51
           1       0.81      0.14      0.24        93

    accuracy                           0.42       144
   macro avg       0.59      0.54      0.39       144
weighted avg       0.66      0.42      0.34       144



In [29]:
class RoBERTa_Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

if('roberta' in MODEL):

    model, tokenizer = set_model_and_tokenizer('roberta')
    
    #Add prompt
    RoBERTa_test_text = [" " + item for item in test_text]
    
    #Tokenize, with max_length= 128 could increase the accuracy from 50 -> 53
    RoBERTa_tokenized_text = tokenizer(RoBERTa_test_text,max_length=128, padding=True, return_tensors="pt")
    
    #Dataset and Dataloader
    RoBERTa_dataset_test = RoBERTa_Dataset(RoBERTa_tokenized_text,test['label'])
    RoBERTa_dataloader = DataLoader(RoBERTa_dataset_test, batch_size=8, shuffle=True)

    # Iterate over the test dataset

    torch.cuda.empty_cache()
    model.eval()
    model.to('cuda')
    predictions = []
    true_labels = []

    for batch in RoBERTa_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}

        # Disable gradient calculation
        with torch.no_grad():
            # Perform inference
            outputs = model(**batch)

        predicted_values = outputs.logits

        predictions.extend(predicted_values)
        true_labels.extend(batch['labels'].tolist())
    # Convert logits to predictions
    # 1 and 2 means positive and 0 means negative
    predictions = [torch.argmax(item).item() for item in predictions]
    predictions = [1 if (i == 1 or i == 2) else i for i in predictions]
    # Generate classification report
    report = classification_report(test['label'], predictions)
    print(report)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.25      0.27      0.26        51
           1       0.58      0.56      0.57        93

    accuracy                           0.46       144
   macro avg       0.42      0.42      0.42       144
weighted avg       0.47      0.46      0.46       144



In [30]:
class Distilbert_Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

if('Distilbert' in MODEL):
    model, tokenizer = set_model_and_tokenizer('Distilbert')
    
    #No prompt for this 
    Distilbert_tokenized_text = tokenizer(RoBERTa_test_text,max_length=128, padding=True,  return_tensors="pt")
    
    #Dataset and Dataloader
    Distilbert_dataset_test = Distilbert_Dataset(Distilbert_tokenized_text,test['label'])
    Distilbert_dataloader = DataLoader(Distilbert_dataset_test, batch_size=8, shuffle=True)

    # Iterate over the test dataset

    torch.cuda.empty_cache()
    model.eval()
    model.to('cuda')
    predictions = []
    true_label = []

    for batch in Distilbert_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}

        # Disable gradient calculation
        with torch.no_grad():
            # Perform inference
            outputs = model(**batch)

        predicted_values = outputs.logits
        predictions.extend(predicted_values)
        true_label.extend(batch['labels'].tolist())
    # Convert logits to predictions,  
    # 1 means positive and 0 means negative
    predictions = [torch.argmax(item).item() for item in predictions]
    # Generate classification report
    report = classification_report(test['label'], predictions)
    print(report)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


              precision    recall  f1-score   support

           0       0.36      0.96      0.52        51
           1       0.71      0.05      0.10        93

    accuracy                           0.38       144
   macro avg       0.54      0.51      0.31       144
weighted avg       0.59      0.38      0.25       144



In [None]:
def T5_get_sentiment(tensor):
    preds = model.generate(tensor)
    decoded_preds = tokenizer.batch_decode(sequences=preds, skip_special_tokens=True)
    return decoded_preds

if('T5base' in MODEL):
    model, tokenizer = set_model_and_tokenizer('T5base')
    
    #Add prompt
    T5_test_text = ["sentiment: " + item for item in test_text]
    
    #Tokenizer
    T5_tokenized_text = tokenizer(T5_test_text, max_length=128, padding=True, return_tensors="pt")['input_ids']
    
    #Create the prediction
    T5_predict = T5_get_sentiment(torch.tensor(T5_tokenized_text))
    T5_predict_digit = [1 if i == 'p' else 0 for i in T5_predict]
    
    # Print the classification report
    # 1 means positive 0 means negative
    report = classification_report(test['label'],T5_predict_digit)
    print(report)


In [33]:
def Flan_T5_get_sentiment(tensor):
    preds = model.generate(tensor)
    decoded_preds = tokenizer.batch_decode(sequences=preds, skip_special_tokens=True)
    return decoded_preds

if('FlanT5small' in MODEL):
    model, tokenizer = set_model_and_tokenizer('FlanT5small')
    
    #Add prompt
    Flan_T5_small_test_text = ["context: " + item for item in test_text]
    #Tokenizer
    Flan_T5_small_tokenized_test = tokenizer(Flan_T5_small_test_text, \
                                             max_length=128, padding=True, return_tensors="pt")['input_ids']
    
    
    Flan_T5_small_predict = Flan_T5_get_sentiment(torch.tensor(Flan_T5_small_tokenized_test))
    Flan_T5_small_predict_digit = [0 if i == 'negative' else 1 for i in Flan_T5_small_predict]
    # Print the classification report
    # 1 means positive 0 means negative
    report = classification_report(test['label'],Flan_T5_small_predict_digit)
    print(report)


  Flan_T5_small_predict = Flan_T5_get_sentiment(torch.tensor(Flan_T5_small_tokenized_test))


              precision    recall  f1-score   support

           0       0.67      0.04      0.07        51
           1       0.65      0.99      0.79        93

    accuracy                           0.65       144
   macro avg       0.66      0.51      0.43       144
weighted avg       0.66      0.65      0.53       144



In [40]:
Flan_T5_small_predict

['negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative',
 'negative or neutral',
 'positive',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative',
 'negative or neutral',
 'negative or neutral',
 'positive',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'positive',
 'negative or neutral',
 'negative or neutral',
 'negative or neutral',
 'positive',
 'negative or neutral',
 'positive

In [38]:
def Flan_T5_get_sentiment(tensor):
    preds = model.generate(tensor)
    decoded_preds = tokenizer.batch_decode(sequences=preds, skip_special_tokens=True)
    return decoded_preds

if('FlanT5base' in MODEL):
    model, tokenizer = set_model_and_tokenizer('FlanT5base')
    
    #Add prompt
    Flan_T5_base_test_text = ["context: " + item for item in test_text]
    #Tokenizer
    Flan_T5_base_tokenized_test = tokenizer(Flan_T5_base_test_text, \
                                             max_length=128, padding=True, return_tensors="pt")['input_ids']
    
    
    Flan_T5_base_predict = Flan_T5_get_sentiment(torch.tensor(Flan_T5_base_tokenized_test))
    Flan_T5_base_predict_digit = [0 if i == 'negative' else 1 for i in Flan_T5_base_predict]
    # Print the classification report
    # 1 means positive 0 means negative
    report = classification_report(test['label'],Flan_T5_base_predict_digit)
    print(report)


  Flan_T5_base_predict = Flan_T5_get_sentiment(torch.tensor(Flan_T5_base_tokenized_test))


              precision    recall  f1-score   support

           0       0.60      0.65      0.62        51
           1       0.80      0.76      0.78        93

    accuracy                           0.72       144
   macro avg       0.70      0.71      0.70       144
weighted avg       0.73      0.72      0.72       144



In [39]:
Flan_T5_base_predict

['no',
 'negative or neutral',
 'I was not able to pay my bills',
 'negative',
 'i have a different balance',
 'negative',
 'negative',
 'a bank',
 'negative',
 'Press the "Reset Passcode" button on your computer.',
 'negative',
 '$600',
 'context: I want to know the limits of the disposable cards.',
 'a bank account',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'no',
 'a restaurant',
 'negative',
 'no',
 'negative',
 'negative',
 'negative',
 '',
 'negative',
 'click on the "Get a Virtual Card" button and then click on the "Get ',
 'negative',
 'yes',
 'a bank charge',
 'if you are unsure of what to do, contact the company directly.',
 'no',
 'negative',
 'negative or neutral',
 'The exchange rate is the amount of money that is being spent on the exchange rate.',
 'negative or neutral',
 'a child',
 'negative',
 'negative',
 'negative',
 'a website',
 'a credit card',
 'franc',
 'a fee',
 'negative',
 'a digit code',
 'yes',
 'a few days',
 'negative',
 '',
 'neg

In [49]:
#Below are using pipeline that provided by 
#https://huggingface.co/cardiffnlp/flan-t5-base-tweet-sentiment?text=context%3A+If+I+make+a+game+as+a+%23windows10+Universal+App.+Will+%23xboxone+owners+be+able+to+download+and+play+it+in+November%3F+%40majornelson+%40Microsoft%2C+target%3A+%40microsoft
from transformers import pipeline

pipe = pipeline('text2text-generation', model="cardiffnlp/flan-t5-base-tweet-sentiment")
Flan_T5_base_predict_digit_pipeline = [0 if i == 'negative' else 1 for i in Flan_T5_base_output]
# Print the classification report
# 1 means positive 0 means negative
report = classification_report(test['label'],Flan_T5_base_predict_digit)
print(report)

              precision    recall  f1-score   support

           0       0.60      0.65      0.62        51
           1       0.80      0.76      0.78        93

    accuracy                           0.72       144
   macro avg       0.70      0.71      0.70       144
weighted avg       0.73      0.72      0.72       144

