# Training the sentiment classifier on data generated by a T5 model with encoder-decoder prompts

In [20]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import string


import numpy as np
import statistics as st
import glob
import sys
import io
import re

import zipfile
import tarfile
import logging
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
from transformers import DistilBertTokenizerFast
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

In [2]:
#!pip install transformers

In [3]:
# Set one visible CUDA device if multiple GPUs are available
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=0


In [4]:
# Fix the seed to be able to get the same randomness across runs and hence reproducible outcomes
def get_device_and_set_seed(seed):
    """Seed every random number generator in use and pick the compute device.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and asks cuDNN for deterministic kernels so that
    repeated runs produce the same results.

    Args:
        seed: Integer seed applied to all random number generators.

    Returns:
        ``torch.device("cuda:0")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    # Local import: keeps this cell runnable on its own.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may select different kernels between runs; disable it for reproducibility.
    torch.backends.cudnn.benchmark = False

    use_cuda = torch.cuda.is_available()
    return torch.device("cuda:0" if use_cuda else "cpu")
    
# Single seed reused for RNG seeding, data shuffling and the Trainer configuration.
SEED = 123
# Seed the RNGs once and keep the selected device for the dataset and model cells.
device = get_device_and_set_seed(SEED)

In [6]:
# Positive-class splits. Each CSV is expected to provide "reviewText" and "label" columns.
df_pos_train = pd.read_csv("../data/T5_encoder_decoder_generated_25k_Pos.csv", encoding='utf-8')
# FIX: this previously loaded the *Neg* validation file, so the validation set contained
# the negative file twice and no rows from the positive file.
# NOTE(review): file name inferred from the naming pattern of the other splits — confirm it exists.
df_pos_val = pd.read_csv("../data/T5_encoder_decoder_generated_5k_Pos_val.csv", encoding='utf-8')
df_pos_test = pd.read_csv("../data/1_data_remove_duplicates_5_pos_sampled_test.csv", encoding='utf-8')

# Negative-class splits.
df_neg_train = pd.read_csv("../data/T5_encoder_decoder_generated_25k_Neg.csv", encoding='utf-8')
df_neg_val = pd.read_csv("../data/T5_encoder_decoder_generated_5k_Neg_val.csv", encoding='utf-8')
df_neg_test = pd.read_csv("../data/1_data_remove_duplicates_5_neg_sampled_test.csv", encoding='utf-8')

In [7]:
# Build each split as "positives followed by negatives", keeping texts and labels aligned.
# Train and validation are shuffled jointly (same seed, same permutation for both arrays);
# the test split stays as plain, unshuffled Python lists.
train_texts, train_labels = shuffle(
    np.array(df_pos_train["reviewText"].tolist() + df_neg_train["reviewText"].tolist()),
    np.array(df_pos_train["label"].tolist() + df_neg_train["label"].tolist()),
    random_state=SEED,
)

val_texts, val_labels = shuffle(
    np.array(df_pos_val["reviewText"].tolist() + df_neg_val["reviewText"].tolist()),
    np.array(df_pos_val["label"].tolist() + df_neg_val["label"].tolist()),
    random_state=SEED,
)

test_texts = df_pos_test["reviewText"].tolist() + df_neg_test["reviewText"].tolist()
test_labels = df_pos_test["label"].tolist() + df_neg_test["label"].tolist()

# The source frames are no longer needed once the text/label sequences exist.
del df_pos_train, df_pos_val, df_pos_test
del df_neg_train, df_neg_val, df_neg_test


In [7]:
# Tokenizer matching the SST-2 DistilBERT checkpoint used for the classifier.
# truncation_side='left' makes truncation drop tokens from the start of over-long inputs,
# so the end of each text is what gets kept.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', truncation_side='left')


In [8]:
class AmazonDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per access.

    Tokenization is done lazily in ``__getitem__``; every returned tensor is
    moved to the module-level ``device``.
    """

    def __init__(self, texts, tokenizer, labels):
        """Keep references to the texts, the tokenizer and the aligned labels."""
        self.texts, self.tokenizer, self.labels = texts, tokenizer, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus a ``labels`` tensor."""
        # Truncate and pad to the tokenizer's maximum length.
        encoded = self.tokenizer(self.texts[idx], truncation=True, padding="max_length")
        sample = {}
        for name, values in encoded.items():
            sample[name] = torch.tensor(values).to(device)
        sample['labels'] = torch.tensor(self.labels[idx]).to(device)
        return sample

# All three datasets are built here: the training cells further down read
# `train_dataset` and `val_dataset`, so leaving them commented out raises a
# NameError when the notebook is run top to bottom on a fresh kernel.
train_dataset = AmazonDataset(train_texts, tokenizer, train_labels)
val_dataset = AmazonDataset(val_texts, tokenizer, val_labels)
test_dataset = AmazonDataset(test_texts, tokenizer, test_labels)

# Training

In [10]:

def compute_metric(pred):
    """Compute accuracy and weighted F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        Dict with the keys ``'Validation accuracy'`` and ``'Validation f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'Validation accuracy': accuracy_score(y_true, y_pred),
        'Validation f1': f1_score(y_true, y_pred, average='weighted'),
    }


In [11]:
# Sanity check: number of training examples (positive + negative rows combined).
len(train_dataset)

50000

In [12]:

# Per-device batch size, shared by training and evaluation.
batch_size = 32
# Number of full batches per epoch (floor division); used to derive the logging interval.
step_len = len(train_dataset)//batch_size

# Fine-tuning configuration: evaluate and save a checkpoint at the end of every epoch,
# and log roughly ten times per epoch.
training_args = TrainingArguments(
    output_dir='./results_db_T5_mod',          # output directory
    learning_rate = 1e-5,
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    warmup_steps=600,                # number of warmup steps for learning rate scheduler
    logging_dir='./logs_db_T5_mod',            # directory for storing logs
    logging_steps=step_len//10,  # about 10 log entries per epoch
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` — confirm against the installed version.
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    seed=SEED,  # same seed as the data shuffling
 
)

# Start from the SST-2 fine-tuned DistilBERT checkpoint and continue training on the loaded data.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english").to(device)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    tokenizer = tokenizer,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metric       # accuracy / weighted F1 on the evaluation set
)


# Overrides a private Trainer attribute to force single-GPU behaviour.
# NOTE(review): presumably meant to avoid multi-GPU DataParallel wrapping; likely redundant
# with CUDA_VISIBLE_DEVICES=0 — confirm before relying on it.
trainer.args._n_gpu = 1

In [None]:
# Run fine-tuning; per `training_args`, evaluation and checkpointing happen once per epoch.
trainer.train()

# Evaluation

In [14]:
#test set evaluation

In [9]:
# Re-create the tokenizer with the same settings as the tokenizer cell earlier in the notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', truncation_side='left')
# Load the fine-tuned classifier from the saved checkpoint directory and move it to the device.
# NOTE(review): this path differs from the Trainer's output_dir; the checkpoint is presumably
# copied there manually — confirm.
model1 = DistilBertForSequenceClassification.from_pretrained("../trained_models/classifier_t5_encoder_decoder_generated_data/cp/").to(device)
# Switch to inference mode. The trailing semicolon suppresses the (very long) model repr
# in the cell output, replacing the former `print(1)` placeholder.
model1.eval();

In [11]:
sys.path.append("..")
from utils.eval_utils import evaluate_accuracy_gpu, evaluate_f1_gpu, plot_confusion_matrix

In [13]:
# Run the fine-tuned model over the held-out test set.
# NOTE(review): the helper presumably returns (f1, true labels, predicted labels); the last two
# are passed to classification_report as y_true / y_pred — verify in utils/eval_utils.
_, y_t, y_h_t = evaluate_f1_gpu(model1, test_dataset)

In [14]:
# Per-class precision / recall / F1 on the test set, reported to 4 decimal places.
# NOTE(review): target_names assumes label 0 = Negative and label 1 = Positive — confirm against the data.
print('\nTesting Metrics\n')
print(classification_report(y_t, y_h_t, target_names=['Negative', 'Positive'], digits=4))


Testing Metrics

              precision    recall  f1-score   support

    Negative     0.8507    0.9114    0.8800      5000
    Positive     0.9046    0.8400    0.8711      5000

    accuracy                         0.8757     10000
   macro avg     0.8776    0.8757    0.8755     10000
weighted avg     0.8776    0.8757    0.8755     10000



In [15]:
from transformers import pipeline

In [16]:
# Wrap the fine-tuned model and its tokenizer in a text-classification pipeline
# so raw strings can be scored directly.
test_p = pipeline(
    "text-classification",
    model=model1,
    tokenizer=tokenizer,
    device=device,
)

In [17]:
class ModP:
    """Adapter exposing a text-classification pipeline through ``predict_proba``.

    The wrapped callable must return, for a list of texts, one dict per text
    with a ``"label"`` and a ``"score"`` entry (the score of that label).
    """

    def __init__(self, pipe_l):
        """Store the pipeline callable.

        Args:
            pipe_l: Callable mapping a list of strings to a list of
                ``{"label": ..., "score": ...}`` dicts.
        """
        # Attribute name (including its original spelling) is kept for backward compatibility.
        self.peipe_l = pipe_l

    def predict_proba(self, inps):
        """Return class probabilities as an ``(n_samples, 2)`` array.

        Args:
            inps: A single string or any iterable of strings.

        Returns:
            ``numpy.ndarray`` of shape ``(n_samples, 2)`` whose columns are
            ``[P(not POSITIVE), P(POSITIVE)]``. Empty input yields shape ``(0, 2)``.
        """
        # Normalise the input: a bare string is one sample; other iterables become a list.
        texts = [inps] if isinstance(inps, str) else list(inps)
        if not texts:
            # Keep the 2-column contract for empty batches and skip the pipeline call.
            return np.empty((0, 2))

        rows = []
        for each in self.peipe_l(texts):
            score = each["score"]
            if each["label"] == "POSITIVE":
                rows.append([1 - score, score])
            else:
                # Any other label is treated as the negative class; its score goes in column 0.
                rows.append([score, 1 - score])

        return np.array(rows, dtype=float).reshape(-1, 2)
        
        

In [18]:
# Probability-style wrapper around the test pipeline.
ts_mod = ModP(test_p)

In [19]:
# Smoke test: one clearly positive and one clearly negative sentence.
ts_mod.predict_proba(["the movie was very good", "the movie was terrible"])

array([[9.66191292e-04, 9.99033809e-01],
       [9.99883533e-01, 1.16467476e-04]])