# Prompt tuning model implementation for steering the T5 with encoder-decoder prompts to produce positive reviews

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import string


import numpy as np
import statistics as st
import glob
import sys
import io

import zipfile
import tarfile

import logging
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score, f1_score
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")
import math

In [None]:
# Expose a single CUDA device to this process when multiple GPUs are available
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

In [None]:
from transformers import (
    T5TokenizerFast,
    get_scheduler
)
import torch

from transformers.optimization import Adafactor, AdafactorSchedule
sys.path.append("..")

from model_classes.model_t5_encoder_decoder_prompt import T5PromptTuningLM


In [None]:
# Fix the seed to be able to get the same randomness across runs and hence reproducible outcomes
def get_device_and_set_seed(seed):
    """Seed every random number generator in use and pick the compute device.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), and configures cuDNN for deterministic kernel selection.

    :param seed: integer seed applied to every generator
    :return: ``torch.device("cuda:0")`` when CUDA is available, otherwise
        ``torch.device("cpu")``
    """
    # Imported locally so the function does not depend on the notebook's import cell
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can pick different kernels from run to run
    torch.backends.cudnn.benchmark = False

    use_cuda = torch.cuda.is_available()
    return torch.device("cuda:0" if use_cuda else "cpu")
    
# Single seed for the notebook; it is also forwarded to the generator built in a later cell
SEED = 123
# Seeds the RNGs and selects "cuda:0" when CUDA is available, otherwise the CPU
device = get_device_and_set_seed(SEED)

In [None]:
class Config:
    """Plain container of hyper-parameters, exposed as class attributes."""

    # --- Prompt tuning ---
    # Number of soft prompt tokens
    n_prompt_tokens = 20
    # True: the soft prompt is initialised from the vocabulary.
    # Otherwise a `random_range` (e.g. 0.5) can be set to initialise it randomly.
    init_from_vocab = True

    # --- Optimisation ---
    batch_size = 10
    learning_rate = 0.15
    weight_decay = 0.01
    warmup_steps = 500
    num_train_epochs = 20
    # Mirrors the epoch count
    max_train_steps = num_train_epochs


# Shared settings instance read by the other cells
args = Config()

In [None]:
# Tokenizer of the same "google/t5-small-lm-adapt" checkpoint the models are built on
tokenizer = T5TokenizerFast.from_pretrained("google/t5-small-lm-adapt")
# Load the base checkpoint together with the encoder and decoder soft prompts
# stored under the "positive" directory, then move the model to `device`
model_pos = T5PromptTuningLM.from_pretrained(
    "google/t5-small-lm-adapt",
    encoder_soft_prompt_path="../trained_models/t5_encoder_decoder/positive/encoder_soft_prompt_T5_pos.model",
    decoder_soft_prompt_path="../trained_models/t5_encoder_decoder/positive/decoder_soft_prompt_T5_pos.model",
    device=device
).to(device)
# Inference only: switch to evaluation mode
model_pos.eval()
# Prints 1 once the cell has run through
print(1)

In [None]:

# Load the base checkpoint together with the encoder and decoder soft prompts
# stored under the "negative" directory, then move the model to `device`
model_neg = T5PromptTuningLM.from_pretrained(
    "google/t5-small-lm-adapt",
    encoder_soft_prompt_path="../trained_models/t5_encoder_decoder/negative/encoder_soft_prompt_T5_neg.model",
    decoder_soft_prompt_path="../trained_models/t5_encoder_decoder/negative/decoder_soft_prompt_T5_neg.model",
    device=device
).to(device)
# Inference only: switch to evaluation mode
model_neg.eval()
# Prints 1 once the cell has run through
print(1)

In [None]:
from model_classes.de_generation_2 import DExpertsGenerationMod

In [None]:
# Combine the two prompt-tuned models: the positive one is passed as the expert,
# the negative one as the anti-expert; SEED is forwarded to the generator
de_mod = DExpertsGenerationMod(expert_model=model_pos, antiexpert_model=model_neg, device=device, seed=SEED)

In [None]:
# Short prompt for a quick generation check
test = "the movie was"
# Token ids of the prompt as a PyTorch tensor
call = tokenizer(test, return_tensors="pt").input_ids

# Drop the last id (presumably the EOS token appended by the tokenizer — confirm)
# and rebuild a batch of one sequence on `device`
input_ids = torch.tensor([call.tolist()[0][:-1]]).to(device)

In [None]:
# Prompt ids without the last token, as a batch of one sequence on `device`
input_ids = torch.tensor([call.tolist()[0][:-1]]).to(device)
# Decoder start: a (1, 1) integer tensor holding id 0
# (presumably T5's pad / decoder-start token — confirm)
decoder_input_ids = torch.zeros([1,1]).long().to(device)

In [None]:
# Sample a continuation of the prompt with the combined expert/anti-expert generator.
# NOTE(review): the exact meaning of filter_p, k, p, temperature and alpha is defined
# in DExpertsGenerationMod.generate, which is not visible in this file.
op_1 = de_mod.generate(input_ids = input_ids,
                 decoder_input_ids = decoder_input_ids,
                 max_len = 200,
                 sample = True,
                 filter_p = 1,
                 k = 0,
                 p = 0.9,
                 temperature= 1.1,
                 alpha = 1.2,   
                      )

In [None]:
# Decode the first generated sequence back to text, leaving out special tokens
print(tokenizer.decode(op_1[0], skip_special_tokens=True))

In [None]:
# Read the positive test split; only the "reviewText" column is used
df_pos_test = pd.read_csv("../data/2_data_remove_duplicates_5_pos_sampled_test.csv", encoding='utf-8')
test_texts = df_pos_test["reviewText"].tolist() 
#test_texts = shuffle(np.array(test_texts), random_state=SEED)

# The DataFrame is no longer needed once the texts are extracted
del df_pos_test 


In [None]:
class AmazonDatasetTest(torch.utils.data.Dataset):
    """Dataset turning each review into a short prompt plus length bookkeeping.

    Reads the module-level ``args`` (for ``n_prompt_tokens``) and ``device``.
    """

    def __init__(self, texts, tokenizer, inp_perecentage=0.4):
        # `inp_perecentage`: fraction of a review's tokens used as the prompt
        self.inp_perecentage = inp_perecentage
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoded = self.tokenizer.encode(text, truncation=True, padding=True, return_tensors='pt')
        # Ids of the single encoded sequence, without its final id
        token_ids = encoded.tolist()[0][:-1]

        n_tokens = len(token_ids)
        # Prompt length: the configured fraction of the review, clamped to [3, 20]
        prompt_len = min(max(math.floor(n_tokens * self.inp_perecentage), 3), 20)
        # Reviews shorter than 25 tokens are counted as 25 tokens long
        n_tokens = max(n_tokens, 25)

        # The same value serves as both the minimum and the maximum length
        length_budget = (prompt_len + n_tokens) + args.n_prompt_tokens + 80

        return {
            "full_text": text,
            "min_length": length_budget,
            "max_length": length_budget,
            "input_ids": torch.tensor([token_ids[:prompt_len]]).to(device),
            "full_len": n_tokens,
            "input_len": prompt_len,
            "full_ids": token_ids,
        }
    

# Test dataset over the loaded reviews, using the default prompt fraction (0.4)
test_dataset = AmazonDatasetTest(test_texts, tokenizer)

In [None]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from torch import nn

def bleu_score(li_abs_hyp, li_abs_ref):
    """
    Corpus-level BLEU of the hypotheses against one reference each
    :param li_abs_hyp: list of hypothesis texts (strings)
    :param li_abs_ref: list of reference texts (strings), aligned with the hypotheses
    :return: the ``score`` attribute of the sacrebleu result
    """
    # corpus_bleu takes a list of reference sets; a single set is supplied here
    reference_sets = [li_abs_ref]
    return corpus_bleu(li_abs_hyp, reference_sets).score


def rouge_score(li_abs_hyp, li_abs_ref):
    """
    Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over hypothesis/reference pairs
    :param li_abs_hyp: list of hypothesis texts (strings)
    :param li_abs_ref: list of reference texts (strings), aligned with the hypotheses
    :return: dict with keys "rouge1", "rouge2" and "rougeL" holding the averages
    """
    rouge_types = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # Running sum of the F-measure for each ROUGE variant
    totals = dict.fromkeys(rouge_types, 0)
    for hypothesis, reference in zip(li_abs_hyp, li_abs_ref):
        pair_scores = scorer.score(reference, hypothesis)
        for name in rouge_types:
            totals[name] += pair_scores[name].fmeasure

    # The tiny floor keeps the division defined when there are no hypotheses
    denominator = max(len(li_abs_hyp), 1e-7)
    return {name: total / denominator for name, total in totals.items()}

In [None]:
def _resolve_eval_device(net, loss_model, device=None):
    """Return `device` if given, else the device of the first parameter of `net` or `loss_model`."""
    if device is not None:
        return device
    for candidate in (net, loss_model):
        if isinstance(candidate, nn.Module):
            first_param = next(iter(candidate.parameters()), None)
            if first_param is not None:
                return first_param.device
    return None


def evaluate_T5_gpu_2(net, data_iter, loss_model, device=None):
    """Generate a continuation for every item and score the generations.

    For each item, a continuation of the prompt is sampled with ``net.generate``
    (fixed settings: sample=True, filter_p=1, k=0, p=0.9, temperature=1.1,
    alpha=1.2). The decoded text is lower-cased, every occurrence of the
    decoded prompt is removed, and the result is cut to ``full_len`` words.
    The loss of ``loss_model`` on the cleaned generation is exponentiated to a
    perplexity; values of 1e4 or more are reported and left out of the mean.

    NOTE(review): hypotheses are lower-cased while the references
    (``full_text``) are used as stored — confirm this is intended for
    BLEU/ROUGE.

    :param net: generator exposing ``generate(...)``; switched to eval mode when
        it is an ``nn.Module``
    :param data_iter: dataset with ``__len__``, a ``tokenizer`` attribute and
        items that are dicts with the keys "input_ids", "max_length",
        "full_len", "input_len" and "full_text"
    :param loss_model: model used to compute the loss for the perplexity; its
        output must provide ``.get("loss")``
    :param device: device for the tensors built here; when omitted it is taken
        from the parameters of ``net`` or ``loss_model``, and as a last resort
        from the item's "input_ids" tensor
    :return: tuple ``(bleu, rouge, predictions, predictions_plus, references,
        prompts, mean_perplexity)`` where ``predictions`` is cut before and
        ``predictions_plus`` after the prompt removal
    """
    if isinstance(net, nn.Module):
        net.eval()  # Set the model to evaluation mode
    device = _resolve_eval_device(net, loss_model, device)
    loss_model.eval()

    tokenizer = data_iter.tokenizer

    y_tot = []
    y_hat_tot = []
    y_hat_tot_plus = []
    input_texts_ls = []
    perplexities = []

    with torch.no_grad():

        for inputs in tqdm(data_iter, total=len(data_iter)):

            if device is None:
                # Neither model exposed a parameter: follow the input tensor
                device = inputs["input_ids"].device

            generated = net.generate(
                input_ids=inputs["input_ids"],
                decoder_input_ids=torch.zeros([1, 1]).long().to(device),
                max_len=inputs["max_length"],
                sample=True,
                filter_p=1,
                k=0,
                p=0.9,
                temperature=1.1,
                alpha=1.2,
            )

            # Decode once and reuse the word list for both cuts
            generated_words = tokenizer.decode(generated[0], skip_special_tokens=True).split()
            full_len = inputs["full_len"]
            y_hat = " ".join(generated_words[:full_len]).lower()
            y_hat_plus = " ".join(generated_words[:full_len + inputs["input_len"]]).lower()
            input_text = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True).lower()

            # str.replace removes every occurrence of the prompt, not only a leading one
            y_hat = y_hat.replace(input_text, "").strip()
            y_hat_plus = " ".join(y_hat_plus.replace(input_text, "").strip().split()[:full_len])

            y_tot.append(inputs["full_text"])
            input_texts_ls.append(input_text)
            y_hat_tot.append(y_hat)
            y_hat_tot_plus.append(y_hat_plus)

            # Next-token setup on the cleaned generation: inputs and labels are the
            # same ids shifted by one, capped at 511 ids plus a closing id 1
            inps_2 = tokenizer.encode(y_hat_plus)[:-1]

            labels_2 = torch.tensor([inps_2[1:][:511] + [1]]).to(device)
            inputs_2 = torch.tensor([inps_2[:-1][:511] + [1]]).to(device)
            mask_2 = torch.tensor([[1] * inputs_2.shape[1]]).to(device)

            loss_2 = loss_model(
                input_ids=inputs_2,
                attention_mask=mask_2,
                decoder_input_ids=inputs_2,
                decoder_attention_mask=mask_2,
                labels=labels_2,
            ).get("loss").detach()

            try:
                ppl = math.exp(loss_2.item())
            except OverflowError:
                # A huge loss would overflow exp(); treat it as an out-of-range value
                ppl = math.inf
            if ppl < 1e4:   # for sanity
                perplexities.append(ppl)
            else:
                print("missed ppl = ", ppl)

    bleu_value = bleu_score(y_hat_tot_plus, y_tot)
    try:
        rouge_value = rouge_score(y_hat_tot_plus, y_tot)
    except Exception as err:
        # Best effort: keep the evaluation result, but make the failure visible
        print("ROUGE computation failed, reporting rouge1 = 0:", repr(err))
        rouge_value = {"rouge1": 0.00}

    return bleu_value, rouge_value, y_hat_tot, y_hat_tot_plus, y_tot, input_texts_ls, np.nanmean(perplexities)


In [None]:
# Evaluate the combined generator on the test set; model_pos is passed as the
# model whose loss is used for the perplexity
bl2, rg2, predictions2, preds_plus2, full_texts2, input_texts2, ppl2 = evaluate_T5_gpu_2(de_mod, test_dataset, model_pos )

In [None]:
def distinctness(generations):
    """Compute dist-1, dist-2 and dist-3 over a set of generations.

    Each value is the number of distinct uni-/bi-/trigrams across all
    generations divided by the total number of words. Words are obtained
    with ``str.split(' ')``, so consecutive spaces yield empty-string words.

    :param generations: iterable of generated texts (strings)
    :return: tuple ``(dist1, dist2, dist3)``; all 0.0 when there are no words
    """
    unigrams, bigrams, trigrams = set(), set(), set()
    total_words = 0
    for gen in generations:
        words = gen.split(' ')
        total_words += len(words)
        unigrams.update(words)
        # Tuples keep n-grams distinct even when a word itself contains "_"
        bigrams.update(zip(words, words[1:]))
        trigrams.update(zip(words, words[1:], words[2:]))

    if total_words == 0:
        # No generations at all: avoid dividing by zero
        return 0.0, 0.0, 0.0

    dist1 = len(unigrams) / total_words
    dist2 = len(bigrams) / total_words
    dist3 = len(trigrams) / total_words

    return dist1, dist2, dist3

In [None]:
def get_metrics_2(y_hat_tot_plus, y_tot):
    """Compute BLEU and ROUGE for generated texts against their references.

    :param y_hat_tot_plus: list of generated texts (hypotheses)
    :param y_tot: list of reference texts, aligned with the hypotheses
    :return: tuple ``(bleu_value, rouge_value)``; when the ROUGE computation
        fails, ``rouge_value`` is ``{"rouge1": 0.0}``
    """
    bleu_value = bleu_score(y_hat_tot_plus, y_tot)
    try:
        rouge_value = rouge_score(y_hat_tot_plus, y_tot)
    except Exception as err:
        # Best effort: keep the BLEU result, but make the failure visible
        print("ROUGE computation failed, reporting rouge1 = 0:", repr(err))
        rouge_value = {"rouge1": 0.00}

    return bleu_value, rouge_value