# Prompt tuning model training for GPT-2 with a single prompt to produce negative reviews

In [None]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import string


import numpy as np
import statistics as st
import glob
import sys
import path
import io

import zipfile
import tarfile
import logging
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import shuffle

import warnings
warnings.filterwarnings("ignore")
from tqdm.notebook import tqdm
import math
from evaluate import load

In [None]:
#!pip install transformers

In [None]:
# Expose a single CUDA device to this process when multiple GPUs are available
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

In [None]:
from transformers import (
    GPT2TokenizerFast,
    AdamW,
    get_scheduler
)
import torch

from transformers.optimization import Adafactor, AdafactorSchedule

sys.path.append("..")
 

from model_classes.model_gpt_prompt import GPT2PromptTuningLM


In [None]:
# Fix the seed to be able to get the same randomness across runs and hence reproducible outcomes
def get_device_and_set_seed(seed):
    """Seed every random number generator in use and pick the compute device.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN to deterministic mode with auto-tuning disabled.

    :param seed: integer seed applied to all generators
    :return: ``torch.device("cuda:0")`` when CUDA is available, else
        ``torch.device("cpu")``
    """
    # Local import keeps this cell self-contained; `random` is not imported
    # at the top of the notebook.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Silently ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN benchmarking may select different kernels from run to run, which
    # defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

    use_cuda = torch.cuda.is_available()
    return torch.device("cuda:0" if use_cuda else "cpu")


SEED = 123
device = get_device_and_set_seed(SEED)

In [None]:
class Config:
    """Hyper-parameters shared by the cells of this notebook."""

    # --- Optimisation ------------------------------------------------------
    # Same default parameters as run_clm_no_trainer.py in transformers
    # https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py
    num_train_epochs = 20
    learning_rate = 5e-5
    lr_scheduler_type = "linear"
    num_warmup_steps = 500
    # NOTE(review): this mirrors the epoch count rather than a number of
    # optimiser steps -- confirm that this is intended.
    max_train_steps = num_train_epochs

    # --- Prompt tuning -----------------------------------------------------
    # Number of soft-prompt tokens.
    n_prompt_tokens = 20
    # If True, the soft prompt is initialised from the vocabulary.
    # Otherwise, set `random_range` to initialise it randomly.
    init_from_vocab = True
    # random_range = 0.5


args = Config()

# Training

In [None]:
# `truncation` / `padding` are per-call tokenizer options, so they are not
# passed when loading the tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# GPT-2 ships without a padding token. The digit '0' is an ordinary GPT-2
# vocabulary entry, so registering it as the pad token makes real zeros in the
# reviews indistinguishable from padding (and decoding with
# `skip_special_tokens=True` can drop them). Reuse the end-of-text token.
tokenizer.pad_token = tokenizer.eos_token

# Initialize GPT2LM with soft prompt
model = GPT2PromptTuningLM.from_pretrained(
    "gpt2",
    n_tokens=args.n_prompt_tokens,
    initialize_from_vocab=args.init_from_vocab,
    device=device
).to(device)

In [None]:
# Display the soft-prompt embedding weights of the model created above
# (notebook cell output; no side effects).
model.soft_prompt.weight

In [None]:
# Load the sampled negative-review splits (train / validation / test).
df_neg_train, df_neg_val, df_neg_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (
        "../data/2_data_remove_duplicates_25_neg_sampled.csv",
        "../data/2_data_remove_duplicates_5_neg_sampled_val.csv",
        "../data/2_data_remove_duplicates_5_neg_sampled_test.csv",
    )
)

In [None]:

# Train / validation reviews are shuffled reproducibly; both end up as NumPy
# arrays of strings.
train_texts = shuffle(np.array(df_neg_train["reviewText"].tolist()), random_state=SEED)
val_texts = shuffle(np.array(df_neg_val["reviewText"].tolist()), random_state=SEED)

# The test reviews keep their original order and stay a plain Python list.
test_texts = df_neg_test["reviewText"].tolist()

# The data frames are no longer needed once the text columns are extracted.
del df_neg_train, df_neg_val, df_neg_test



In [None]:
class AmazonDataset(torch.utils.data.Dataset):
    """Tokenised review texts prepared for next-token prediction.

    Every item is padded/truncated to ``max_length`` tokens and then turned
    into a pre-shifted (input, label) pair of length ``max_length - 1``:
    ``labels[i]`` is the token that follows ``input_ids[i]``.

    Tensors are returned on the CPU; moving batches to the accelerator is left
    to the training loop. Returning CUDA tensors from a ``Dataset`` is
    generally discouraged because it interferes with DataLoader worker
    processes and memory pinning.

    :param texts: indexable collection of review strings
    :param tokenizer: callable tokenizer returning ``input_ids`` and
        ``attention_mask`` lists
    :param max_length: padded/truncated sequence length (default 512)
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        encodings = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
        )
        item = {key: torch.tensor(val) for key, val in encodings.items()}
        token_ids = item["input_ids"]
        mask = item["attention_mask"]

        # Index of the first padding position (== number of real tokens).
        mask_list = mask.tolist()
        sent_end = mask_list.index(0) if 0 in mask_list else len(mask_list)

        # Labels are the tokens shifted left by one. Padding positions are
        # identified through the attention mask (not through a token id, which
        # could also be a legitimate token) and excluded from the loss.
        labels = token_ids[1:].clone()
        labels[mask[1:] == 0] = -100
        item["labels"] = labels

        # Inputs drop the last real token (it has no successor to predict) and
        # keep the padding tail. For an empty text there is no real token, so
        # one padding token is dropped instead to keep the length consistent.
        drop = max(sent_end - 1, 0)
        item["input_ids"] = torch.cat((token_ids[:drop], token_ids[drop + 1:]))
        item["attention_mask"] = mask[1:]

        return item

    def __len__(self):
        return len(self.texts)

# Wrap the shuffled train / validation texts; tokenisation happens lazily in
# AmazonDataset.__getitem__, one review at a time.
train_dataset = AmazonDataset(train_texts, tokenizer)
val_dataset = AmazonDataset(val_texts, tokenizer)


In [None]:
from torch import nn
from transformers import Trainer
from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    HPSearchBackend,
)


class GPT2Trainer(Trainer):
    """Trainer variant for soft-prompt tuning.

    Differences from the stock ``Trainer``:

    * ``compute_loss`` applies cross-entropy directly to the logits and the
      labels extended by ``model._extend_labels``, without shifting them.
    * ``_save_checkpoint`` stores only the soft prompt, not the full model.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.soft_pr_save_number=1

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the token-level cross-entropy loss for one batch.

        :param model: model exposing ``_extend_labels`` and returning ``logits``
        :param inputs: batch dict; must contain a ``labels`` entry
        :param return_outputs: also return the model outputs when True
        :param num_items_in_batch: accepted for compatibility with newer
            ``Trainer`` versions that pass it as a keyword; unused here
        :raises ValueError: if the batch carries no ``labels``
        :return: ``loss`` or ``(loss, outputs)``
        """
        labels = inputs.get("labels")
        if labels is None:
            # Fail before the forward pass with a readable message instead of
            # an AttributeError on `None.view` afterwards.
            raise ValueError("GPT2Trainer.compute_loss requires 'labels' in the batch")
        # presumably pads the labels for the soft-prompt positions -- the
        # helper lives in GPT2PromptTuningLM; verify there.
        labels = model._extend_labels(labels)

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Positions labelled -100 do not contribute to the loss.
        loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save only the soft prompt into ``checkpoint-<global_step>``.

        Optimizer, scheduler and trainer state are intentionally not written.
        """
        run_dir = self._get_output_dir(trial=trial)
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        output_dir = os.path.join(run_dir, checkpoint_folder)

        # Make sure the target folder exists before the model writes into it.
        os.makedirs(output_dir, exist_ok=True)
        model.save_soft_prompt(path=output_dir, filename="soft_prompt_1.model")

    def _get_output_dir(self, trial):
        """Return the run directory, namespaced per trial during HP search."""
        if self.hp_search_backend is not None and trial is not None:
            if self.hp_search_backend == HPSearchBackend.OPTUNA:
                run_id = trial.number
            elif self.hp_search_backend == HPSearchBackend.RAY:
                from ray import tune

                run_id = tune.get_trial_id()
            elif self.hp_search_backend == HPSearchBackend.SIGOPT:
                run_id = trial.id
            elif self.hp_search_backend == HPSearchBackend.WANDB:
                import wandb

                run_id = wandb.run.id
            run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
            run_dir = os.path.join(self.args.output_dir, run_name)
        else:
            run_dir = self.args.output_dir
        return run_dir

In [None]:
from transformers import Trainer, TrainingArguments


batch_size = 10
# Full batches per epoch; only used to space out the logging below.
step_len = len(train_dataset)//batch_size


# Prompt tuning: only the soft-prompt embedding is handed to the optimizer.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n == "soft_prompt.weight"],
    }
]

# Adafactor with a fixed, externally supplied learning rate (relative_step
# and parameter scaling are switched off). Hyper-parameters come from
# `Config` so that they are defined in one place.
optimizer = Adafactor(
    optimizer_grouped_parameters,
    lr=args.learning_rate,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=1e-5,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)

lr_scheduler = AdafactorSchedule(optimizer, initial_lr=args.learning_rate)


training_args = TrainingArguments(
    output_dir='./results_5',          # output directory
    num_train_epochs=args.num_train_epochs,  # total number of training epochs
    per_device_train_batch_size=batch_size,  # batch size per device during training
    per_device_eval_batch_size=batch_size,   # batch size for evaluation
    # Only used when the Trainer builds its own scheduler; a ready-made
    # scheduler is passed below, so this has no effect on the learning rate.
    warmup_steps=args.num_warmup_steps,
    logging_dir='./logs',            # directory for storing logs
    # Roughly ten log entries per epoch; never 0, which would be rejected for
    # step-based logging on small datasets.
    logging_steps=max(1, step_len//10),
    # NOTE(review): newer transformers releases name this `eval_strategy` --
    # confirm against the installed version.
    evaluation_strategy = 'epoch',
    save_strategy = 'epoch',
    seed=SEED,
)


trainer = GPT2Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    optimizers = (optimizer, lr_scheduler),
    args=training_args,                  # training arguments, defined above
    tokenizer = tokenizer,
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
)


# Force single-GPU behaviour even if several devices are visible.
trainer.args._n_gpu = 1

In [None]:
# Run the training loop. Evaluation and checkpointing both happen once per
# epoch (see the TrainingArguments above); GPT2Trainer._save_checkpoint writes
# only the soft prompt into each checkpoint folder.
trainer.train()

# Inference

In [None]:
#Inference

In [None]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# GPT-2 ships without a padding token. The digit '0' is an ordinary GPT-2
# vocabulary entry, so registering it as the pad token makes real zeros in the
# text indistinguishable from padding (and decoding with
# `skip_special_tokens=True` can drop them). Reuse the end-of-text token.
tokenizer.pad_token = tokenizer.eos_token
# Load the model together with the trained soft prompt
model = GPT2PromptTuningLM.from_pretrained(
    "gpt2",
    soft_prompt_path="../trained_models/gpt2_cp/negative/soft_prompt_1.model",
    device=device
).to(device)
# Evaluation mode: disables dropout for generation and scoring.
model.eval()
print(1)


In [None]:
# Smoke test: sample one continuation of a short prompt.
test = "the movie was"

call = tokenizer(test, return_tensors="pt").to(device)

# NOTE(review): the last prompt token is dropped before generation --
# confirm this is what GPT2PromptTuningLM.generate expects.
basic_output = model.generate(
    input_ids=call.input_ids[:, :-1],
    # min_length == max_length forces exactly 100 tokens beyond the prompt length.
    min_length=call.input_ids.size(-1) + 100,
    max_length=call.input_ids.size(-1) + 100,
    num_beams=1,
    do_sample=True,
    no_repeat_ngram_size=1,
    temperature=1.0,
    top_k=0,
    top_p=1,
    repetition_penalty=1.0,
    early_stopping=True,
    pad_token_id=tokenizer.pad_token_id,
)


In [None]:
# Show the sampled sequence as text; special tokens are omitted.
print(tokenizer.decode(basic_output[0], skip_special_tokens=True ))

In [None]:
class AmazonDatasetTest(torch.utils.data.Dataset):
    """Test-time dataset: each review is cut to a short prompt for generation.

    Every item is a dict with the original text, its encoded ids, the prompt
    ids (placed on the global ``device``), the prompt / reference lengths and
    the generation length to request.

    :param texts: indexable collection of review strings
    :param tokenizer: tokenizer providing ``encode``
    :param inp_perecentage: fraction of the review's tokens used as prompt
    """

    def __init__(self, texts, tokenizer, inp_perecentage=0.4):
        self.texts, self.tokenizer = texts, tokenizer
        self.inp_perecentage = inp_perecentage

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoded = self.tokenizer.encode(text, truncation=True, padding=True, return_tensors='pt')
        token_ids = encoded.tolist()[0]
        n_tokens = len(token_ids)

        # Prompt length: the requested share of the tokens, clamped to [3, 20].
        prompt_len = min(max(math.floor(n_tokens * self.inp_perecentage), 3), 20)
        # Reference length: the token count, but at least 25.
        ref_len = max(n_tokens, 25)
        # Same value for min and max length pins the generation length.
        gen_len = (prompt_len + ref_len) + 80

        return {
            "full_text": text,
            "min_length": gen_len,
            "max_length": gen_len,
            "input_ids": torch.tensor([token_ids[:prompt_len]]).to(device),
            "full_len": ref_len,
            "input_len": prompt_len,
            "full_ids": encoded,
        }


test_dataset = AmazonDatasetTest(test_texts, tokenizer)

In [None]:
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from torch import nn

def bleu_score(li_abs_hyp, li_abs_ref):
    """
    Corpus-level BLEU of the hypotheses against a single reference set.

    :param li_abs_hyp: list of hypothesis texts (strings)
    :param li_abs_ref: list of reference texts (strings), aligned with the hypotheses
    :return: the ``score`` attribute of the sacrebleu result
    """
    # sacrebleu takes a list of reference streams; there is exactly one here.
    references = [li_abs_ref]
    return corpus_bleu(li_abs_hyp, references).score


def rouge_score(li_abs_hyp, li_abs_ref):
    """
    Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measure over hypothesis/reference pairs.

    :param li_abs_hyp: list of hypothesis texts (strings)
    :param li_abs_ref: list of reference texts (strings), aligned with the hypotheses
    :return: dict with keys "rouge1", "rouge2", "rougeL"
    """
    metric_names = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # Accumulate the per-pair F-measures.
    totals = dict.fromkeys(metric_names, 0)
    for hyp, ref in zip(li_abs_hyp, li_abs_ref):
        pair_scores = scorer.score(ref, hyp)
        for name in metric_names:
            totals[name] += pair_scores[name].fmeasure

    # Divide by the number of hypotheses; the tiny floor avoids a division by
    # zero for an empty list.
    denominator = max(len(li_abs_hyp), 1e-7)
    return {name: totals[name] / denominator for name in metric_names}

In [None]:
def evaluate_GPT2_gpu(net, data_iter, device=None):
    """Generate a continuation for every test item and score the results.

    For each item the prompt is continued with ``net.generate``, the prompt
    text is removed from the decoded output, and the perplexity of the
    remaining text is computed with ``net`` itself. BLEU and ROUGE are
    computed over all generations against the full reference texts.

    :param net: model providing ``generate`` and a forward pass returning ``loss``
    :param data_iter: indexable dataset with a ``tokenizer`` attribute whose
        items are dicts with keys ``input_ids``, ``min_length``,
        ``max_length``, ``full_len``, ``input_len`` and ``full_text``
    :param device: device for the perplexity tensors; defaults to the device
        of the model parameters when ``net`` is an ``nn.Module``
    :return: tuple ``(bleu, rouge, y_hat_tot, y_hat_tot_plus, y_tot,
        input_texts_ls, mean_perplexity)``
    """
    if isinstance(net, nn.Module):
        net.eval()  # Set the model to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device

    # Use the dataset's tokenizer throughout instead of a notebook global.
    tok = data_iter.tokenizer

    y_tot = []
    y_hat_tot = []
    y_hat_tot_plus = []
    input_texts_ls = []
    perplexities = []

    with torch.no_grad():

        for inputs in tqdm(data_iter, total=len(data_iter)):
            full_len = inputs["full_len"]
            input_len = inputs["input_len"]

            beam_outputs = net.generate(
                input_ids=inputs["input_ids"],
                min_length=inputs["min_length"],
                max_length=inputs["max_length"],
                num_beams=5,
                do_sample=True,
                no_repeat_ngram_size=1,
                temperature=1.0,
                top_k=0,
                top_p=1,
                repetition_penalty=1.0,
                early_stopping=True,
                pad_token_id=tok.pad_token_id,
            )

            # Decode once and reuse the whitespace-split words for both views.
            decoded_words = tok.decode(beam_outputs[0], skip_special_tokens=True).split()
            y_hat = " ".join(decoded_words[:full_len]).lower()
            y_hat_plus = " ".join(decoded_words[:full_len + input_len]).lower()
            input_text = tok.decode(inputs["input_ids"][0], skip_special_tokens=True).lower()

            # Strip the prompt text so that only the continuation is scored.
            y_hat = y_hat.replace(input_text, "").strip()
            y_hat_plus = " ".join(y_hat_plus.replace(input_text, "").strip().split()[:full_len])

            y_tot += [inputs["full_text"]]
            input_texts_ls += [input_text]
            y_hat_tot += [y_hat]
            y_hat_tot_plus += [y_hat_plus]

            # Perplexity of the continuation, capped at 512 tokens.
            inps_2 = tok.encode(y_hat_plus)[:512]
            if not inps_2:
                # Nothing left after removing the prompt; an empty tensor
                # cannot be scored.
                print("missed ppl = empty generation")
                continue

            inputs_2 = torch.tensor([inps_2]).to(device)
            labels_2 = inputs_2.clone()
            mask_2 = torch.ones_like(inputs_2)

            loss_2 = net(input_ids=inputs_2, attention_mask=mask_2, labels=labels_2).get("loss").detach()

            try:
                ppl = math.exp(loss_2.item())
            except OverflowError:
                # Loss too large for a float exponent; treated as an outlier.
                ppl = float("inf")

            if ppl < 1e4:   # for sanity
                perplexities.append(ppl)
            else:
                print("missed ppl = ", ppl)

    bleu_value = bleu_score(y_hat_tot_plus, y_tot)
    try:
        rouge_value = rouge_score(y_hat_tot_plus, y_tot)
    except Exception as err:
        # Best-effort metric: report the failure and fall back to zeros.
        print("ROUGE computation failed:", err)
        rouge_value = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    mean_ppl = np.nanmean(perplexities) if perplexities else float("nan")

    return bleu_value, rouge_value, y_hat_tot, y_hat_tot_plus, y_tot, input_texts_ls, mean_ppl


In [None]:
# Evaluate on the test set: BLEU, ROUGE, the two prediction lists, the
# reference texts, the prompt texts and the mean perplexity (in this order).
bl2, rg2, predictions2, preds_plus2, full_texts2, input_texts2, ppl2 = evaluate_GPT2_gpu(model, test_dataset )

In [None]:
def distinctness(generations):
    """Return the distinct-1, distinct-2 and distinct-3 ratios.

    Each ratio is the number of unique n-grams over all generations divided
    by the total number of words, where words are obtained by splitting every
    generation on single spaces.

    :param generations: iterable of generated strings
    :return: tuple ``(dist1, dist2, dist3)``; all zeros when there are no words
    """
    unigrams, bigrams, trigrams = set(), set(), set()
    total_words = 0
    for gen in generations:
        words = gen.split(' ')
        total_words += len(words)
        unigrams.update(words)
        # n-grams are stored as tuples: joining with '_' would merge distinct
        # n-grams whenever a word itself contains an underscore.
        bigrams.update(zip(words, words[1:]))
        trigrams.update(zip(words, words[1:], words[2:]))

    if total_words == 0:
        # No generations at all: avoid dividing by zero.
        return 0.0, 0.0, 0.0

    dist1 = len(unigrams) / total_words
    dist2 = len(bigrams) / total_words
    dist3 = len(trigrams) / total_words

    return dist1, dist2, dist3

In [None]:
def get_metrics_2(y_hat_tot_plus, y_tot):
    """Compute BLEU and ROUGE for already generated texts.

    :param y_hat_tot_plus: list of generated texts (strings)
    :param y_tot: list of reference texts (strings), aligned with the generations
    :return: tuple ``(bleu_value, rouge_value)``; ``rouge_value`` is a dict
        with keys "rouge1", "rouge2", "rougeL" (all 0.0 if ROUGE fails)
    """
    bleu_value = bleu_score(y_hat_tot_plus, y_tot)
    try:
        rouge_value = rouge_score(y_hat_tot_plus, y_tot)
    except Exception as err:
        # Best-effort metric: report the failure and fall back to zeros.
        print("ROUGE computation failed:", err)
        rouge_value = {"rouge1": 0.0, "rouge2": 0.0, "rougeL": 0.0}

    return bleu_value, rouge_value
        
    
    
    