# Seq2Seq LLM Reprompt




In [1]:
from datasets import load_dataset
import pandas as pd 

# df.columns
# Load the local CSV file and expose it as a single "train" split.
ds = load_dataset(
    "csv",
    data_files={"train": "gemma1000_w7b.csv"},
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# df=df[['original_text','gemma_7b_rewritten_text_temp0','rewrite_prompt']]
# df['diff_prompt']='Tell me what prompt converted the following text: \n"""' + df['original_text'] + '"""\n to this text \n"""' + df['gemma_7b_rewritten_text_temp0'] + '"""\n'
#original text prefix

# Prefixes placed in front of the text segments when building the model input.
orig_prefix, rewrite_prefix = "Original Text:", "\nRewritten Text:"

import json
import re


def get_prompt(orig_text, transformed_text):
    """Build the model input string for one example.

    Only the rewritten text is included, preceded by ``rewrite_prefix``.
    ``orig_text`` is accepted but currently ignored; the variant that also
    includes the original text is kept below as a comment.
    """
    # message = f"{orig_prefix} {orig_text} {rewrite_prefix} {transformed_text}"
    return "{} {}".format(rewrite_prefix, transformed_text)


# Iterate over all the rows, format the dataset, and store it in a JSONL file
def process_jsonl_file(output_file_path, dataset=None):
    """Write the training examples to a JSON Lines file.

    Each row becomes one JSON object per line with two keys: ``"text"`` (the
    model input built by ``get_prompt``) and ``"output"`` (the row's
    ``rewrite_prompt`` value).

    Args:
        output_file_path: Destination path; an existing file is overwritten.
        dataset: Iterable of row mappings providing the keys
            ``original_text``, ``gemma_7b_rewritten_text_temp0`` and
            ``rewrite_prompt``. Defaults to the notebook-level ``ds["train"]``
            so existing one-argument calls behave as before.
    """
    rows = ds["train"] if dataset is None else dataset
    # Explicit encoding so the file does not depend on the platform default.
    with open(output_file_path, "w", encoding="utf-8") as output_jsonl_file:
        for item in rows:
            json_object = {
                "text": get_prompt(item["original_text"], item['gemma_7b_rewritten_text_temp0']),
                "output": item['rewrite_prompt']
            }
            output_jsonl_file.write(json.dumps(json_object) + "\n")

# Provide the path where you want to save the formatted dataset
process_jsonl_file("./training_dataset.jsonl")


In [3]:
from datasets import Dataset
train_dataset = load_dataset('json', data_files='./training_dataset.jsonl', split="train")

# Hold out 10% of the examples for evaluation. The seed is fixed so the split
# (and therefore the metrics and the sample predictions shown later) is the
# same on every Restart & Run All.
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)


Generating train split: 1000 examples [00:00, 139828.78 examples/s]


In [4]:
# Identifier of the pretrained checkpoint passed to the tokenizer and model loaders.
# Disabled alternative: model_checkpoint = "google/flan-t5-base"
model_checkpoint = 'facebook/bart-large'

In [5]:
from datasets import load_dataset
from evaluate import load

metric = load("rouge")

2024-03-22 01:00:12.206040: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 01:00:12.206083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 01:00:12.207127: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-22 01:00:12.212079: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from transformers import AutoTokenizer
    
# A tokenizer holds no weights, so there is nothing to place on a device;
# `device_map` is a model-loading option and is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
# Token limits applied when truncating the model inputs and the target prompts.
max_input_length, max_target_length = 1024, 16

def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    The ``"text"`` column is tokenized as the model input (truncated to
    ``max_input_length``) and the ``"output"`` column as the target
    (truncated to ``max_target_length``). The target token ids are attached
    to the returned encoding under ``"labels"``.
    """
    batch = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are tokenized through `text_target`.
    target_encoding = tokenizer(
        text_target=examples["output"],
        max_length=max_target_length,
        truncation=True,
    )

    batch["labels"] = target_encoding["input_ids"]
    return batch

In [8]:
tokenized_dataset = train_dataset.map(preprocess_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 900/900 [00:00<00:00, 8196.09 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 7017.99 examples/s]


In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 900
    })
    test: Dataset({
        features: ['text', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 100
    })
})

In [10]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map="auto")

In [11]:
# Same batch size for training and evaluation.
batch_size = 4
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-reprompt",
    evaluation_strategy="steps",
    learning_rate=2e-6,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    # Run generate() during evaluation so compute_metrics receives generated token ids.
    predict_with_generate=True,
    fp16=True,
    #fp16_full_eval=True,
    load_best_model_at_end=True,
)

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        Dict mapping each ROUGE key (scaled to 0-100) and ``"gen_len"`` (mean
        number of non-pad tokens per prediction) to a value rounded to four
        decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids if extra outputs were returned alongside them.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore sentinel, not a real token id, so it cannot be
    # decoded. Replace it in BOTH arrays before decoding; the original only
    # did this for the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # Note that other metrics may not have a `use_aggregator` parameter
    # and thus will return a list, computing a metric for each sentence.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, use_aggregator=True)
    # Scale the scores to percentages
    result = {key: value * 100 for key, value in result.items()}

    # Add mean generated length; sentinel entries were mapped to the pad id
    # above, so they are no longer counted as generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [14]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# NOTE: the former `dataloader_config = DataLoaderConfiguration(...)` statement
# was removed. `DataLoaderConfiguration` is never imported in this notebook, so
# the line raised NameError on a fresh kernel, and its result was never used.


In [15]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,1.2161,0.833756,73.3021,62.7576,73.0357,72.9898,13.95
1000,0.685,0.71949,77.9994,69.5621,77.7587,77.7125,13.08
1500,0.5574,0.693895,78.0857,69.12,77.9098,77.9751,13.29
2000,0.4915,0.703615,78.1529,70.2252,78.1068,78.1658,13.26
2500,0.4161,0.694739,79.638,71.9642,79.5223,79.5111,12.96
3000,0.369,0.696823,78.5204,70.2416,78.541,78.5135,13.63
3500,0.3456,0.6894,80.4455,72.8792,80.5017,80.4785,13.13
4000,0.3353,0.701242,79.8254,72.0318,79.7861,79.7431,13.09
4500,0.3214,0.699299,80.0064,72.1418,79.9448,79.899,13.1


Checkpoint destination directory bart-large-finetuned-reprompt/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Checkpoint destination directory bart-large-finetuned-reprompt/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Checkpoint destination directory bart-large-finetuned-reprompt/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'e

TrainOutput(global_step=4500, training_loss=0.5263974372016059, metrics={'train_runtime': 790.225, 'train_samples_per_second': 22.778, 'train_steps_per_second': 5.695, 'total_flos': 1.4731418378502144e+16, 'train_loss': 0.5263974372016059, 'epoch': 20.0})

In [16]:
trainer.save_model('Seq2Seq_bart-large')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


In [18]:
#tokenizer = AutoTokenizer.from_pretrained('./Seq2Seq_tst', device_map="auto")
model = AutoModelForSeq2SeqLM.from_pretrained('./Seq2Seq_bart-large', device_map="auto")

In [19]:
def pred(x):
    """Generate the predicted rewrite prompt for a single input string.

    Args:
        x: Model input text, formatted the same way as the training "text" field.

    Returns:
        The decoded generation for ``x`` with special tokens removed.
    """
    # Truncate exactly as in preprocessing: the model was only trained on
    # inputs capped at max_input_length, and longer inputs were previously
    # passed through uncapped.
    encoded = tokenizer(
        x,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True,
    )
    # Put the input tensors on the same device as the model.
    encoded = encoded.to(model.device)
    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_target_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [20]:
print(pred(train_dataset["train"][0]["text"]))
print(train_dataset["train"][0]["output"])
print(train_dataset["train"][0]["text"])

Translate this into a passage from ancient scriptures.
Translate this into a text from ancient scriptures.

Rewritten Text: Sure, here is the text translated into ancient scriptures:

"In the days of old, when the internet was a new and wondrous thing, there lived a group of major rights holders who claimed that the search engines were making it "difficult" for people to find legal music and films online.

One day, a coalition of entertainment industry groups approached the government and presented a confidential document that revealed the truth. The document claimed that Google and Microsoft's Bing were "overwhelmingly" directing music fans to illegal copies of copyrighted tracks.

The major rights holders argued that the search engines were making it "much more difficult" for people to find legal music and films. They claimed that this was a violation of their rights and that the government had a responsibility to protect them.




In [25]:
print(pred(train_dataset["train"][12]["text"]))
print(train_dataset["train"][12]["output"])
print(train_dataset["train"][12]["text"])

Rephrase this as a sitcom scene.
Rephrase this as a sitcom scene.

Rewritten Text: Sure, here is the rephrased text as a sitcom scene:

[FADE IN]

**INT. COFFEE SHOP - DAY**

A young entrepreneur, MARCIA (20s), sits in a coffee shop, staring at a laptop. She's frowning and frustrated. A waiter, BRENDA (20s), approaches her table.

**BRENDA:** Can I get you anything else, Miss?

**MARCIA:** (snapping) You're not going to believe this, Brenda. I've been trying to license an invention from the NIH, and it's a real mess.

**BRENDA:** Oh no, I'm sorry to hear that, Marcia. What's up?

**MARCIA:** The invention is a system that can predict your psychological status based on your smartphone usage. It's a big deal, but the government is making it hard to get it off the ground.

**BRENDA:** Really? That sounds like a very cool idea.

**MARCIA:** I know, right? But the government is only interested in licensing it to big companies, not small ones. It's a real slap in the face.

**BRENDA:** (symp

In [22]:
pred("""\nRewritten Text: Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be""")


'Rewrite this as a shanty to be sung.'