In [1]:
!pip install datasets==2.16.0 torch accelerate transformers peft trl bitsandbytes



In [2]:
!pip install sentence_transformers==2.6.0



In [1]:
from datasets import load_dataset, Dataset
import pandas as pd 
# Read both CSV files first ...
test_df = pd.read_csv('llm-prompt-recovery/test.csv')
train_df = pd.read_csv('gemma-rewrite-nbroad/nbroad-v2.csv')

# ... then wrap each frame as a Dataset tagged with its split name.
test_dataset = Dataset.from_pandas(test_df, split="test")
train_dataset = Dataset.from_pandas(train_df, split="train")

# Keep 10% of the labelled rows aside (fixed seed so the split is repeatable);
# later cells read ds["train"] and ds["test"].
ds = train_dataset.train_test_split(test_size=0.1, seed=37)

In [2]:
import torch
# Switch off the memory-efficient and flash scaled-dot-product-attention backends.
# NOTE(review): the reason is not recorded in this notebook; presumably a
# workaround for the GPU/runtime in use - confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder used by get_scs to embed the texts that are compared.
st_model = SentenceTransformer("sentence-transformers/sentence-t5-base")

# Two aligned example pairs used for the quick get_scs sanity check below.
base_text=["Improve this text:","who are you"]
compare_text=["Improve the text:", "who are thy"]
def get_scs(base_text, compare_text, exponent=3):
    """Mean sharpened cosine similarity between aligned sentence pairs.

    Both inputs are embedded with ``st_model``; the i-th entry of ``base_text``
    is compared with the i-th entry of ``compare_text``. Each cosine similarity
    is raised to ``exponent`` and the per-pair values are averaged.

    Args:
        base_text: Sequence of reference sentences.
        compare_text: Sequence of candidate sentences, aligned with ``base_text``.
        exponent: Power applied to every cosine similarity (default 3, the value
            that was previously hard-coded).

    Returns:
        float: average of the sharpened similarities over all pairs.

    Raises:
        ValueError: if the inputs differ in length (pairs would be misaligned
            and silently truncated) or are empty (the mean is undefined).
    """
    if len(base_text) != len(compare_text):
        raise ValueError(
            f"base_text and compare_text must have the same length, "
            f"got {len(base_text)} and {len(compare_text)}"
        )
    if len(base_text) == 0:
        raise ValueError("get_scs needs at least one pair of texts")

    base_embedding = st_model.encode(base_text)
    compare_embedding = st_model.encode(compare_text)

    # cosine_similarity returns a 1x1 matrix for a single pair; .item() unwraps it.
    cos_sim = [
        (cosine_similarity([b], [c]) ** exponent).item()
        for b, c in zip(base_embedding, compare_embedding)
    ]

    return sum(cos_sim) / float(len(cos_sim))

# Sanity check of the metric on the two demo pairs defined above.
print(get_scs(base_text, compare_text))

0.7887373566627502


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Checkpoint that is quantized, fine-tuned and later used for generation.
model_name="mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit NF4 quantization with float16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(  
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= False,
)

# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint repository - confirm it is actually needed for this model.
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             quantization_config=bnb_config,
                                             torch_dtype=torch.float16,
                                             low_cpu_mem_usage=True,
                                             device_map="auto",
                                             trust_remote_code=True
                                            )
# NOTE(review): device_map is passed to the tokenizer as well; it presumably has
# no effect there - confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          trust_remote_code=True,
                                          device_map="auto",
                                          padding_side="left"
                                         )
# Left padding is set twice (keyword above and attribute here).
tokenizer.padding_side = 'left'
# Padding reuses the EOS token, so pad and EOS share one token id.
tokenizer.pad_token = tokenizer.eos_token
# The prompt template built by create_text_row writes "<s>" / "</s>" literally,
# so the tokenizer is told not to add BOS/EOS itself.
tokenizer.add_eos_token = False
tokenizer.add_bos_token = False

model.config.use_cache = False # silence the warnings
model.config.pretraining_tp = 1
# Gradient checkpointing is switched on before the model is handed to PEFT/the trainer.
model.gradient_checkpointing_enable()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
import json

# Label placed in front of the original text in the first [INST] turn (see get_prompt).
orig_prefix = "Original Text: "

# Scripted assistant reply that closes the first turn; it is hard-coded, not generated.
llm_response_for_rewrite = "Provide the modified text and I'll tell you a something general that changed about it.  I'll avoid any specifics though.  My tone will be neutral."

# Label placed in front of the rewritten text in the second [INST] turn.
rewrite_prefix = "\nRewritten Text: "

# Start of the assistant's second reply. At inference time the reply is left open
# after this string, and whatever the model generates after it is taken as the
# predicted prompt (see extract_text_after_response_start).
# Providing this as the start of the response helps keep things relevant.
response_start = "The request was: "

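# Well-scoring baseline prompt. It was meant to have the detected prompt inserted at
# base_line_swap_text; the visible cells only use it unmodified, as the fallback
# prediction when the model returns nothing.
# Thanks to: https://www.kaggle.com/code/rdxsun/lb-0-61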
base_line = 'Refine the following passage by emulating the writing style of [insert desired style here], with a focus on enhancing its clarity, elegance, and overall impact. Preserve the essence and original meaning of the text, while meticulously adjusting its tone, vocabulary, and stylistic elements to resonate with the chosen style.Please improve the following text using the writing style of, maintaining the original meaning but altering the tone, diction, and stylistic elements to match the new style.Enhance the clarity, elegance, and impact of the following text by adopting the writing style of , ensuring the core message remains intact while transforming the tone, word choice, and stylistic features to align with the specified style.' 
base_line_swap_text = "[insert desired style here]"

# This function outputs the right format for each row in the dataset.
def create_text_row(instruction, output, complete=True) -> str:
    """Render one instruction/response turn in the ``<s>[INST] ... [/INST]`` layout.

    Args:
        instruction: Text placed between ``[INST]`` and ``[/INST]``.
        output: Response text placed after ``[/INST]``.
        complete: When truthy the turn is closed with ``</s>``; otherwise it is
            left open (ending in a single space) so more text can follow.

    Returns:
        The formatted turn as a string.
    """
    closing = "</s>" if complete else ""
    return f"<s>[INST] {instruction} [/INST] \n {output} {closing}"


def get_prompt(orig_text, transformed_text, rewritten_prompt = None):
    """Build the two-turn chat prompt used for both training and inference.

    The first turn shows the original text and is answered by the scripted
    ``llm_response_for_rewrite``. The second turn shows the rewritten text.

    Args:
        orig_text: Text before the rewrite.
        transformed_text: Text after the rewrite.
        rewritten_prompt: The known rewrite prompt. When given, the second turn
            is completed with ``response_start`` followed by this prompt
            (training example). When ``None``, the second turn is left open
            right after ``response_start`` so the model can continue it.

    Returns:
        The concatenation of both formatted turns.
    """
    first_turn = create_text_row(f"{orig_prefix} {orig_text}", llm_response_for_rewrite)
    second_instruction = f"{rewrite_prefix} {transformed_text}"

    if rewritten_prompt is not None:
        second_turn = create_text_row(second_instruction, f"{response_start} {rewritten_prompt}")
    else:
        second_turn = create_text_row(second_instruction, response_start, False)

    return first_turn + second_turn


# Iterate over all the rows, format the dataset and store it in a JSONL file.
def process_jsonl_file(output_file_path, rows=None):
    """Write formatted training prompts to a JSON Lines file.

    Each row becomes one line of the form ``{"text": <prompt>}``, where the
    prompt is built by ``get_prompt`` from the row's 'original_text',
    'rewritten_text' and 'rewrite_prompt' fields.

    Args:
        output_file_path: Destination path; an existing file is overwritten.
        rows: Iterable of row mappings to export. Defaults to ``ds["train"]``,
            which is what this function always used before the parameter existed.
    """
    if rows is None:
        rows = ds["train"]

    with open(output_file_path, "w") as output_jsonl_file:
        for item in rows:
            prompt = get_prompt(item["original_text"], item["rewritten_text"], item["rewrite_prompt"])
            output_jsonl_file.write(json.dumps({"text": prompt}) + "\n")

# Provide the path where you want to save the formatted dataset.
# Only ds["train"] is exported; ds["test"] is read separately by the scoring cell.
process_jsonl_file("./training_dataset.jsonl")


In [5]:
# Reload the formatted prompts from the JSONL file and hold out 10% of them
# (fixed seed) as the evaluation split used while fine-tuning.
# NOTE: this rebinds train_dataset - earlier it was the Dataset built from the
# CSV; from here on it is the result of train_test_split, which the training
# cell indexes as ['train'] / ['test'].
train_dataset = load_dataset('json', data_files='./training_dataset.jsonl' , split='train')
train_dataset=train_dataset.train_test_split(test_size=0.1, seed=37)

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
from transformers import HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os
from datasets import load_dataset
from trl import SFTTrainer

# Prepare the quantized model for k-bit training before attaching adapters.
kbit_model = prepare_model_for_kbit_training(model)
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias terms, applied
# to the attention and MLP projections listed below and to lm_head.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head",]
)
# Wrap the prepared model with the adapters; peft_model is what the trainer receives.
peft_model = get_peft_model(kbit_model, peft_config)

In [None]:
# Fix the torch RNG seed before training.
torch.manual_seed(0)

# One epoch, batch size 4 per device, fp16, checkpoints every 80 steps (at most
# two kept), best checkpoint restored when training ends.
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_total_limit = 2, 
    save_strategy = "steps", 
    load_best_model_at_end=True,
    # No eval_steps is given. NOTE(review): presumably evaluation then follows
    # logging_steps (80), matching save_steps - confirm.
    evaluation_strategy="steps",
    save_steps=80,
    logging_steps=80,
    learning_rate=2e-5,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    # NOTE(review): lr_scheduler_type below is "constant"; warmup_ratio likely
    # has no effect with that scheduler - confirm, or switch scheduler if
    # warmup is wanted.
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="none",
    push_to_hub=False
)

# NOTE(review): peft_model is already wrapped by get_peft_model, and the same
# peft_config is passed again here - presumably redundant; confirm how the
# installed trl version treats this combination.
# NOTE(review): max_seq_length=None leaves the sequence limit to the trainer's
# default - confirm it is long enough for the two-turn prompts.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test'],
    peft_config=peft_config,
    max_seq_length= None,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False
)

# Directory name under which the trained adapter is saved.
new_model='Mistral-7b-instruct-v0.1-qlora'
#new_model='Mistral-7b-v0.1-qlora'
trainer.train()
trainer.model.save_pretrained(new_model)



Map:   0%|          | 0/1944 [00:00<?, ? examples/s]

Map:   0%|          | 0/216 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


In [None]:
import gc
# Release Python garbage and cached CUDA memory before merging.
gc.collect()
torch.cuda.empty_cache()

# Alternative kept for reference: reload the base checkpoint in float16 and
# attach the saved adapter to that fresh copy.
# base_model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     low_cpu_mem_usage=True,
#     return_dict=True,
#     torch_dtype=torch.float16, #bfloat and float?
#     device_map="auto",
# )
# merged_model= PeftModel.from_pretrained(base_model, new_model)
# NOTE(review): `model` is the 4-bit model that was already passed through
# prepare_model_for_kbit_training / get_peft_model above, so the saved adapter
# is loaded onto a model that presumably still carries the training adapter,
# and the merge happens into quantized weights - confirm this yields the
# intended merged model.
merged_model= PeftModel.from_pretrained(model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model together with the tokenizer in the same directory.
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")


In [None]:
from random import randrange
import re

def remove_numbered_list(text):
    """Flatten text onto one line, dropping leading list numbers such as ``1. ``.

    Every line is examined separately: when the part before the first ``'. '``
    is made up of digits only (ignoring surrounding whitespace), that numbering
    is removed. All lines are stripped and joined with two spaces.

    Args:
        text: Possibly multi-line string.

    Returns:
        The cleaned lines joined by ``'  '``.
    """
    cleaned = []
    for raw_line in text.split('\n'):
        marker, sep, remainder = raw_line.partition('. ')
        # sep is empty when the line contains no '. ' at all.
        is_numbered = bool(sep) and marker.strip().isdigit()
        kept = remainder if is_numbered else raw_line
        cleaned.append(kept.strip())

    return '  '.join(cleaned)


#trims LLM output to just the response
def trim_to_response(text):
    """Return the part of decoded model output that follows the last ``[/INST]``.

    ``</s>`` markers and all single/double quote characters are removed first
    (in that order). If no ``[/INST]`` is present, the cleaned text is returned
    whole.
    """
    marker = "[/INST]"
    # '</s>' is removed before the quotes, just in case the model puts things in quotes.
    for unwanted in ('</s>', '"', "'"):
        text = text.replace(unwanted, '')

    _, found, tail = text.rpartition(marker)
    return tail if found else text

#looks for response_start / returns only text that occurs after
def extract_text_after_response_start(full_text):
    """Return the text following the last occurrence of ``response_start``.

    The extracted tail is stripped of surrounding whitespace. When
    ``response_start`` does not occur, ``full_text`` is returned unchanged
    (not stripped).
    """
    _, found, tail = full_text.rpartition(response_start)
    if not found:
        return full_text
    return tail.strip()

def sample_pred(messages):
    """Generate cleaned prompt predictions for one or more formatted chat prompts.

    Args:
        messages: A prompt string, or a list of prompt strings, as produced by
            ``get_prompt(..., rewritten_prompt=None)``.

    Returns:
        list[str]: one prediction per input prompt. ``base_line`` is used
        whenever nothing is left after clean-up.
    """
    with torch.inference_mode():
        encoded = tokenizer(messages, return_tensors="pt", padding=True)

        generated_ids = merged_model.generate(
            encoded.input_ids.cuda(),
            # Pad and EOS share one token id here, so the padding mask cannot be
            # inferred from the ids; pass it explicitly so that padded batches
            # are attended correctly.
            attention_mask=encoded.attention_mask.cuda(),
            pad_token_id=tokenizer.eos_token_id,
            #renormalize_logits = True,
            max_new_tokens=128, do_sample=True, temperature=0.9, top_k=10, top_p=0.1,
            #max_new_tokens=128, num_beams=5, do_sample=True, early_stopping=True
        )
        decoded = tokenizer.batch_decode(generated_ids)

    predictions = []
    for full_output in decoded:
        just_response = trim_to_response(full_output)
        final_text = extract_text_after_response_start(just_response)

        # Mistral has been replying with numbered lists - clean them up first,
        # so the emptiness check below sees the text that would be returned.
        final_text = remove_numbered_list(final_text).strip()

        # Default to the baseline when the model produced nothing usable
        # (empty or whitespace-only output).
        if not final_text:
            final_text = base_line

        predictions.append(final_text)

    return predictions

# Spot-check three random rows: print the model's prediction followed by the
# true rewrite prompt. The rows come from ds["train"], whose prompts were
# exported for fine-tuning, so this is not a held-out check.
for i in range(3):
    sample = ds["train"][randrange(len(ds['train']))]
    print(sample_pred(get_prompt(sample['original_text'], sample['rewritten_text'], None)))
    print(sample['rewrite_prompt'])

In [None]:
def pred_for_ds(x, idx):
    """Predict a rewrite prompt for every row of a batched ``Dataset.map`` call.

    Args:
        x: Batch mapping holding parallel 'original_text' and 'rewritten_text'
            lists.
        idx: Row indices supplied by ``with_indices=True``. Accepted so the
            signature matches the map callback, but not used.

    Returns:
        list[str]: predictions in the same order as the rows of the batch.
    """
    preds = []
    for ot, rt in zip(x['original_text'], x['rewritten_text']):
        # extend() appends in place instead of rebuilding the list on every row.
        preds.extend(sample_pred(get_prompt(ot, rt, None)))
    return preds

In [None]:
# Score the first five held-out rows and report the mean similarity between the
# predicted and the true rewrite prompt. Scores are collected across ALL rows;
# previously the accumulator was reset on every row, so the final figure only
# reflected the last one.
scores = []
for i in range(0,5):
    item=ds["test"][i]

    preds=sample_pred(get_prompt(item["original_text"],item['rewritten_text'], None))

    for pred in preds:
        print('Prediction:',pred)
        print('Actual:',item['rewrite_prompt'])
        scores.append(get_scs([item["rewrite_prompt"]],[pred]))
        # For reference: similarity between the prediction and the rewritten text.
        print(get_scs([item["rewritten_text"]],[pred]))

print(sum(scores)/len(scores))

In [None]:
# Predict a rewrite prompt for every test row, four rows per map call, dropping
# the two input text columns from the result.
ds2 = test_dataset.map(
    lambda x, idx: {'rewrite_prompt': pred_for_ds(x, idx)},
    with_indices=True,
    batched=True,
    batch_size=4,
    remove_columns=['original_text', 'rewritten_text'],
)

# Write the predictions out as the submission file.
ds2.to_csv("submission.csv", index=False)

In [None]:
# Show the first predicted prompt as a quick look at the submission contents.
ds2['rewrite_prompt'][0]