# Install deps

In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
%%capture
# Upgrade the PyTorch stack and install sentence-transformers with pip.
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones recorded in this notebook's outputs.
!pip3 install -U torch torchvision torchaudio
# !pip3 install -U bitsandbytes
!pip3 install -U sentence_transformers

In [2]:
%%capture
import torch
# (major, minor) CUDA compute capability of the current device; only the major
# number is used below to choose which extra packages to install.
# NOTE(review): this call needs a visible CUDA device — confirm a GPU runtime is attached.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
# No-op statement; it has no effect on the installs above.
pass

In [3]:
import os

# Project identifier; also exported below as the VERSION environment variable.
PROJECT = "unsplot-mistral7b-lpr"

# SECURITY: the Hugging Face token must never be committed to the notebook.
# It is read from the process environment (export HF_TOKEN=... before starting
# the kernel, or use the platform's secret store). Any token that was previously
# hardcoded here should be considered leaked and revoked.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; only public (non-gated) Hugging Face models can be downloaded.")

os.environ["BACKUP_DIR"] = "/content/drive/MyDrive/WIP"
os.environ["VERSION"] = PROJECT

os.environ["WANDB_PROJECT"] = "kaggle-lpr"  # name your W&B project
# NOTE(review): the original comment said "log all model checkpoints", but the
# value looks like a model name rather than a logging mode — confirm against
# the W&B integration that consumes this variable.
os.environ["WANDB_LOG_MODEL"] = "unsloth_mistral-it-7b"

# Silence the tokenizers fork/parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [4]:
# !pip install -U -q sentence-transformers
# !pip install -U -q wandb
import torch
# Display the installed PyTorch version as the cell output.
torch.__version__

'2.2.1+cu121'

In [5]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth import FastLanguageModel
# from peft import PeftModel
import re
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy import stats

# Utility functions.
# Notebook-level handle for the sentence-T5 similarity model; it starts empty and
# is assigned by compare_phrases() and by the model-loading cell further down.
t5base_model = None

# Function to calculate sharpened cosine similarity
def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    """Return the cosine similarity of two vectors raised to ``exponent``.

    The similarity is taken along dimension 0 of ``vec1`` and ``vec2``, so both
    are expected to be 1-D tensors of equal length. The result is a tensor.
    """
    similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    sharpened = similarity.pow(exponent)
    return sharpened

#provides similarity scores of a test_phrase against an array of phrases
def compare_phrases(test_phrase, phrases):
    """Score ``test_phrase`` against every entry of ``phrases``.

    Each score is the sharpened cosine similarity between the sentence-T5
    embedding of ``test_phrase`` and the embedding of one phrase.

    The embedding model is loaded lazily and cached in the notebook-level
    ``t5base_model`` variable, so repeated calls reuse one instance instead of
    re-instantiating the model (and re-moving it to the GPU) on every call.

    Args:
        test_phrase: reference sentence.
        phrases: iterable of sentences to compare against the reference.

    Returns:
        list[float]: one score per phrase, in input order (empty for no phrases).
    """
    global t5base_model
    if t5base_model is None:
        loaded = SentenceTransformer('sentence-t5-base')
        if torch.cuda.is_available():
            loaded = loaded.to(torch.device("cuda"))
        t5base_model = loaded
    model = t5base_model

    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)

    scores = []
    for phrase in phrases:
        compare_embedding = model.encode(phrase, convert_to_tensor=True, show_progress_bar=False)
        scores.append(sharpened_cosine_similarity(test_embedding, compare_embedding).item())
    return scores

def truncate_sentence(text, max_words):
    """Return ``text`` cut down to at most ``max_words`` space-separated words.

    Text that already fits is returned unchanged. Splitting is done on single
    spaces, so other whitespace (newlines, tabs) inside a word is preserved.

    Missing values are tolerated instead of crashing on ``.split``: ``None`` and
    float NaN (what pandas yields for empty CSV cells) become ``""``; any other
    non-string value is converted with ``str()``.

    Args:
        text: the passage to shorten (normally a ``str``).
        max_words: maximum number of words to keep.

    Returns:
        str: the possibly shortened text.
    """
    if not isinstance(text, str):
        is_nan = isinstance(text, float) and text != text
        if text is None or is_nan:
            return ""
        text = str(text)

    words = text.split(" ")
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words])


# 1. Prepare data & train

In [6]:
from pathlib import Path
import pandas as pd
import numpy as np

# data
# Directory holding the train/test CSV files, and the directory used for checkpoints.
# NOTE(review): DATA_DIR is an absolute path tied to one machine — adjust when running elsewhere.
DATA_DIR = Path("/datasets/llm_prompt_recovery/")
CKPT_DIR = Path("./")

# load data
train_df = pd.read_csv(DATA_DIR / "train_data.csv")
test_df = pd.read_csv(DATA_DIR / "test_data.csv")
# Rows with missing values are kept (the dropna below is disabled).
# train_df.dropna(inplace=True)

# Report (rows, columns) of both frames, then preview the first test rows.
print(train_df.shape, test_df.shape)
test_df.head()

(3200, 5) (800, 5)


Unnamed: 0.1,Unnamed: 0,original_text,rewrite_prompt,rewritten_text,id
0,555.0,Alex held the razor blade up against his wrist...,Rewrite the story as a romcom / love story,## Butterfly Kisses and Soul Mates\n\nAlex sat...,
1,,"Dear Santa, \n \n Thanks for the BB gas gun. I...",Rewrite the story as a heartwarming tale,"In the quaint town of Snow Creek, where snowfl...",SCwCSnMXwE
2,527.0,`` You can not!'' Sir Adalhard protested as mu...,Rewrite the essay with two robots from the future,You can not!'' Sir Adalhard protested as much ...,
3,,This prompt is set in an alternate universe wh...,Rewrite the essay as if it is a science fictio...,"In the vast expanse of an alien cosmos, where ...",JfhoNgrBCG
4,,"Happy, adjective, feeling pleasure and enjoyme...",Rewrite essay as about an optimistic grocery c...,"In the bustling aisles of the grocery store, w...",zhtykeylll


In [7]:
# Keep only the held-out rows that were produced with one specific rewrite prompt;
# they are used further down as a small sample for qualitative checks.
TARGET_PROMPT = "Rewrite this article as if it were a myth being told by ancient storytellers."
choose_df = test_df[test_df['rewrite_prompt'] == TARGET_PROMPT]

# Show the prompt of the first matching row. Positional access is used instead
# of the hard-coded row label 35, which only exists for one particular CSV
# ordering and raises KeyError as soon as the data changes.
choose_df["rewrite_prompt"].iloc[0]

'Rewrite this article as if it were a myth being told by ancient storytellers.'

# 2. Train

# 3. Predict data

In [8]:
# # model = GemmaModel(
# #     model_name="google/gemma-2b-it",
# #     adapter_model_name = CKPT_DIR / PROJECT_NAME,
# # )

# prmodel = PRModel(
#     # model_name = PROJECT,
#     model=model,
#     tokenizer=tokenizer,
#     # adapter_model_name = CKPT_DIR / PROJECT_NAME
#     prompt_builder = prompt_builder,
# )

# # test
# original = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
# rewritten = "(Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"

# # FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# # inputs = tokenizer(
# # [
# #     MistralInstructPromptBuilder().create_test_row(original, rewritten),
# # ], return_tensors = "pt").to("cuda")

# # outputs = model.generate(**inputs, max_new_tokens = 100, use_cache = True)
# # outputs = [outputs[0][len(inputs["input_ids"][0]):], ]
# # predict = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

# # import re
# # predict = predict.replace("\\n", "\n")
# # predict = re.sub(r"^\s+|\s+$", "", predict)
# # if predict.startswith("**Rewrite Prompt:**"):
# #     predict = predict[len("**Rewrite Prompt:**"):].strip()
# # print(predict)

# prmodel.eval()
# prmodel.predict_prompt(original, rewritten)


In [9]:
# Load the 4-bit Mistral-7B-Instruct checkpoint and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the base model with LoRA adapters on the attention and MLP projections.
# NOTE(review): no trained adapter weights are loaded anywhere in this cell, yet the
# model is only used for inference below — confirm the adapters are intended here.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Switch the model to Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model)

# sentence similarity model
from sentence_transformers import SentenceTransformer

# Sentence-T5 encoder used to embed the short category sentences; moved to the GPU when one is available.
t5base_model = SentenceTransformer('sentence-transformers/sentence-t5-base')
if torch.cuda.is_available():
    t5base_model = t5base_model.to(torch.device("cuda"))

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Quadro RTX 5000. Max memory: 15.74 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [10]:
# Star Wars-themed sample passage
passage = """As I made my way through the Tatooine desert on my journey to the Jedi Temple to receive my lightsaber and begin my quest to bring balance to the galaxy, I was startled by the sound of humming engines rapidly approaching. I spun around, looking for the source. Suddenly, I felt something lightly bump me on the head, and immediately saw what appeared to be a Jedi Knight flying away. But I had never seen a Jedi Knight like this before. It was smaller, fluffier, and as white as snow. A shiny, maybe? I never found out.

Looking down at the ground, I saw what had hit me in the head. I had received a yellow envelope, sealed with red wax. I picked it up and flipped it over. `` Master Skywalker'' it said in spindly script.

I looked around. Aside from a few Jedi Knights I saw every day making their way to the Temple, there didn't seem to be anyone who might have had something to do with this. I figured opening a letter could n't do that much harm, so I snapped the seal and read the message within:

*To Master Skywalker,*

*We are pleased to inform you that you have been accepted at the Jedi Order. Please find enclosed a list of all necessary books and equipment. Knight training begins on 1 September. We await your lightsaber by no later than 31 July.*

No sooner had I read the words ``a list of all necessary books and equipment'' than a second page fell out of the envelope. It indeed listed the names of several books by authors with strange names, and included odd items such as lightsabers, plants I'd never heard of, and ancient artifacts. Honestly, the thought of dealing with ancient artifacts struck me as rather morbid, but I was too confused by this letter to be very concerned by it."""

# sea shanty
# passage = """(Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be
# """

prompt = f"""Give me tone, style, theme and inspired of following passage. Each should be in 1-2 words. The inspired should be neither actors, movies, writer or none and should be in title case.

For example:
Passage:
\"\"\"The sun was setting over the horizon, casting a warm glow over the fields. The air was filled with the sound of birds chirping and the smell of fresh flowers. It was a peaceful scene, one that made me feel at ease.\"\"\"
Your answer:
- Tone: Peaceful
- Style: Narrative
- Theme: Nature
- Inspired: None.

Your turn:
Passage
\"\"\"{passage}\"\"\"
Your answer:
"""

# Tokenize the same prompt twice to exercise batched (padded) generation on the GPU.
inputs = tokenizer(
[
    prompt,
    prompt,
], return_tensors = "pt", padding=True).to("cuda")

# Generate at most 100 new tokens per prompt; the EOS token id doubles as the padding id.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    use_cache = True,
    pad_token_id=tokenizer.eos_token_id,
)
# for i in range(outputs.shape[0]):
#     print(outputs[i])
# Keep only the first sequence and drop its prompt tokens so that just the completion is decoded.
outputs = [outputs[0][len(inputs["input_ids"][0]):], ]
predict = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

# Turn literal backslash-n sequences into real newlines, then strip leading/trailing whitespace.
predict = predict.replace("\\n", "\n")
predict = re.sub(r"^\s+|\s+$", "", predict)
predict

'- Tone: Surprised\n- Style: Narrative\n- Theme: Adventure, Jedi\n- Inspired: Star Wars.'

In [11]:
# s="- Tone: Surprised\n- Style: Narrative\n- Theme: Adventure, Jedi\n- Inspired: Star Wars\n\nI hope this is correct. Let me know if there's anything else I can help you with!\n\nBest,\n[Your Name]"

# categories = {}
# category_names = ["tone", "style", "theme", "inspired"]
# for line in s.split("\n"):
#     if not line:
#         continue
#     if not line.startswith("- "):
#         continue
#     line = line[2:]
#     key, value = line.split(":")
#     if key.lower() not in category_names:
#         continue
#     categories[key.strip().lower()] = value.strip()

# categories

In [12]:
# from sentence_transformers import SentenceTransformer
# # sentences = ["This is an example sentence", "Each sentence is converted"]
# sentences = [
#     f"{categories['tone']} tone",
#     f"{categories['style']} style",
#     f"{categories['theme']} theme",
#     f"inspired by {categories['inspired']}",
# ]
# print(sentences)

# model = SentenceTransformer('sentence-transformers/sentence-t5-base')
# embeddings = model.encode(sentences)
# print(embeddings)

In [25]:
import torch

class PromptRecovery:
    """Guess the rewrite prompt that turned an original text into a rewritten one.

    An instruction-tuned LLM is asked for the tone, style, theme and inspiration
    of both texts. Each attribute is turned into a short sentence, the sentences
    of the two texts are embedded pairwise with a similarity model, and the
    attribute of the rewritten text that differs most from the original is
    reported as the recovered prompt.

    Attributes:
        category_names: attribute names requested from the LLM, in output order.
        max_passage_words: number of leading words of each passage shown to the LLM.
        prompt_template: few-shot instruction; ``{text}`` receives the passage.
    """

    category_names = ["tone", "style", "theme", "inspired"]
    max_passage_words = 100

    # Whole-word match for the "none" placeholder, so that words merely
    # containing those letters (e.g. "nonetheless") are not discarded.
    _NONE_WORD = re.compile(r"\bnone\b", re.IGNORECASE)

    prompt_template = """Give me tone, style, theme and inspired of following passage. Each should be in 1-2 words. The inspired should be neither actors, movies, writer or none and should be in title case.

For example:
Passage:
\"\"\"The sun was setting over the horizon, casting a warm glow over the fields. The air was filled with the sound of birds chirping and the smell of fresh flowers. It was a peaceful scene, one that made me feel at ease.\"\"\"
Your answer:
- Tone: Peaceful
- Style: Narrative
- Theme: Nature
- Inspired: None.

Your turn:
Passage
\"\"\"{text}\"\"\"
Your answer:
"""

    def __init__(self,
                 model=None, tokenizer=None,
                 model_name=None,
                 prompt_builder=None,
                 init_peft=False,
                 similarity_model=None,
                 device=None):
        """Wrap an already loaded model/tokenizer pair, or load one by name.

        Args:
            model: loaded causal LM; used together with ``tokenizer``.
            tokenizer: tokenizer matching ``model``.
            model_name: checkpoint loaded through ``FastLanguageModel`` when
                ``model``/``tokenizer`` are not both given.
            prompt_builder: stored on the instance; not used by this class itself.
            init_peft: when loading by name, also attach LoRA adapters.
            similarity_model: object with a sentence-transformers style
                ``encode(list_of_str, convert_to_tensor=True)`` method.
            device: device the tokenized inputs are moved to; defaults to CUDA
                when available, otherwise CPU.

        Raises:
            ValueError: if neither a model/tokenizer pair nor ``model_name`` is given.
        """
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.prompt_builder = prompt_builder
        self.infer_max_new_tokens = 100
        self.similarity_model = similarity_model

        # Compare against None explicitly: truthiness of model/tokenizer objects
        # depends on their __len__/__bool__ and is not a reliable presence test.
        if model is not None and tokenizer is not None:
            self.model = model
            self.tokenizer = tokenizer
            return

        if model_name is None:
            raise ValueError("Provide either `model` and `tokenizer`, or `model_name`.")

        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
            max_seq_length = 2048,   # Choose any! We auto support RoPE Scaling internally!
            dtype = None,            # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
            load_in_4bit = True,     # Use 4bit quantization to reduce memory usage. Can be False.
            # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
        )

        if init_peft:
            model = FastLanguageModel.get_peft_model(
                model,
                r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                                "gate_proj", "up_proj", "down_proj",],
                lora_alpha = 16,
                lora_dropout = 0, # Supports any, but = 0 is optimized
                bias = "none",    # Supports any, but = "none" is optimized
                use_gradient_checkpointing = True,
                random_state = 3407,
                use_rslora = False,  # We support rank stabilized LoRA
                loftq_config = None, # And LoftQ
            )

        self.model = model
        self.tokenizer = tokenizer

    def eval(self):
        """Switch the wrapped model to Unsloth's inference mode."""
        FastLanguageModel.for_inference(self.model) # Enable native 2x faster inference

    def _parse_categories(self, text):
        """Parse one LLM completion into a ``{category: value}`` dict.

        Only lines of the form ``- <name>: <value>`` whose name is listed in
        ``category_names`` are used; matching is case-insensitive and values are
        lower-cased. A value ends at its first ``.``. Only the first ``:``
        separates name and value, so values may themselves contain colons.
        Empty values and the literal ``none`` are stored as ``None``.
        """
        # Literal backslash-n sequences are treated as line breaks.
        text = text.replace("\\n", "\n").strip().lower()

        categories = {}
        for line in text.split("\n"):
            line = line.strip()
            if not line.startswith("- "):
                continue
            key, separator, value = line[2:].partition(":")
            key = key.strip()
            if not separator or key not in self.category_names:
                continue
            value = value.split('.')[0].strip()
            categories[key] = None if value in ("", "none") else value
        return categories

    def extract_categorized_info(self, passages):
        """Ask the LLM for the categories of every passage.

        Args:
            passages: iterable of texts; each is truncated to
                ``max_passage_words`` words before being put into the prompt.

        Returns:
            list[dict]: one parsed category dict per passage, in input order.
        """
        passages = list(passages)
        if not passages:
            return []

        prompts = [
            self.prompt_template.format(text=truncate_sentence(passage, self.max_passage_words))
            for passage in passages
        ]

        # NOTE(review): batched generation with a decoder-only model presumably
        # needs left padding — confirm `tokenizer.padding_side` for this tokenizer.
        inputs = self.tokenizer(
            prompts,
            return_tensors = "pt",
            padding=True,
        ).to(self.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=self.infer_max_new_tokens,
            use_cache = True,
            # Use this instance's tokenizer, not a notebook-level global.
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # All rows are padded to one length, so the prompt occupies the same
        # number of leading positions in every generated sequence.
        prompt_length = inputs["input_ids"].shape[1]
        completions = self.tokenizer.batch_decode(
            outputs[:, prompt_length:], skip_special_tokens = True
        )
        return [self._parse_categories(completion) for completion in completions]

    def find_top_k_unsimilar(self, original_list, rewritten_list, k=1):
        """Return the ``k`` rewritten sentences least similar to their originals.

        Sentences are compared pairwise by position (``original_list[i]`` with
        ``rewritten_list[i]``) using cosine similarity of their embeddings.
        Rewritten sentences containing the word "none" are skipped.

        Args:
            original_list: sentences describing the original text.
            rewritten_list: sentences describing the rewritten text, same length.
            k: maximum number of sentences to return.

        Returns:
            list[str] | None: up to ``k`` sentences ordered from least to most
            similar, or ``None`` when no usable sentence exists.
        """
        if k < 1 or not original_list or not rewritten_list:
            return None

        # Prefer the model handed to the constructor; fall back to the
        # notebook-level `t5base_model` for callers that never passed one.
        encoder = self.similarity_model if self.similarity_model is not None else t5base_model

        original_embeddings = encoder.encode(original_list, convert_to_tensor=True)
        rewritten_embeddings = encoder.encode(rewritten_list, convert_to_tensor=True)
        similarity = torch.nn.functional.cosine_similarity(
            original_embeddings, rewritten_embeddings, dim=1
        )

        selected = []
        for position in torch.argsort(similarity).tolist():
            candidate = rewritten_list[position]
            if self._NONE_WORD.search(candidate):
                continue
            selected.append(candidate)
            if len(selected) >= k:
                break
        return selected or None

    def get_sentences(self, categories):
        """Turn a category dict into one short sentence per category.

        Categories missing from ``categories`` are rendered with ``None`` (the
        same placeholder used for an explicit "none" answer) instead of raising
        ``KeyError``; such sentences are later ignored as candidates.
        """
        sentences = [
            f"{categories.get('tone')} tone",
            f"{categories.get('style')} style",
            f"{categories.get('theme')} theme",
            f"inspired by {categories.get('inspired')}",
        ]
        return sentences

    def predict_prompt(self, original_text, rewritten_text):
        """Predict the rewrite prompt for one (original, rewritten) pair.

        Returns:
            str | None: ``"Rewrite this into ..."`` built from the most changed
            attribute, or ``None`` when no attribute could be selected.
        """
        list_categories = self.extract_categorized_info([original_text, rewritten_text])
        original_prompts, rewritten_prompts = [self.get_sentences(cat) for cat in list_categories]
        predicts = self.find_top_k_unsimilar(original_prompts, rewritten_prompts)
        if predicts is not None:
            return f"Rewrite this into {', '.join(predicts)}"
        return None

In [20]:
import torch

# Sanity check: argsort returns the indices that sort the tensor in ascending order.
t = torch.tensor([1, 2, 3])
torch.argsort(t).tolist()

[0, 1, 2]

In [26]:
# Build the recovery pipeline from the LLM, tokenizer and similarity model loaded earlier.
prmodel = PromptRecovery(
    # model_name="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    # init_peft=True,
    model=model, tokenizer=tokenizer,
    similarity_model=t5base_model,
)

# NOTE(review): list_categories is assigned but never filled in this cell.
list_categories = []
# Qualitative check: print the predicted prompt for every row of the selected sample.
for index, row in choose_df.iterrows():
    prompt = prmodel.predict_prompt(row['original_text'], row['rewritten_text'])
    print(prompt)

['angry, depressed tone', 'confessional style', 'abuse, trauma theme', 'inspired by wicca']
['dark, melancholic tone', 'descriptive, narrative style', 'abuse, neglect, isolation theme', 'inspired by None']
tensor([0.9176, 0.7944, 0.9277, 0.7834], device='cuda:0')
Rewrite this into descriptive, narrative style


In [15]:
# from sentence_transformers import SentenceTransformer
# # sentences = ["This is an example sentence", "Each sentence is converted"]

# def get_sentences(categories):
#     sentences = [
#         f"{categories['tone']} tone",
#         f"{categories['style']} style",
#         f"{categories['theme']} theme",
#         f"inspired by {categories['inspired']}",
#     ]
#     return sentences

# sentences = [get_sentences(c) for c in list_categories]
# # print(sentences)

# t5base_model = SentenceTransformer('sentence-transformers/sentence-t5-base')
# embeddings = [t5base_model.encode(s, convert_to_tensor=True) for s in sentences]
# # print(embeddings[0].shape)

# if torch.cuda.is_available():
#     t5base_model = t5base_model.to(torch.device("cuda"))

# cosine_similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=1)
# print(cosine_similarity)

## 3.1. Evaluate the test data

In [16]:
import pandas as pd

# Prompt submitted for a row when recovery raises or yields no prediction.
FALLBACK_PROMPT = "Improve the following text while maintaining the original meaning."
# The submission is written to and read back from the same path.
SUBMISSION_PATH = "submission.csv"

test_df = pd.read_csv("/kaggle/input/llm-prompt-recovery/test.csv")

test_predict = []
scores = []
n_fallbacks = 0

model.eval()
for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    try:
        prompt = prmodel.predict_prompt(row['original_text'], row['rewritten_text'])
    except Exception:
        # Best effort: one bad row must not abort the whole submission.
        # `Exception` (not a bare `except`) keeps Ctrl-C / kernel interrupts working.
        prompt = None
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if prompt is None:
        prompt = FALLBACK_PROMPT
        n_fallbacks += 1
    test_predict.append(prompt)
    # test_scores = compare_phrases(row['rewrite_prompt'], [prompt, ])
    # scores.append(test_scores[0])

# Surface how often the fallback was needed instead of hiding failures silently.
print(f"{n_fallbacks} of {len(test_predict)} rows used the fallback prompt")

# test_df['rewrite_prompt'] = test_predict
# test_df['score'] = scores
# test_df.to_csv(DATA_DIR / "test_data.csv", index=False)

# print('\nTest score stats: ', stats.describe(np.array(scores)))
# print('\nMean SCS score: ', np.mean(np.array(scores)))

# write submission
test_df['rewrite_prompt'] = test_predict
test_df = test_df[['id', 'rewrite_prompt']]

test_df.to_csv(SUBMISSION_PATH, header=True, index=False)
sub = pd.read_csv(SUBMISSION_PATH)
sub.head()
