# Install deps

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%%capture
# Upgrades torch/torchvision/torchaudio to the latest release; output is hidden by %%capture.
# NOTE(review): versions are unpinned, so a fresh run may install a different torch build
# than the 2.2.1+cu121 shown in the recorded outputs — consider pinning.
!pip3 install -U torch torchvision torchaudio

In [3]:
%%capture
# Installs Unsloth plus the training stack; the extra packages depend on the GPU's
# CUDA compute capability (major version >= 8 additionally gets flash-attn and its build deps).
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
# NOTE(review): Unsloth is installed from the tip of the git repository (unpinned).
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [4]:
import os
from getpass import getpass

# Project identifier; also used as the directory name when saving the fine-tuned adapters.
PROJECT = "unsplot-mistral7b-lpr"

# SECURITY: never hardcode access tokens in a notebook. The token is taken from the
# environment when already set, otherwise it is requested interactively (input is not echoed).
# Any token that was previously committed in this cell should be revoked and rotated.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")
os.environ["BACKUP_DIR"] = "/content/drive/MyDrive/WIP"
os.environ["VERSION"] = PROJECT

os.environ["WANDB_PROJECT"] = "kaggle-lpr"  # name your W&B project
# NOTE(review): the transformers W&B integration documents "end", "checkpoint" and "false"
# for WANDB_LOG_MODEL; this value is not one of them, so model logging may be disabled — confirm.
os.environ["WANDB_LOG_MODEL"] = "unsloth_mistral-it-7b"


In [8]:
# Optional installs: sentence_transformers is imported by the utilities cell below and
# the trainer reports to wandb, so uncomment these lines if either package is missing.
# !pip install -U -q sentence-transformers
# !pip install -U -q wandb
import torch
# Display the torch version that is actually active after the install cells.
torch.__version__

'2.2.1+cu121'

In [5]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth import FastLanguageModel
# from peft import PeftModel
import re
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy import stats

# Utility functions
# Module-level handle to the SentenceTransformer used by compare_phrases (assigned there).
t5base_model = None

# Function to calculate sharpened cosine similarity
def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    """Return the cosine similarity of two tensors raised to ``exponent``.

    The similarity is computed along dimension 0 of ``vec1`` and ``vec2``
    and the result is returned as a tensor.
    """
    functional = torch.nn.functional
    base_similarity = functional.cosine_similarity(vec1, vec2, dim=0)
    return base_similarity ** exponent

#provides similarity scores of a test_phrase against an array of phrases
def compare_phrases(test_phrase, phrases):
    """Score ``test_phrase`` against each entry of ``phrases``.

    Both sides are embedded with the 'sentence-t5-base' SentenceTransformer and
    compared with ``sharpened_cosine_similarity``.

    Args:
        test_phrase: Reference phrase to compare against.
        phrases: Iterable of candidate phrases.

    Returns:
        A list of Python floats, one score per entry of ``phrases``, in order.
    """
    global t5base_model
    # Load the embedding model only once; reloading it on every call (as before)
    # repeats an expensive model load for each scored row.
    if t5base_model is None:
        t5base_model = SentenceTransformer('sentence-t5-base')
        if torch.cuda.is_available():
            t5base_model = t5base_model.to(torch.device("cuda"))
    model = t5base_model

    test_embedding = model.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)

    scores = []
    for phrase in phrases:
        compare_embedding = model.encode(phrase, convert_to_tensor=True, show_progress_bar=False)
        score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()
        scores.append(score)

    return scores

def truncate_sentence(text, max_words):
    """Keep at most ``max_words`` space-separated words of ``text``.

    Args:
        text: Text to truncate. ``None`` and float NaN (how pandas represents a
            missing CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str()``.
        max_words: Maximum number of words to keep.

    Returns:
        ``text`` unchanged when it has ``max_words`` words or fewer, otherwise
        the first ``max_words`` words joined by single spaces.
    """
    if not isinstance(text, str):
        # Previously a non-string was printed and then crashed on ``.split``.
        # ``text != text`` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # Split on a single space (not arbitrary whitespace) so newlines stay inside words.
    words = text.split(" ")
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words])


In [6]:
# Model prompt builder
class MistralInstructPromptBuilder:
    """Formats (original, rewritten[, prompt]) triples as Mistral-instruct text rows.

    Every text field is cut to 200 words with ``truncate_sentence`` before it
    is placed into the template.
    """
    # Earlier instruction variant, kept for reference:
    # instruction = """You are an AI assistant capable of aiding my comprehension of modifications made to a given text compared to the original, all within a single 30-word sentence. Avoid specific thoughts, then provide a 30-word rewrite prompt for transformation."""
    instruction = """You are an AI assistant capable of aiding my comprehension regarding changes in tone, style, and vocabulary between a modified text and its original, without specifics in content alterations. Provide a 30-word rewrite prompt for transformation."""

    def _inputs_section(self, original_text, rewritten_text):
        """Return the shared "Original Text / Rewritten Text" part of a prompt."""
        original_text = truncate_sentence(original_text, 200)
        rewritten_text = truncate_sentence(rewritten_text, 200)
        return f"**Original Text:** {original_text}\n**Rewritten Text:** {rewritten_text}"

    def create_train_row(self, original_text, rewritten_text, rewrite_prompt):
        """Build a full training row: instruction, inputs and the expected answer."""
        inputs_section = self._inputs_section(original_text, rewritten_text)
        answer = "**Rewrite Prompt:** " + truncate_sentence(rewrite_prompt, 200)
        # The separator after [/INST] is a literal backslash followed by "n", not a newline.
        return f"""<s>[INST] {self.instruction} Here are the inputs\n{inputs_section} [/INST] \\n {answer} </s>"""

    def create_test_row(self, original_text, rewritten_text):
        """Build an inference row that stops right after the closing [/INST] tag."""
        inputs_section = self._inputs_section(original_text, rewritten_text)
        return f"""<s>[INST] {self.instruction} Here are the inputs\n{inputs_section} [/INST]"""



In [7]:
# Model
class PRModel:
    """Prompt-recovery model wrapper around an Unsloth ``FastLanguageModel``.

    Either reuses an already-loaded ``model``/``tokenizer`` pair or loads one
    from ``model_name`` (4-bit, max sequence length 2048), optionally attaching
    LoRA adapters for fine-tuning.
    """

    # Prefix the prompt builder places in front of the answer; stripped from predictions.
    _ANSWER_PREFIX = "**Rewrite Prompt:**"

    def __init__(self,
                 model=None, tokenizer=None,
                 model_name=None,
                 prompt_builder=None,
                 init_peft=False,
                 device=None):
        """Create the wrapper.

        Args:
            model: Pre-loaded model. Used only when ``tokenizer`` is given as well.
            tokenizer: Pre-loaded tokenizer matching ``model``.
            model_name: Name or path passed to ``FastLanguageModel.from_pretrained``
                when no model/tokenizer pair is supplied.
            prompt_builder: Object exposing ``create_test_row(original, rewritten)``.
            init_peft: When loading from ``model_name``, wrap the model with LoRA adapters.
            device: Device the tokenized inputs are moved to; defaults to CUDA when
                available, otherwise CPU.

        Raises:
            ValueError: If neither a model/tokenizer pair nor ``model_name`` is given.
        """
        # Explicit None checks instead of relying on the objects' truthiness.
        has_loaded_pair = model is not None and tokenizer is not None
        if not has_loaded_pair and model_name is None:
            raise ValueError(
                "PRModel needs either both `model` and `tokenizer`, or a `model_name` to load."
            )

        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.device = device
        self.prompt_builder = prompt_builder
        self.infer_max_new_tokens = 100

        max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
        dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
        load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

        if not has_loaded_pair:
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
                max_seq_length = max_seq_length,
                dtype = dtype,
                load_in_4bit = load_in_4bit,
                # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
            )

            if init_peft:
                model = FastLanguageModel.get_peft_model(
                    model,
                    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
                    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                                    "gate_proj", "up_proj", "down_proj",],
                    lora_alpha = 16,
                    lora_dropout = 0, # Supports any, but = 0 is optimized
                    bias = "none",    # Supports any, but = "none" is optimized
                    use_gradient_checkpointing = True,
                    random_state = 3407,
                    use_rslora = False,  # We support rank stabilized LoRA
                    loftq_config = None, # And LoftQ
                )

        self.model = model
        self.tokenizer = tokenizer

    def eval(self):
        """Switch the wrapped model into Unsloth's inference mode."""
        FastLanguageModel.for_inference(self.model) # Enable native 2x faster inference

    def predict_prompt(self, original_text, rewritten_text):
        """Generate the rewrite prompt for one (original, rewritten) pair.

        Args:
            original_text: Text before rewriting.
            rewritten_text: Text after rewriting.

        Returns:
            The decoded continuation with surrounding whitespace and a leading
            "**Rewrite Prompt:**" marker removed.

        Raises:
            ValueError: If no ``prompt_builder`` was supplied at construction time.
        """
        if self.prompt_builder is None:
            raise ValueError("PRModel.predict_prompt requires a `prompt_builder`.")

        inputs = self.tokenizer(
        [
            self.prompt_builder.create_test_row(original_text, rewritten_text),
        ], return_tensors = "pt").to(self.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=self.infer_max_new_tokens,
            use_cache = True
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        prompt_length = len(inputs["input_ids"][0])
        outputs = [outputs[0][prompt_length:], ]
        predict = self.tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

        # Training rows use a literal backslash-n separator; turn it into a real
        # newline so the strip below removes it together with other whitespace.
        predict = predict.replace("\\n", "\n").strip()
        if predict.startswith(self._ANSWER_PREFIX):
            predict = predict[len(self._ANSWER_PREFIX):].strip()
        return predict



# 1. Prepare data & train

In [7]:
from pathlib import Path
import pandas as pd
import numpy as np

# data
# Dataset location and checkpoint directory
DATA_DIR = Path("/datasets/llm_prompt_recovery/")
CKPT_DIR = Path("./")

# Read the train and test splits with the same loader
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train_data.csv", "test_data.csv")
)
# train_df.dropna(inplace=True)

print(train_df.shape, test_df.shape)

(3200, 5) (800, 5)


In [8]:
# Preview the training text built from the first training example
MistralInstructPromptBuilder().create_train_row(
    *train_df.iloc[0][['original_text', 'rewritten_text', 'rewrite_prompt']]
)

'<s>[INST] You are an AI assistant capable of aiding my comprehension of modifications made to a given text compared to the original, all within a single 30-word sentence. Avoid specific thoughts, then provide a 30-word rewrite prompt for transformation. Here are the inputs\n**Original Text:** Hot tears rapidly stream from my eyes. As the shuddering sobs eventually subside, I am a weakened heap on the bed. A familiar coldness seeps out of the dark, deep corners of my mind and threatens to destroy everything it touches. Suddenly, the coldness draws back, and there is warmth. I look to the doorway and he is standing there. I see him, and there is no greater sight. He wipes away my tears and, with them, the burdens that threatened to consume me. He holds me, and his love heals my marred soul until I am bright and new again.\n\n**Rewritten Text:** The hot tears rapidly stream from my eyes. As the shuddering sobs eventually subside, I am a weakened heap on the bed. A familiar bitterness per

# 2. Train

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset

import gc
import torch
# Free Python garbage and PyTorch's cached CUDA memory before loading the base model.
gc.collect()
torch.cuda.empty_cache()

prompt_builder = MistralInstructPromptBuilder()
# Load the 4-bit Mistral instruct checkpoint; init_peft=True makes PRModel attach LoRA adapters.
prmodel = PRModel(
    model_name="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    # adapter_model_name = CKPT_DIR / PROJECT_NAME,
    prompt_builder = prompt_builder,
    init_peft=True,
)

# Raw model/tokenizer handles used by the formatting function, the trainer and later cells.
model, tokenizer = prmodel.model, prmodel.tokenizer

def formatting_prompts_func(examples):
    """Turn a batch of dataset columns into training texts.

    Args:
        examples: Batched mapping with the keys "original_text",
            "rewritten_text" and "rewrite_prompt", each holding a list.

    Returns:
        ``{"text": [...]}`` with one formatted training row per example.
    """
    eos_token = tokenizer.eos_token
    rows = zip(
        examples["original_text"],
        examples["rewritten_text"],
        examples["rewrite_prompt"],
    )

    texts = []
    for original_text, rewritten_text, rewrite_prompt in rows:
        text = prompt_builder.create_train_row(original_text, rewritten_text, rewrite_prompt)
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # Only append it when the builder has not already terminated the row with it,
        # so the training text never ends with a doubled EOS marker.
        if not text.endswith(eos_token):
            text += eos_token
        texts.append(text)
    return { "text" : texts, }

# Convert the training frame to a HF Dataset and add the formatted "text" column.
dataset = Dataset.from_pandas(train_df)
dataset = dataset.map(formatting_prompts_func, batched = True,)

# NOTE(review): PRModel.__init__ also sets max_seq_length = 2048 internally; keep both in sync.
max_seq_length = 2048
n_epochs = 1

# Supervised fine-tuning on the "text" column.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        report_to="wandb",
        # 4 samples per device x 4 accumulation steps = 16 samples per optimizer step.
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs=n_epochs,
        learning_rate = 2e-4,
        # Mixed precision: bf16 when the GPU supports it, otherwise fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: Tesla V100-SXM2-16GB. Max memory: 15.773 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/3200 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,200 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 200
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Currently logged in as: [33mdungvu[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,3.1868
2,3.1654
3,3.0746
4,2.6219
5,2.4189
6,2.1724
7,2.226
8,2.146
9,1.8549
10,1.8971


In [None]:
# Save the fine-tuned model into a local directory named after PROJECT.
# NOTE(review): the tokenizer is not saved here and BACKUP_DIR (set in the config cell)
# is never used — confirm whether reloading from PROJECT or a Drive backup is intended.
model.save_pretrained(PROJECT)

# 3. Predict data

In [None]:
# model = GemmaModel(
#     model_name="google/gemma-2b-it",
#     adapter_model_name = CKPT_DIR / PROJECT_NAME,
# )

# Wrap the in-memory fine-tuned model and tokenizer instead of reloading from disk.
prmodel = PRModel(
    # model_name = PROJECT,
    model=model,
    tokenizer=tokenizer,
    # adapter_model_name = CKPT_DIR / PROJECT_NAME  (not a PRModel.__init__ parameter)
    prompt_builder = prompt_builder,
)

# test
original = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
rewritten = "(Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"

# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# inputs = tokenizer(
# [
#     MistralInstructPromptBuilder().create_test_row(original, rewritten),
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 100, use_cache = True)
# outputs = [outputs[0][len(inputs["input_ids"][0]):], ]
# predict = tokenizer.batch_decode(outputs, skip_special_tokens = True)[0]

# import re
# predict = predict.replace("\\n", "\n")
# predict = re.sub(r"^\s+|\s+$", "", predict)
# if predict.startswith("**Rewrite Prompt:**"):
#     predict = predict[len("**Rewrite Prompt:**"):].strip()
# print(predict)

# Enable Unsloth inference mode, then run one sanity-check prediction on the sample pair above.
prmodel.eval()
prmodel.predict_prompt(original, rewritten)




## 3.1. Evaluate the test data

In [None]:
# Prompt used when generation fails for a row.
FALLBACK_PROMPT = "Improve the following text while maintaining the original meaning"

test_predict = []
scores = []

model.eval()
for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    try:
        prompt = prmodel.predict_prompt(row['original_text'], row['rewritten_text'])
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C still stops the loop, and report
        # the failure so an all-fallback run does not go unnoticed.
        print(f"Row {index}: prediction failed ({exc!r}); using fallback prompt")
        prompt = FALLBACK_PROMPT
    test_predict.append(prompt)
    # Score the prediction against the ground-truth prompt of the same row.
    test_scores = compare_phrases(row['rewrite_prompt'], [prompt, ])
    scores.append(test_scores[0])

# Keep the ground-truth 'rewrite_prompt' column intact: predictions go into their own
# column and a separate file, so re-running this cell still scores against the labels
# rather than against the previous run's predictions.
test_results = test_df.assign(predicted_prompt=test_predict, score=scores)
test_results.to_csv(DATA_DIR / "test_predictions.csv", index=False)

print('\nTest score stats: ', stats.describe(np.array(scores)))
print('\nMean SCS score: ', np.mean(np.array(scores)))
