# Install deps

In [None]:
%load_ext autoreload
%autoreload 2

import os

# SECURITY: a Hugging Face token used to be hard-coded on this line. A token
# that has been saved in a notebook must be treated as leaked: revoke it in the
# Hugging Face account settings and create a new one. The token is now read
# from the environment and only prompted for (without echo) when it is missing.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

# Google Drive folder used as the destination/source of the backup copies.
os.environ["BACKUP_DIR"] = "/content/drive/MyDrive/WIP"
# Suffix appended to backed-up artifacts so runs do not overwrite each other.
os.environ["VERSION"] = "01-peft-30epochs"

In [None]:
# Mount Google Drive at /content/drive; the dataset copy and the backup
# commands in this notebook read from and write to paths under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install (or upgrade, -U) autotrain-advanced, then run its Colab setup step.
# Only stdout is redirected into the *_logs.txt files, so anything the tools
# write to stderr still shows up below the cell.
!pip install -U autotrain-advanced > install_logs.txt
!autotrain setup --colab > setup_logs.txt

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 4.23.4 which is incompatible.[0m[31m
[0m> [1mINFO    Installing latest xformers[0m
> [1mINFO    Successfully installed latest xformers[0m


In [None]:
# sentence-transformers supplies the embedding model used for scoring
# (-U upgrades an existing install, -q reduces pip's output).
!pip install -U -q sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/156.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy import stats

def sharpen_cosine_similarity(x, y, power=3):
    """Return the cosine similarity of two vectors raised to ``power``.

    Args:
        x: First vector (any 1-D array-like accepted by numpy).
        y: Second vector, same length as ``x``.
        power: Exponent applied to the cosine similarity. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        ``cos(x, y) ** power``. When either vector has zero norm the cosine
        is undefined; 0.0 is returned instead of letting a NaN propagate
        into every aggregate computed from the scores.
    """
    vec_a = np.asarray(x)
    vec_b = np.asarray(y)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        # Guard against division by zero for all-zero vectors.
        return 0.0
    return (np.dot(vec_a, vec_b) / norm_product) ** power

def calculate_score(predicts, targets):
    """Score predicted texts against target texts, pair by pair.

    Both lists are embedded with the 'sentence-transformers/sentence-t5-base'
    model (loaded anew on every call) and each (target, predict) pair is
    scored with ``sharpen_cosine_similarity``.

    Args:
        predicts: List of predicted texts.
        targets: List of reference texts, aligned with ``predicts``.

    Returns:
        A list with one score per pair; pairing stops at the shorter input.
    """
    encoder = SentenceTransformer('sentence-transformers/sentence-t5-base')
    predict_vectors = encoder.encode(predicts)
    target_vectors = encoder.encode(targets)

    pair_scores = []
    for target_vec, predict_vec in zip(target_vectors, predict_vectors):
        pair_scores.append(sharpen_cosine_similarity(target_vec, predict_vec))
    return pair_scores

def truncate_sentence(text, max_words):
    """Keep at most ``max_words`` space-separated words of ``text``.

    Args:
        text: Text to shorten. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str``.
        max_words: Maximum number of words to keep.

    Returns:
        ``text`` unchanged when it already fits, otherwise its first
        ``max_words`` words joined by single spaces.
    """
    if not isinstance(text, str):
        # Previously a non-string was printed and then crashed on ``.split``.
        # ``text != text`` is only true for NaN.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    words = text.split(" ")
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words])

def generate_gemma_prompt(original_text, rewritten_text, rewrite_prompt=None, max_length_each=500):
    """Build the Gemma chat-format prompt for one (original, rewritten) pair.

    Args:
        original_text: Text before the rewrite.
        rewritten_text: Text after the rewrite.
        rewrite_prompt: Expected answer. When truthy it is appended as the
            model turn (closed with ``<end_of_turn>``); otherwise the prompt
            ends right after ``<start_of_turn>model``.
        max_length_each: Word limit applied to each text via
            ``truncate_sentence``.

    Returns:
        The assembled prompt string.
    """
    # Kept verbatim (typos included): the same wording is used to build the
    # training texts and the prompts sent to the model at prediction time.
    instruction_text = 'Generate a rewrite_prompt that effectively transforms the provided original_text into the provided rewritten_text. The rewrite_prompt must be clearly explain how to the original_text is transformed to the rewritten_text, focus on explaining the changes of tone, writting style, publishing, etc. Keep the rewrite_prompt concise, less than 100 words.'

    short_original = truncate_sentence(original_text, max_length_each)
    short_rewritten = truncate_sentence(rewritten_text, max_length_each)

    prompt_lines = [
        f"<start_of_turn>user {instruction_text}",
        "Here is the given texts:",
        "# original_text:",
        short_original,
        "",
        "# rewritten_text:",
        short_rewritten,
        "<end_of_turn>",
        "<start_of_turn>model",
    ]
    text = "\n".join(prompt_lines)

    if rewrite_prompt:
        text += "\n" + truncate_sentence(rewrite_prompt, max_length_each) + "<end_of_turn>"

    return text

class GemmaModel:
    """4-bit quantized causal LM, optionally wrapped with a PEFT adapter, that predicts rewrite prompts."""

    def __init__(self, model_name, adapter_model_name=None, device="cuda"):
        """Load the tokenizer, the quantized base model and, if given, the adapter.

        Args:
            model_name: Name or path of the base model, passed to
                ``from_pretrained`` for both the model and the tokenizer.
            adapter_model_name: Name or path of a PEFT adapter. When ``None``
                (the default) the base model is used without an adapter.
            device: Device the prompt token ids are moved to before
                generation.
        """
        self.device = device

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization_config
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Only wrap with PEFT when an adapter was actually requested; the
        # default of None used to be passed straight to PeftModel.
        if adapter_model_name is not None:
            model = PeftModel.from_pretrained(model, adapter_model_name)
        self.model = model

    def predict_prompt(self, original_text, rewritten_text, max_new_tokens=300):
        """Generate the rewrite prompt for one (original, rewritten) pair.

        Args:
            original_text: Text before the rewrite.
            rewritten_text: Text after the rewrite.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The text of the model turn (between ``<start_of_turn>model`` and
            the next ``<end_of_turn>``, with ``<eos>`` removed). If that
            extraction fails, the full output decoded without special tokens.
        """
        prompt = generate_gemma_prompt(original_text, rewritten_text)
        prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)

        # Greedy decoding (do_sample=False).
        outputs = self.model.generate(
            prompt_ids,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )

        try:
            output = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
            output = output.split("<start_of_turn>model", 1)[1].split("<end_of_turn>")[0].replace("<eos>", "")
            return output
        except Exception as e:
            # Was `except e as Exception:`, which raised NameError instead of
            # reaching this fallback.
            print(e)
            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)



# 1. Prepare data

In [None]:
# Copy the dataset folder from Google Drive to /content/data. The source path
# is quoted because an unquoted `[LPR]` is a shell glob character class, not a
# literal part of the folder name.
! cp -r '/content/drive/MyDrive/Kaggle/LLM/rewrite_prompt/data/[LPR]_ourgen_1003' /content/data

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Working directories: datasets are read from ./data, and the trained project
# directory is looked up relative to the current directory.
CKPT_DIR = Path("./")
DATA_DIR = Path("./data")

# Read both splits; only the training split has rows with missing values dropped.
train_df = pd.read_csv(DATA_DIR / "train_data.csv").dropna()
test_df = pd.read_csv(DATA_DIR / "test_data.csv")

print(train_df.shape, test_df.shape)

(3973, 4) (995, 4)


In [None]:
# Build the 'text' column (Gemma chat format, answer included) and save the
# result as train.csv inside the data directory.

def apply_generate_prompt(row):
    """Render one training row as a complete Gemma prompt including its rewrite_prompt."""
    return generate_gemma_prompt(
        original_text=row["original_text"],
        rewritten_text=row["rewritten_text"],
        rewrite_prompt=row["rewrite_prompt"],
    )

gemma_texts = train_df.apply(apply_generate_prompt, axis=1)
train_df["text"] = gemma_texts
train_df.to_csv(DATA_DIR / "train.csv", index=False)

# 2. Train

In [None]:
import os
# Hyperparameters of the fine-tuning run.
learning_rate = 2e-4
num_epochs = 30
batch_size = 1
block_size = 1024
trainer = "sft"
warmup_ratio = 0.1
weight_decay = 0.01
gradient_accumulation = 4
mixed_precision = "fp16"
peft = True
quantization = "int4"
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05

PROJECT_NAME = "gemma-2bit"

# Export every setting as a string environment variable so shell commands can
# read them as ${NAME}.
# `trainer` used to be the only setting that was defined but never exported;
# it is now available as TRAINER. NOTE: it only takes effect if the autotrain
# command passes it on (e.g. `--trainer ${TRAINER}`).
_autotrain_env = {
    "PROJECT_NAME": PROJECT_NAME,
    "MODEL_NAME": "google/gemma-2b-it",
    "LEARNING_RATE": learning_rate,
    "NUM_EPOCHS": num_epochs,
    "BATCH_SIZE": batch_size,
    "BLOCK_SIZE": block_size,
    "TRAINER": trainer,
    "WARMUP_RATIO": warmup_ratio,
    "WEIGHT_DECAY": weight_decay,
    "GRADIENT_ACCUMULATION": gradient_accumulation,
    "MIXED_PRECISION": mixed_precision,
    "PEFT": peft,
    "QUANTIZATION": quantization,
    "LORA_R": lora_r,
    "LORA_ALPHA": lora_alpha,
    "LORA_DROPOUT": lora_dropout,
}
for _env_name, _env_value in _autotrain_env.items():
    os.environ[_env_name] = str(_env_value)

In [None]:
# Launch the fine-tuning run with the autotrain CLI. The ${...} placeholders
# are named after the environment variables set through os.environ earlier in
# the notebook; the training data is read from data/ using the `text` column.
# NOTE(review): no --trainer flag is passed, and the Params logged by this run
# report trainer='default' — confirm that this is the intended trainer.
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
--quantization ${QUANTIZATION} \
--peft \
--mixed-precision ${MIXED_PRECISION}

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  warn(
> [1mINFO    Running LLM[0m
> [1mINFO    Params: Namespace(version=False, text_column='text', rejected_text_column='rejected', prompt_text_column='prompt', model_ref=None, warmup_ratio=0.1, optimizer='adamw_torch', scheduler='linear', weight_decay=0.01, max_grad_norm=1.0, add_eos_token=False, block_size=1024, peft=True, lora_r=16, lora_alpha=32, lora_dropout=0.05, logging_steps=-1, evaluation_strategy='epoch', save_total_limit=1, save_strategy='epoch', auto_find_batch_size=False, mixed_precision='fp16', quantization='int4', model_max_length=1024, trainer='default', target_modules=None, merge_adapter=False, use_flash_attention_2=False, dpo_beta=0.1, chat_template=None, padding=None, train=True, deploy=False, inference=False, username=None, backend='local-cli', token=None, repo_id=None, push_to_hub=False, model='google/gemma-2b-it', project_name='gemma-2bit', seed=42, epochs=30, gradient_

In [None]:
# Persist the trained project directory to Google Drive, stored as
# ${BACKUP_DIR}/${PROJECT_NAME}-${VERSION}.
! cp -r ${PROJECT_NAME} ${BACKUP_DIR}/${PROJECT_NAME}-${VERSION}

In [None]:
# Copy a backed-up project directory from ${BACKUP_DIR} back into the working
# directory (the reverse of the backup copy) — presumably so prediction can run
# on a fresh runtime without retraining.
! cp -r ${BACKUP_DIR}/${PROJECT_NAME}-${VERSION} ${PROJECT_NAME}

# 3. Predict data

In [None]:
# Sample pair used as a quick sanity check of the fine-tuned model.
original = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
rewritten = "Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"

# Base model plus the adapter found in the trained project directory.
model = GemmaModel(model_name="google/gemma-2b-it", adapter_model_name=CKPT_DIR / PROJECT_NAME)

# The last expression is displayed as the cell output.
model.predict_prompt(original, rewritten, max_new_tokens=300)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

'\nreimagine this song as a folk ballad, with lyrics that are simple yet powerful, yet evocative...'

## 3.1. Evaluate the test data

In [None]:
# Predict a rewrite prompt for every test row, then score the predictions
# against the reference prompts.
test_predict = []

model.model.eval()
with torch.no_grad():
    for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        try:
            prompt = model.predict_prompt(row['original_text'], row['rewritten_text'], max_new_tokens=300)
        except Exception as exc:
            # Best effort: a failing row gets an empty prediction, but the
            # failure is reported instead of being silently swallowed (the
            # bare `except:` also caught KeyboardInterrupt).
            print(f"Prediction failed for row {index!r}: {exc!r}")
            prompt = ""
        test_predict.append(prompt)

test_df['predict_before'] = test_predict

# Score the predictions generated above. The previous code read
# test_df['predict'], a column this cell never creates.
score = calculate_score(
    test_predict,
    test_df['rewrite_prompt'].to_list(),
)
test_df['score'] = score
# index=False (as for train.csv) so reloading the file does not pick up an
# extra unnamed index column.
test_df.to_csv(DATA_DIR / "test_data.csv", index=False)

print('Test score stats: ', stats.describe(np.array(score)))
print('Mean SCS score: ', np.mean(np.array(score)))


  1%|          | 9/995 [02:32<3:52:39, 14.16s/it]

In [None]:
# Back up the ./data directory (input CSVs plus the files this notebook wrote
# into it) to ${BACKUP_DIR}/data-${VERSION}.
! cp -r ./data ${BACKUP_DIR}/data-${VERSION}