In [1]:
#load ft'd model for evaluation
!pip install -q -U accelerate bitsandbytes transformers peft
import torch
import gc
import pandas as pd
import time
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer
from peft import PeftModel, PeftConfig
# Drop references that may be left over from a previous run of this cell, then
# collect garbage and empty the CUDA cache before loading a new model.
base_model = None
model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache()

# Paths to the base checkpoint and to the fine-tuned PEFT weights applied on top of it.
base_modelName = "/kaggle/input/gemma/transformers/7b-it/3"
tuned_modelName = "/kaggle/input/gem7_ft_ds2/transformers/cpltsonly_custds/1"

tokenizer = AutoTokenizer.from_pretrained(base_modelName)

# Compute in bfloat16 when the GPU reports support for it, otherwise fall back to float16.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
    llm_int8_enable_fp32_cpu_offload=True,  # if you're really pushing the memory threshold
)
model = AutoModelForCausalLM.from_pretrained(
    base_modelName,
    quantization_config=bnb_config,
    device_map="auto",
)

# Read the PEFT config stored with the tuned weights (kept in `config`; not used
# again in this cell), then wrap the quantized base model with those weights.
config = PeftConfig.from_pretrained(tuned_modelName)
model = PeftModel.from_pretrained(model, tuned_modelName)


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
#run inference
# Generates a predicted rewrite prompt for every row of the validation CSV with the
# model/tokenizer loaded in the first cell, and streams the results to `fname`.
eval_df = pd.read_csv("/kaggle/input/rewrites/crowdsourced_dataset_2_val.csv")
fname="gem7_ft_custds_eval2.csv"
SAVE_EVERY = 20  # append results to disk after this many generated examples
OUTPUT_COLUMNS = ["original_text", "predicted_prompt", "true_prompt"]
gc.collect()
torch.cuda.empty_cache()
decoded_outputs = []
# Start the results file with only the header row; data rows are appended in batches below.
output_df = pd.DataFrame(columns=OUTPUT_COLUMNS)
output_df.to_csv(fname, index=False)
# Few-shot examples built from three fixed training rows. They are only used if the
# commented-out line inside `test_template` below is re-enabled.
train_df = pd.read_csv("/kaggle/input/rewrites/crowdsourced_dataset_2_train.csv")
FewShotPrompt = (f"\nExamples:"
           f"\n\nOriginal Text:\n{train_df['original_text'][300]}\n\nRewritten Text:\n{train_df['rewritten_text'][300]}\n\nPredicted Prompt:\n{train_df['rewrite_prompt'][300]}"
           f"\n\nOriginal Text:\n{train_df['original_text'][301]}\n\nRewritten Text:\n{train_df['rewritten_text'][301]}\n\nPredicted Prompt:\n{train_df['rewrite_prompt'][301]}"
           f"\n\nOriginal Text:\n{train_df['original_text'][302]}\n\nRewritten Text:\n{train_df['rewritten_text'][302]}\n\nPredicted Prompt:\n{train_df['rewrite_prompt'][302]}"
          )
# The last fragment is an f-string with doubled braces, so the finished template
# contains literal `{original_text}` / `{rewritten_text}` placeholders that are
# filled in per row with `.format(...)` inside the loop.
test_template = ("The following `Original Text` passage has been rewritten into `Rewritten Text` by the `Gemma 7b-it` "
            "LLM with a certain prompt. Your task is to carefully analyze the differences between the `Original Text` and `Rewritten Text`, "
            "and try to infer the specific prompt that was likely given to the LLM to rewrite the text in this way. Consider "
            "the writing style, meter, tone, etc of the rewritten text, and think about how it differs from the original. Then respond ONLY with "
            "the prompt that you predict would have yielded that change. Remember, focus on the *form* not the *content*, and focus on the DIFFERENCE between the original and rewritten versions, not what is similar."
###            f"{FewShotPrompt}"
            f"\n\nOriginal Text:\n{{original_text}}\n\nRewritten Text:\n{{rewritten_text}}\n\nPredicted Prompt:\n")
max_seq_length=2048
st=time.time()
model.eval()
ctr=0
# Rows generated since the last write to disk. Buffering them (instead of re-slicing
# the last 20 rows of eval_df) guarantees every row is written exactly once, even when
# len(eval_df) is not a multiple of SAVE_EVERY.
pending_rows = []
with torch.no_grad():
    for idx in range(len(eval_df)):
        prompt=test_template.format(original_text=eval_df['original_text'][idx], rewritten_text=eval_df['rewritten_text'][idx])
        # Wrap the instruction in the user/model turn markers used for this model.
        prompt = "<start_of_turn>user\n" + prompt + "<end_of_turn><start_of_turn>model\n"# + tokenizer.eos_token
        # NOTE(review): if a prompt exceeds max_seq_length, truncation may cut off the
        # trailing `<start_of_turn>model` marker -- confirm the tokenizer's truncation side.
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_seq_length).to("cuda")
        outputs = model.generate(**inputs,max_new_tokens=60,use_cache=True)#, penalty_alpha=0.6, num_beams=2)
        # Decode only the newly generated tokens (everything after the prompt tokens).
        rewrite_prompt = tokenizer.decode(outputs[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)
        # Optional post-processing of the generated text (currently disabled):
###        rewrite_prompt=rewrite_prompt.split(" model")[0].strip()
###        arr=rewrite_prompt.split("\n",maxsplit=1)
###        i = 1 if len(arr) > 1 else 0
###        rewrite_prompt=arr[i].strip()
        decoded_outputs.append(rewrite_prompt)
        pending_rows.append({
            "original_text": eval_df["original_text"][idx],
            "predicted_prompt": rewrite_prompt,
            "true_prompt": eval_df["rewrite_prompt"][idx],
        })
        is_last_row = idx == len(eval_df) - 1
        if len(pending_rows) >= SAVE_EVERY or is_last_row:  # Also save on the last iteration
            output_df_partial = pd.DataFrame(pending_rows, columns=OUTPUT_COLUMNS)
            output_df_partial.to_csv(fname, mode='a', header=False, index=False)
            pending_rows = []
        print(f"ctr={ctr}")
        ctr+=1
        # Show the first few prompt/response pairs as a sanity check.
        if ctr<=3:
            print(f"Prompt: {prompt}")
            print(f"Response: {rewrite_prompt}")

print(f"Elapsed time: {time.time()-st}")

In [None]:
#Evaluate w/ ST5
#https://www.kaggle.com/code/richolson/mistral-7b-t5-scoring#Load-sentence-t5-base
!pip install -Uq sentence_transformers
from sentence_transformers import SentenceTransformer
from transformers import T5EncoderModel
import tensorflow as tf
import numpy as np

# Sentence-embedding model shared by the scoring helpers; get_embedding() calls
# t5_model.encode(...) to turn a prompt string into a vector.
t5_model = SentenceTransformer('sentence-t5-base')

#https://github.com/brohrer/sharpened-cosine-similarity/blob/main/README.md
def scs(s: np.ndarray, k: np.ndarray, p: int = 3, q: float = 1e-6):
    """Sharpened cosine similarity between two vectors.

    Computes ``sign(s . k) * (|s . k| / ((||s|| + q) * ||k||)) ** p``.

    Args:
        s: First vector.
        k: Second vector.
        p: Exponent applied to the absolute cosine term.
        q: Small constant added to the norm of ``s``.

    Returns:
        The signed, sharpened similarity. When the denominator is zero (for
        example ``k`` is an all-zero vector) the result is 0.0 rather than NaN.
    """
    dp = np.dot(s, k)
    denom = (np.linalg.norm(s) + q) * np.linalg.norm(k)
    if denom == 0:
        # A zero-norm input also makes the dot product zero, so the division
        # would be 0/0 (NaN plus a RuntimeWarning); report "no similarity".
        return np.float64(0.0)
    cosine_sim = abs(dp / denom)
    score = np.sign(dp) * (cosine_sim ** p)
    return score
def get_embedding(text):
    """Encode `text` with the module-level `t5_model` and return the embedding as a list."""
    encoded = t5_model.encode(text, show_progress_bar=False, convert_to_tensor=True)
    # Move the tensor to host memory and convert it to NumPy before turning it into a list.
    as_array = encoded.cpu().numpy()
    return as_array.tolist()

def calculate_t5_distance(embedding1, embedding2):
    """Return the `scs` score of two embeddings.

    Both arguments are converted with `np.array` and passed to `scs` with its
    default `p` and `q`. Despite the name, the value is the similarity score
    that `scs` returns.
    """
    first_vector = np.array(embedding1)
    second_vector = np.array(embedding2)
    return scs(first_vector, second_vector)

# Quick check of the scoring helpers: score a paraphrase of the target prompt (t1)
# and a different rewrite prompt (t2) against the same target, printing one score each.
t1 = "Rewrite the text to highlight the professionalism and preparedness of the team"
t2 = "Rewrite the text as a sci-fi action sequence"
target = "Modify the following so as to highlight the professionalism and preparedness of the team"
for candidate in (t1, t2):
    print(calculate_t5_distance(get_embedding(candidate), get_embedding(target)))

In [6]:
import pandas as pd
suffix = "val"
fname = "/kaggle/working/gem7_ft_custds_eval2.csv"
df = pd.read_csv(fname)


def _score_row(row):
    """Score one results row: scs between the predicted and the true prompt embeddings."""
    # NOTE(review): str() means a missing prediction (read back as NaN) is embedded
    # as the literal text "nan"; `true_prompt` is passed through without coercion.
    predicted_embedding = get_embedding(str(row['predicted_prompt']))
    true_embedding = get_embedding(row['true_prompt'])
    return calculate_t5_distance(predicted_embedding, true_embedding)


# One score per row, then the mean over the whole file.
df['scs'] = df.apply(_score_row, axis=1)
mean_scs = df['scs'].mean()
print(f"Avg of {len(df)} prompts is {mean_scs}")
# Scores recorded from earlier runs:
#gem7_ft_cds1: .675 -> 200 steps = .681
#gem7_base_cds1: .59
#gem7_base_fewshot_cds1: .6697
#gem7_ft_cds2: .512; using completions only (v2): .56
#gem7_base_cds2: .58 
#gem7_base_fewshot_cds2: .596
#gem7_ft_custds_eval_on_cd1: .5988
#gem7_ft_custds_eval_on_cd2: .6857

Avg of 200 prompts is 0.6857184627024958
