In [1]:
TASK = "REG" # "CLASSIF" # "CAUSAL" # "REG"
BASE_MODEL = "microsoft/Phi-3-mini-4k-instruct"
MAX_LENGTH  = 1656
WITH_AUG = True
LORA_PATH = "/home/ec2-user/SageMaker/Judge-LLM/reg/06_05_2024_15_31/results/checkpoint-2368" # AUG Model
# LORA_PATH = "/home/ec2-user/SageMaker/Judge-LLM/reg/03_05_2024_09_26/results/checkpoint-5187"

In [8]:
from peft import PeftModel
from accelerate import notebook_launcher
from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, 
                          Phi3ForCausalLM, Phi3ForSequenceClassification, pipeline)
import torch
import re
import pandas as pd
import json, re
from tqdm.auto import tqdm
import os
from datetime import datetime
import numpy as np
from datasets import load_from_disk
from scipy.stats import pearsonr,spearmanr, kendalltau, mode
from sklearn.metrics import (r2_score, 
                             mean_squared_error,
                             root_mean_squared_error,
                             mean_absolute_error,
                            accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report)


def load_model(BASE_MODEL, lora_path, task, device, merge_unload = False):
    num_labels = 1 if task == "REG" else 5
    
    if task == "CAUSAL":
        model = Phi3ForCausalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, 
                                                          device_map = device, 
                                                          torch_dtype=torch.bfloat16,
                                                          attn_implementation = "flash_attention_2", 
                                                          )
        
    else:
        model = Phi3ForSequenceClassification.from_pretrained(BASE_MODEL, trust_remote_code=True, 
                                                          device_map = device, 
                                                          torch_dtype=torch.bfloat16,
                                                          attn_implementation = "flash_attention_2", 
                                                              num_labels = num_labels
                                                          )
        
    
    if model.config.pad_token_id is None:
        model.config.pad_token_id = model.config.eos_token_id
                                                          
                                                          
    if lora_path is None: return model.eval()
                                        
    peft_model = PeftModel.from_pretrained(model,
                                          lora_path, 
                                           device_map = device)
    
    peft_model = peft_model.eval()
    if merge_unload: peft_model = peft_model.merge_and_unload()
    
    return peft_model


def predict(model, tokenizer, test_data:list, task, MAX_LENGTH, BATCH_SIZE):
    RESULT = []

    with torch.no_grad():
            
        if task in ["CLASSIF", "REG"]:
            BATCHES = [test_data[i:i + BATCH_SIZE] for i in range(0, len(test_data), BATCH_SIZE)]

            for batch in tqdm(BATCHES):
                inputs = tokenizer(batch, truncation= True, max_length=MAX_LENGTH, padding="max_length", 
                           return_tensors = "pt").to(model.device)


                logits = model(**inputs).logits.cpu().to(torch.float32)

                if task == "CLASSIF": scores = torch.softmax(logits, dim = 1).argmax(axis = 1).numpy()
                elif task == "REG": scores = np.clip(logits.squeeze().numpy(), 1,5).tolist()

                RESULT.extend(scores)

        else:
            for text in tqdm(test_data):
                inputs = tokenizer(text, truncation = True, max_length = MAX_LENGTH, padding="max_length", 
                           return_tensors = "pt").to(model.device)

                predicted_tokens = model.generate(**inputs, max_new_tokens = 256, do_sample = False, use_cache = True,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id = tokenizer.pad_token_id)[0]

                
                generated_text = tokenizer.decode(predicted_tokens, skip_special_tokens = True)
                
                try: pred_score = float(re.findall(r"\[RESULT\] \w",generated_text)[0].replace("[RESULT]","").strip())
                except: pred_Score = -1
                
                RESULT.append([pred_score])


    return RESULT

In [3]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast = False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Data Prep

### With and without Augmented Version

In [4]:
sys_ref = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate and a reference answer that gets a score of 5 are given.
1. Write a detailed feedback that assess the quality of the response.
2. After writing a feedback, write a score that is an integer between 1 and 5. You can refer to the reference answer (which has a perfect score of 5) to get the idea for scoring.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations."""

sys_rubric = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations."""

sys_only = """###Task Description:
An instruction (might include an Input inside it) and a response to evaluate are given.
1. Write a detailed feedback that assess the quality of the response strictly.
2. After writing a feedback, write a score that is an integer between 1 and 5.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations."""

sys_ref_rubric = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general. You can also refer to the reference answer (which has a perfect score of 5) to get the idea for scoring.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations."""

sys_map_prom = {"sys_only":sys_only, "ref_only":sys_ref, "rubric_only":sys_rubric, "ref_rub": sys_ref_rubric}


def create_augmented_prompt(inst:str):
    inst = inst[inst.find("###The instruction to evaluate"):]
    if "###Reference Answer" not in inst:
        sys_prompt = sys_map_prom["rubric_only"]
        que_ans = "<|user|>\n" + inst.replace("###Feedback:", "").strip()
        
        prompt = "<|system|>\n" + sys_prompt + "<|end|>\n\n" + que_ans + "<|end|>\n\n<|assistant|>\n"
        return prompt.replace("###Feedback:", "").strip('"')
    


def create_test_prompt(text):
    if WITH_AUG and "###Reference Answer" not in text:  return create_augmented_prompt(text)
    
    text = text.strip(",\n")
    inst = text.replace("###Task Description:\n","<|system|>\n")
    inst = inst.replace("\n\n###The", "<|end|>\n\n<|user|>\n###The")
    inst = inst.replace("\n\n###Feedback:", "<|end|>\n\n<|assistant|>\n")
    
    return inst.replace("###Feedback:", "").strip('"')



def create_judgelm_panda_prompt(que, ans, ref = ""):
    que = str(que)
    ans = str(ans)
    ref = str(ref)
    
    sys_prompt = sys_map_prom["ref_only"] if len(ref) else sys_map_prom["sys_only"]
    
    que = "###The instruction to evaluate:\n"+ que.strip() + "\n\n"
    ans = "###Response to evaluate:\n" + ans.strip()
    
    if len(ref): ans += ("\n\n###Reference Answer (Score 5):\n" + ref.strip())
    
    return "<|system|>\n" + sys_prompt + "<|end|>\n\n" + que + ans + "<|end|>\n\n<|assistant|>\n"


### Prometheus 1,2 Test Data

In [5]:
feedback_ood = pd.read_json("feedback_collection_ood_test.json") # GPT-4 only. No Human scores . Absolute || REF Ans
feedback_ood["prompt"] = feedback_ood["instruction"].apply(create_test_prompt)

feedback_test = pd.read_json("./feedback_collection_test.json") # GPT-4 only. No Human scores . Absolute || REF Ans
feedback_test["prompt"] = feedback_test["instruction"].apply(create_test_prompt)

vicuna = pd.read_json("./vicuna_eval.json") # Absolute. GPT-4 Scores only. Absolute || REF Ans
vicuna["prompt"] = vicuna["instruction"].apply(create_test_prompt)

flask = pd.read_json("./flask_eval.json") # Absolute. Human + GPT-4 scores
flask["prompt"] = flask["instruction"].apply(create_test_prompt)

pref_coll = pd.read_json("./preference_collection_ood_test.json") # REF + RUB || Relative || GPT-4, I guess
pref_coll["chosen_prompt"] = pref_coll["chosen_instruction"].apply(create_test_prompt)
pref_coll["rejected_prompt"] = pref_coll["rejected_instruction"].apply(create_test_prompt)

alpaca = pd.read_json("alpaca_eval.json") # Human + Relative + Rubric + No REF
alpaca["chosen_prompt"] = alpaca["chosen_instruction"].apply(create_test_prompt)
alpaca["rejected_prompt"] = alpaca["rejected_instruction"].apply(create_test_prompt)

hhh = pd.read_json("./hhh_alignment_eval.json") # Human. Relative. No TIES. Human preference
hhh["chosen_prompt"] = hhh["chosen_instruction"].apply(create_test_prompt)
hhh["rejected_prompt"] = hhh["rejected_instruction"].apply(create_test_prompt)

mt_human = pd.read_json("./mt_bench_human_judgement_eval.json") # Relative WITH ties. Human Preference
mt_human["chosen_prompt"] = mt_human["chosen_instruction"].apply(create_test_prompt)
mt_human["rejected_prompt"] = mt_human["rejected_instruction"].apply(create_test_prompt)

# auto_j = pd.read_json("./autoj_pairwise.json", lines = True) # We'll see later

# mt_eval = pd.read_json("./mt_bench_eval.json", lines = True) # NO Scores for anything. Not using it


DATA_MAP = {"feedback_ood":feedback_ood, "feedback_test":feedback_test, "vicuna":vicuna, "flask":flask, 
 "pref_coll":pref_coll, "alpaca":alpaca, "hhh":hhh, "mt_human":mt_human}

### JudgeLM Test Data

In [6]:
judge_que_ans = pd.read_json("./judgelm_val_5k.jsonl", lines = True)
judge_review = pd.read_json("./judgelm_val_5k_gpt4.jsonl", lines = True)
judge_review_with_ref = pd.read_json("./judgelm_val_5k_gpt4_with_reference.jsonl", lines = True)
judge_ref_ans = pd.read_json("./judgelm_val_5k_references.jsonl", lines = True)

judgelm = pd.merge(judge_que_ans, judge_ref_ans, on = "question_id").merge(judge_review, on = "question_id").merge(judge_review_with_ref, on = "question_id").drop(
    ["score_x", "review_id_x", "review_id_y", "metadata_x", "answer1_id_x", "answer2_id_x", "reviewer_id_x", "answer1_id_y", "answer2_id_y", "reviewer_id_y", "metadata_y", "metadata_y", 
     "answer1_model_id", "answer2_model_id", "answer1_metadata", "answer2_metadata", "question_body_y"], axis = 1).rename(
         columns = {"score_y": "score", "score": "score_with_ref"}
     )

judgelm["prompt_1"] = judgelm.apply(lambda row: create_judgelm_panda_prompt(row["question_body_x"], row["answer1_body"]), axis = 1)
judgelm["prompt_2"] = judgelm.apply(lambda row: create_judgelm_panda_prompt(row["question_body_x"], row["answer2_body"]), axis = 1)

judgelm["prompt_1_ref"] = judgelm.apply(lambda row: create_judgelm_panda_prompt(row["question_body_x"], row["answer1_body"], row["reference"]["text"]), axis = 1)
judgelm["prompt_2_ref"] = judgelm.apply(lambda row: create_judgelm_panda_prompt(row["question_body_x"], row["answer2_body"], row["reference"]["text"]), axis = 1)

### PandaLM Test Data

In [9]:
panda = pd.read_json("./pandalm_testset-v1.json")

def get_agreement(row):
    lis = row[['annotator1', 'annotator2', 'annotator3']].astype(int).values.tolist()
    if len(set(lis)) == 3: return "DROP"
    return mode(lis).mode
    

panda["winner"] = panda.apply(get_agreement, axis = 1)

panda["prompt_1"] = panda.apply(lambda row: create_judgelm_panda_prompt(
    row["instruction"] + "\n" + row["input"], row["response1"], ""), axis = 1)

panda["prompt_2"] = panda.apply(lambda row: create_judgelm_panda_prompt(
    row["instruction"] + "\n" + row["input"], row["response2"], ""), axis = 1)

# Inference

In [10]:
model = load_model(BASE_MODEL = BASE_MODEL, 
                   lora_path = LORA_PATH, 
                   task = TASK, device = "cuda:0", merge_unload = True)

# print(model)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.09it/s]
Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluate

In [11]:
def evaluate_df(df, model, tokenizer, task, MAX_LENGTH, BATCH_SIZE, return_scores = False):
    test_data = df.copy(deep = True)
    eval_type = "relative" if "chosen_prompt" in test_data.columns else "absolute"
    
    human, gpt = [], []
    if eval_type == "absolute":
        texts = test_data["prompt"].values.tolist()
        
        try:
            gpt_4 = [np.mean(i) for i in test_data["gpt4_score"].values]
            human = [np.mean(i) for i in test_data["human_score"].values]
        except Exception as e:
            print(e)
    
        predictions = predict(model, tokenizer, texts, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7)
        pred_labels = [float(i) for i in predictions]
        
        if return_scores: return pred_labels
        
        results = {"GPT":{}, "Human": {}}
        
        if gpt_4:
            results["GPT"] = {"Pearson_r": round(pearsonr(gpt_4, pred_labels).statistic, 2), 
                           "Spearman_r": round(spearmanr(gpt_4, pred_labels).statistic, 2) , 
                           "Kendall_tau": round(kendalltau(gpt_4, pred_labels).statistic, 2),
                           "R_2": round(r2_score(gpt_4, pred_labels), 2),
                           "MSE": round(mean_squared_error(gpt_4, pred_labels), 2), 
                           "MAE": round(mean_absolute_error(gpt_4, pred_labels), 2) 
                          }
    
        if human:
            results["Human"] =  {"Pearson_r": round(pearsonr(human, pred_labels).statistic, 2), 
                                 "Spearman_r": round(spearmanr(human, pred_labels).statistic, 2) , 
                                   "Kendall_tau": round(kendalltau(human, pred_labels).statistic, 2),
                                   "R_2": round(r2_score(human, pred_labels), 2),
                                   "MSE": round(mean_squared_error(human, pred_labels), 2), 
                                   "MAE": round(mean_absolute_error(human, pred_labels), 2) }
                   
    
    else:
        text_chosen = test_data["chosen_prompt"].values.tolist()
        text_rejected = test_data["rejected_prompt"].values.tolist()
        
        predictions_chosen = predict(model, tokenizer, text_chosen, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7)
        pred_labels_chosen = np.array([float(i) for i in predictions_chosen])
        
        predictions_rejected = predict(model, tokenizer, text_rejected, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7)
        pred_labels_rejected = np.array([float(i) for i in predictions_rejected])
        
        if return_scores: return (predictions_chosen, predictions_rejected)
        results = {"Accuracy": round((pred_labels_chosen > pred_labels_rejected).mean(), 2)}
        
    
    return results

In [8]:
final_json = {}

for (data_name, val_df) in DATA_MAP.items():
    met = evaluate_df(val_df, model, tokenizer, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7)
    
    final_json[data_name] = met
    
    
final_json["metadata"] = {"lora_model": LORA_PATH, "Base_model": BASE_MODEL}
with open(f"./eval_results/{str(datetime.now())}_{TASK}_AUG-{WITH_AUG}_LEN-{MAX_LENGTH}.json", "w+") as f: json.dump(final_json, f)

'human_score'


100%|██████████| 143/143 [04:10<00:00,  1.75s/it]


'human_score'


100%|██████████| 143/143 [04:09<00:00,  1.75s/it]


'human_score'


100%|██████████| 46/46 [01:20<00:00,  1.74s/it]
100%|██████████| 286/286 [08:16<00:00,  1.74s/it]
100%|██████████| 286/286 [08:20<00:00,  1.75s/it]
100%|██████████| 286/286 [08:19<00:00,  1.75s/it]
100%|██████████| 371/371 [10:31<00:00,  1.70s/it]
100%|██████████| 371/371 [10:30<00:00,  1.70s/it]
100%|██████████| 32/32 [00:53<00:00,  1.69s/it]
100%|██████████| 32/32 [00:53<00:00,  1.69s/it]
100%|██████████| 205/205 [05:49<00:00,  1.70s/it]
100%|██████████| 205/205 [05:48<00:00,  1.70s/it]


## JudgeLM

In [17]:
j_pred_scores = {"prompt_1": [], "prompt_2": [], "prompt_1_ref": [], "prompt_2_ref": []}

for col_name in j_pred_scores.keys():
    tmp = judgelm.copy(deep = True)
    tmp["prompt"] = tmp[col_name]
    
    judgelm[f"{col_name}_pred_score"] = (np.array(evaluate_df(tmp, model, tokenizer, task = TASK, MAX_LENGTH = MAX_LENGTH, 
                                                       BATCH_SIZE = 7, return_scores = True)) *2).tolist()

'gpt4_score'


100%|██████████| 715/715 [20:02<00:00,  1.68s/it]


'gpt4_score'


100%|██████████| 715/715 [20:02<00:00,  1.68s/it]


'gpt4_score'


100%|██████████| 715/715 [20:08<00:00,  1.69s/it]


'gpt4_score'


100%|██████████| 715/715 [20:09<00:00,  1.69s/it]


In [20]:
def calculate_acc(row, score1_col, score2_col, gt_score_col):
    s1 = row[score1_col]
    s2 = row[score2_col]
    gt_s1, gt_s2 = row[gt_score_col]
    
    if (gt_s1 > gt_s2): 
        if s1 > s2: return True
    
    elif (gt_s1 < gt_s2): 
        if s1 < s2: return True
    
    else: 
        if round(s1, 0) == round(s2, 0): return True
    
    return False  

round(judgelm.apply(lambda row: calculate_acc(row, "prompt_1_pred_score",  "prompt_2_pred_score", "score"),axis = 1).mean(), 2)           

0.72

In [21]:
round(judgelm.apply(lambda row: calculate_acc(row, "prompt_1_ref_pred_score",  "prompt_2_ref_pred_score", "score_with_ref"), axis = 1).mean(), 2)

0.77

## PandaLM Test

In [12]:
tmp = panda.copy(deep = True)
tmp["prompt"] = tmp["prompt_1"]
score_1 = evaluate_df(tmp, model, tokenizer, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7, return_scores = True)

del tmp 

tmp = panda.copy(deep = True)
tmp["prompt"] = tmp["prompt_2"]
score_2 = evaluate_df(tmp, model, tokenizer, task = TASK, MAX_LENGTH = MAX_LENGTH, BATCH_SIZE = 7, return_scores = True)

'gpt4_score'


100%|██████████| 143/143 [04:00<00:00,  1.68s/it]


'gpt4_score'


100%|██████████| 143/143 [03:59<00:00,  1.67s/it]


In [16]:
panda["prompt_1_pred"] = score_1
panda["prompt_2_pred"] = score_2

def panda_acc(row):
    winner = int(row["winner"])
    
    ps1 = row["prompt_1_pred"]
    ps2 = row["prompt_2_pred"]
    
    if winner == 0: # both
        if round(ps1, 0) == round(ps2, 0): return True
    elif winner == 1:
        if ps1 > ps2: return True
    elif winner == 2:
        if ps1 < ps2: return True
    
    return False

round(panda.apply(panda_acc, axis = 1).mean(), 2)

0.68