In [5]:
import pandas as pd 
import os 
from pathlib import Path
import re
from glob import glob 
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score

os.environ["CUDA_VISIBLE_DEVICES"] = "1" #set if you want fast output for bertscore
selected_prompt = "zeroshot_no_self_consistency"
filter_for_prompt = True
skip_nli_bert = True

In [None]:
# read refact for input data 
refact_df = pd.read_json("final_metadata.jsonl", lines=True)
refact_df = refact_df[refact_df["vote_valid"] == True]

experiments = []
for path in glob("data/2025_submission/*/*.json"): 
    path_obj = Path(path)
    category = path_obj.parts[2]
    fname = path_obj.name


    experiment_name, model_name, prompt_type, self_consistency = fname.split("__")
    self_consistency = self_consistency.split(".")[0]
    experiment_name = experiment_name.replace("results_", "")

    prompt_type = prompt_type + "_" + self_consistency

    if prompt_type != selected_prompt and filter_for_prompt:
        continue
        
    df = pd.read_json(path)
    
    # merge with id 
    df = df.merge(refact_df[["id", "answer", "transformed_answer"]], on="id", how="left")
    # append 

    experiments.append({"category": category, 
                        "name": experiment_name, 
                        "model": model_name, 
                        "prompt_type": prompt_type,
                        "df": df})
    

# [(e["category"], e["name"], e["model"]) for e in experiments]

experiments = sorted(experiments, key=lambda e: (e["category"], e["name"], e["model"]))

experiment_names = set(e["name"] for e in experiments)
model_names = sorted(set(e["model"] for e in experiments))
prompt_types = sorted(set(e["prompt_type"] for e in experiments),key = lambda x: {"zeroshot":0,"fewshot":1,"zeroshotcot":2,"cot":3}[x.split("_")[0]])
print(experiment_names)
print(model_names)  
print(prompt_types)


{'blank_filling', 'valid_invalid', 'entity_finder', 'negated_sentence', 'answer_comparison'}
['Llama-3.2-1B-Instruct', 'Llama-3.2-3B-Instruct', 'Llama-3.3-70B-Instruct', 'gemma-3-12b-it', 'gemma-3-1b-it', 'gemma-3-27b-it', 'gemma-3-4b-it']
['zeroshot_no_self_consistency']


### BertScore

In [7]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(os.environ["CUDA_VISIBLE_DEVICES"])
print(device)

from evaluate import load
bertscore = load("bertscore")

def calculate_bert_score_for_string(sequence1,sequence2):
    if skip_nli_bert:
        return 0 
    # Tokenize input sequences and get token embeddings
    
    results = bertscore.compute(predictions=[sequence1], references=[sequence2], lang="en",device=device)
    
    return results["f1"][0]


def bert_score_for_lists(list1,list2):
    if skip_nli_bert:
        return 0 
    if  len(list1) != len(list2):
        return 0
    if len(list1)==0:
        return 0
    score = 0  
    try: 
        results = bertscore.compute(predictions=list1, references=list2, lang="en",device=device)
        score = results["f1"]
        score = sum(score)/len(list1)
    except: 
        return 0
    return score

1
cpu


### NLI 

In [8]:

class NLIClassifier:
    """
    Classify the relationship between two texts.
    Classes have to be provided when initializing the class.

    For example if two texts are contradicting, provide classes as ["entailment", "neutral", "contradiction"]
    """

    model = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"

    def __init__(
        self,
        classes = ["entailment", "neutral", "contradiction"]
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(self.model)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

        self.classifier = AutoModelForSequenceClassification.from_pretrained(self.model).to(self.device)
        self.classes = classes

    def infer(self, premise: str, hypothesis: str):
        input_ = self.tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(self.classifier.device)
        output = self.classifier(input_["input_ids"])
        prediction = torch.softmax(output["logits"][0], -1).tolist()

        return {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, self.classes)}

    def infer_batch(self, premises, hypothesises):
        input_ = self.tokenizer(premises, hypothesises, padding=True, truncation=True, return_tensors="pt").to(
            self.classifier.device
        )
        output = self.classifier(input_["input_ids"])
        predictions = torch.softmax(output["logits"], -1).tolist()
        return [
            {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, self.classes)}
            for prediction in predictions
        ]


nli_model =  NLIClassifier()


def nli_scores_for_lists(premises,hypotheses):
    if skip_nli_bert:
        return 0 
    if len(premises) != len(hypotheses):
        return 0
    if len(premises)==0:
        return 0
    scores = nli_model.infer_batch(premises, hypotheses)
    # filter out entailment
    
    #avg all 3 scores 

    avg_entailment = 0
    avg_neutral = 0
    avg_contradiction = 0

    for score in scores:
        avg_entailment += score["entailment"]
        avg_neutral += score["neutral"]
        avg_contradiction += score["contradiction"]
    avg_entailment /= len(scores)
    avg_neutral /= len(scores)
    avg_contradiction /= len(scores)

    return avg_entailment, avg_neutral, avg_contradiction

### IOU

In [9]:
def get_indices(text, extracts, used_preds=None):
    """Convert a list of string spans to a set of character indices in the text."""
    if not isinstance(text, str):
        print("failed for text", text)
        return set()
    
    if not isinstance(extracts, list):
        print("failed for text", text)
        return set()
    
    char_indices = set()
    for extract in extracts:
        if not isinstance(extract, str):
            print("failed for extract", extract)
            continue
        start = text.lower().find(extract.lower().strip())
        if start != -1:
            char_indices.update(range(start, start + len(extract)))
    return char_indices

def calculate_iou_char_level(text, gts, preds):
    pred_indices= get_indices(text, preds, used_preds=True)
    gt_indices = get_indices(text, gts, used_preds=False)
    
    intersection = pred_indices & gt_indices
    union = pred_indices | gt_indices
    
    return len(intersection) / len(union) if union else 0.0

text = "Barack Obama was born in Hawaii. He was elected president in 2008."
predicted = ["Barack Obama", "president"]
ground_truth = ["Barack Obama", "born in Hawaii", "president"]

iou = calculate_iou_char_level(text, ground_truth, predicted)
print(f"IoU: {iou:.2f}")

IoU: 0.60


### valid_invalid

In [10]:
def error_detection_extract(text: str):
    if not isinstance(text, str):
        return None
    #pattern = r"\**\s*Final\s+Verdict\s*\**:?\s*\**(True|False)\**"
    pattern = r"\**\s*Final\s+Verdict.*?(True|False)"
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).lower()
    
    #no final verdict found 
    #--> go for last bool 
    pattern = r"(True|False)"
    match = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return match[-1].lower()
    return None


def get_pred_answer(response): 
    regex_response = error_detection_extract(response)
    if regex_response is None: 
        #print(response)
        return "Postprocessing Error"
    return regex_response.split("Final Verdict:")[-1].strip()

valid_invalid_experiments = [e for e in experiments if e["name"] == "valid_invalid"]


f1_scores = {model_name: {prompt_type: 0 for prompt_type in prompt_types} for model_name in model_names}

for experiment in valid_invalid_experiments:
        model_name = experiment["model"]
        prompt_type = experiment["prompt_type"]
        df = experiment["df"]

        #if not (model_name == "Llama-3.2-1B-Instruct" and prompt_type == "fewshot_no_self_consistency"):
        #    continue

        df["verdict_original"] = df["output_original"].apply(get_pred_answer)
        df["verdict_transformed"] = df["output_transformed"].apply(get_pred_answer)

        #get number of postprocessing errors
        num_errors = df["verdict_original"].apply(lambda x: x == "Postprocessing Error").sum()
        num_errors += df["verdict_transformed"].apply(lambda x: x == "Postprocessing Error").sum()

        y_preds_correct_input = list(df["verdict_original"].apply(lambda x: True if x == "true" else (False if x == "false" else False)))
        y_preds_wrong_input = list(df["verdict_transformed"].apply(lambda x: True if x == "true" else (False if x == "false" else True)))
        
        # Filter out None values
        #y_preds_correct_input = [x for x in y_preds_correct_input if x is not None]
        #y_preds_wrong_input = [x for x in y_preds_wrong_input if x is not None]

        y_pred = np.concatenate((y_preds_correct_input, y_preds_wrong_input))
        y_true = np.array([True]*len(y_preds_correct_input)+[False]*len(y_preds_wrong_input))

        # Calculate metrics
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        false_f1 = f1_score(y_true, y_pred, pos_label=False)

        # Print results
        print(f"{model_name} + {prompt_type}: |  Sklearn Accuracy: {accuracy:.2f} | Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f} | False F1 Score: {false_f1:.2f} | Postprocessing Errors: {num_errors} |")
        f1_scores[model_name][prompt_type] =  (f1 + false_f1)/2

Llama-3.2-1B-Instruct + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.49 | Precision: 0.48 | Recall: 0.33 | F1 Score: 0.39 | False F1 Score: 0.56 | Postprocessing Errors: 154 |
Llama-3.2-3B-Instruct + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.52 | Precision: 0.54 | Recall: 0.25 | F1 Score: 0.34 | False F1 Score: 0.62 | Postprocessing Errors: 50 |
Llama-3.3-70B-Instruct + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.67 | Precision: 0.62 | Recall: 0.90 | F1 Score: 0.73 | False F1 Score: 0.57 | Postprocessing Errors: 0 |
gemma-3-12b-it + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.65 | Precision: 0.68 | Recall: 0.58 | F1 Score: 0.63 | False F1 Score: 0.68 | Postprocessing Errors: 0 |
gemma-3-1b-it + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.51 | Precision: 0.51 | Recall: 0.56 | F1 Score: 0.53 | False F1 Score: 0.48 | Postprocessing Errors: 0 |
gemma-3-27b-it + zeroshot_no_self_consistency: |  Sklearn Accuracy: 0.71 | Precision: 0.69 | Recall: 

In [11]:
#best prompt f1 score 
f1_df = pd.DataFrame(f1_scores).T
print(f1_df.median())

zeroshot_no_self_consistency    0.584289
dtype: float64


### answer_comparison

In [12]:
def answer_comparison_extract(text: str):
    if not isinstance(text, str):
        return None
    pattern = r"(?:answer\s*)?([ab]).*?correct"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    
    pattern = r"\**\s*Final\s+Verdict.*?(A|B)"
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    return None


def get_pred_answer(response): 
    regex_response = answer_comparison_extract(response)
    #print(regex_response)
    if regex_response is None: 
       #print(response)
        return "Postprocessing Error"
    return regex_response

def is_correct(row): 
    if row["pred"] == "A" and row["ground_truth"][0] == "Original": 
        return True
    elif row["pred"] == "B" and row["ground_truth"][1] == "Original": 
        return True
    else: 
        return False 

def get_ytrue(ground_truth): 
    if ground_truth[0] == "Original": 
        return "A"
    elif ground_truth[1] == "Original":
        return "B"
    else:
        return "C"
    
answer_comparison_experiments = [e for e in experiments if e["name"] == "answer_comparison"]

for experiment in answer_comparison_experiments:
    model_name = experiment["model"]
    prompt_type = experiment["prompt_type"]
    df = experiment["df"]

    df["pred"] = df["output"].apply(get_pred_answer)
    y_pred = list(df["pred"])
    y_true = list(df["ground_truth"].apply(lambda x: get_ytrue(x)))
    
    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro",zero_division=0)
    recall = recall_score(y_true, y_pred, average="macro",zero_division=0)
    f1 = f1_score(y_true, y_pred, average="macro",zero_division=0)
    false_f1 = f1_score(y_true, y_pred, average="macro",zero_division=0)

    # Print results
    print(f"{model_name} + {prompt_type}: | Postprocessing Errors: {num_errors} | Sklearn Accuracy: {accuracy:.2f} | Precision: {precision:.2f} | Recall: {recall:.2f} | F1 Score: {f1:.2f} | False F1 Score: {false_f1:.2f} | ")
    f1_scores[model_name][prompt_type] =  (f1 + false_f1) / 2

Llama-3.2-1B-Instruct + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.50 | Precision: 0.32 | Recall: 0.33 | F1 Score: 0.27 | False F1 Score: 0.27 | 
Llama-3.2-3B-Instruct + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.48 | Precision: 0.33 | Recall: 0.32 | F1 Score: 0.29 | False F1 Score: 0.29 | 
Llama-3.3-70B-Instruct + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.50 | Precision: 0.52 | Recall: 0.50 | F1 Score: 0.39 | False F1 Score: 0.39 | 
gemma-3-12b-it + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.71 | Precision: 0.48 | Recall: 0.48 | F1 Score: 0.48 | False F1 Score: 0.48 | 
gemma-3-1b-it + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.53 | Precision: 0.36 | Recall: 0.35 | F1 Score: 0.34 | False F1 Score: 0.34 | 
gemma-3-27b-it + zeroshot_no_self_consistency: | Postprocessing Errors: 0 | Sklearn Accuracy: 0.56 | Pre

### blank-filling

In [13]:
def get_pred_entities(response):

    pattern = r"Replacements?\s*:?"
    match = re.search(pattern, response, re.IGNORECASE | re.DOTALL)
    if match:
        response = response[match.end():]
    
    # enumeration 
    # find all preceeding numbers and remove them then split by new line
    pattern = r"(\d+)\s*\.+\s*"
    match = re.findall(pattern, response, re.IGNORECASE | re.DOTALL)

    if match:
        response = re.sub(pattern, "", response)

    # split by new line groups 
    return [s.strip() for s in response.split("\n") if s.strip()]
    

def get_accuracy(row):
    set1 = set(row["original_entities"])
    set2 = set(row["pred_entities"])
    intersection = set1.intersection(set2) 
    
    return  len(intersection) / len(set1)
    
blank_filling_experiments = [e for e in experiments if e["name"] == "blank_filling"]

for experiment in blank_filling_experiments:
    model_name = experiment["model"]
    prompt_type = experiment["prompt_type"]
    df = experiment["df"]
    df["pred_entities"] = df["response"].apply(lambda x: get_pred_entities(x))

    #print(df[["pred_entities", "original_entities"]].head(10))

    length_mismatches = df.apply(lambda x: len(x["pred_entities"]) != len(x["original_entities"]), axis=1).sum()
    df["is_correct"] = df.apply(lambda row: get_accuracy(row), axis=1)
    #bert_score
    df["bert_score"] = df.apply(lambda x: bert_score_for_lists(x["original_entities"],x["pred_entities"]),axis=1)
    bert_score_mean = df["bert_score"].mean() *100
    
    print(f"{model_name}: length_mismatches: {length_mismatches} | accuracy: {df.is_correct.mean()*100:.2f}%, bert_mean:{bert_score_mean}%,")
    

# for i in range(2, 5): 
#     pprint(experiments["blank_filling"]['Meta-Llama-3-70B-Instruct'].iloc[i].to_dict())
 

Llama-3.2-1B-Instruct: length_mismatches: 555 | accuracy: 0.56%, bert_mean:0.0%,
Llama-3.2-3B-Instruct: length_mismatches: 328 | accuracy: 1.68%, bert_mean:0.0%,
Llama-3.3-70B-Instruct: length_mismatches: 16 | accuracy: 15.70%, bert_mean:0.0%,
gemma-3-12b-it: length_mismatches: 16 | accuracy: 18.90%, bert_mean:0.0%,
gemma-3-1b-it: length_mismatches: 265 | accuracy: 0.00%, bert_mean:0.0%,
gemma-3-27b-it: length_mismatches: 16 | accuracy: 24.43%, bert_mean:0.0%,
gemma-3-4b-it: length_mismatches: 118 | accuracy: 9.74%, bert_mean:0.0%,


### entity-finder_new

In [14]:
def extract_entities(response):

    #print(response)

    pattern = r"(Wrong)?\s*(Entity|Entities)+\s*:"
    match = re.search(pattern, response, re.IGNORECASE | re.DOTALL)
    if match:
        response = response[match.end():].strip()
    elif response.find(":\n"): 
        response = response.split(":\n")[-1]
    

    # enumeration 
    # find all preceeding numbers and remove them then split by new line
    pattern = r"\d+\s*\.+\s*"
    match = re.findall(pattern, response, re.IGNORECASE | re.DOTALL)

    if match:
        response = re.sub(pattern, "", response)
    
    entities = response.split("\n")
    entities = list(filter(lambda var: var!="\n" and len(var)!=0 and var!=" " and var!="",entities))
    return entities

def evaluate_find_entities(data):
    #exact match
    value_counts = (data["suggested_entities"] == data["pred"]).value_counts()
    exact_match_accuracy =  value_counts[True] /len(data) *100
    
    #bert_score
    data["sentence_similarity"] = data.apply(lambda x: bert_score_for_lists(x["suggested_entities"],x["pred"]),axis=1)
    bert_score_mean =  data["sentence_similarity"].mean() *100
    
    return exact_match_accuracy, bert_score_mean
def get_accuracy(row):
    try:
        set1 = set(row["suggested_entities"])
        set2 = set(row["pred"])
        intersection = set1.intersection(set2) 

        return  len(intersection) / len(set1)
    except:
        return 0

entity_finder_experiments = [e for e in experiments if e["name"] == "entity_finder"]

for experiment in entity_finder_experiments:
    model_name = experiment["model"]
    prompt_type = experiment["prompt_type"]
    df = experiment["df"]

    df["pred"] = df["predicte_wrong_entities"].apply(lambda x: extract_entities(x))
    df["suggested_entities"] = df["suggested_entities"].apply(lambda x: x[0] if isinstance(x[0], list) else x)
    length_mismatches = df.apply(lambda x: len(x["pred"]) != len(x["suggested_entities"]), axis=1).sum()

    #exact_match_accuracy, bert_score_mean = evaluate_find_entities(df)
    exact_match_accuracy = df.apply(lambda row: get_accuracy(row), axis=1).mean()*100

    iou_mean = df.apply(lambda row: calculate_iou_char_level(row["transformed_answer"], row["suggested_entities"], row["pred"]), axis=1).mean() * 100
    #print(df.head(5)[["suggested_entities","pred"]])
    
    df["concatenated_suggested_entities"] = df["suggested_entities"].apply(lambda x: " ".join(x))
    df["concatenated_pred"] = df["pred"].apply(lambda x: " ".join(x))
    nli_scores = nli_scores_for_lists(df["concatenated_suggested_entities"].tolist(), df["concatenated_pred"].tolist())

    print(f"{model_name}: length mismatches {length_mismatches} accuracy {exact_match_accuracy:.2f}%, iou_mean: {iou_mean} bert_mean:{bert_score_mean}%, avg_nli_scores: {nli_scores} for num_samples {len(df)}")
    

Llama-3.2-1B-Instruct: length mismatches 519 accuracy 7.76%, iou_mean: 4.411544239838997 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
Llama-3.2-3B-Instruct: length mismatches 482 accuracy 4.23%, iou_mean: 7.806044707581488 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
Llama-3.3-70B-Instruct: length mismatches 467 accuracy 12.86%, iou_mean: 23.736299228877407 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
gemma-3-12b-it: length mismatches 442 accuracy 45.82%, iou_mean: 25.92091778797702 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
gemma-3-1b-it: length mismatches 223 accuracy 9.61%, iou_mean: 9.580185171765605 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
gemma-3-27b-it: length mismatches 390 accuracy 45.81%, iou_mean: 28.69548261722897 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556
gemma-3-4b-it: length mismatches 401 accuracy 37.86%, iou_mean: 24.676327344859256 bert_mean:0.0%, avg_nli_scores: 0 for num_samples 556


### negated_sentence_new

In [15]:
def extract_sentence(response):

    pattern = r"(Wrong)?\s*(Sentence)+\s*:?"
    match = re.search(pattern, response, re.IGNORECASE | re.DOTALL)
    if match:
        response = response[match.end():].strip()
    
    elif response.find(":\n"): 
        response = response.split(":\n")[-1].strip()

    sentence = response.replace("\n\n","").replace("\n","").strip()
    return sentence


def evaluate_find_sentence(data):
    
    data["predicted_sentence"] = data["predicted_sentence"].apply(lambda x: extract_sentence(x))
    
    #exact match
    value_counts = (data["transformed_sentence"] == data["predicted_sentence"]).value_counts()
    exact_match_accuracy =  value_counts.get(True, 0) /len(data) *100
    
    #bert_score
    data["sentence_similarity"] = data.apply(lambda x: calculate_bert_score_for_string(x["transformed_sentence"],x["predicted_sentence"]),axis=1)
    bert_score_mean = data["sentence_similarity"].mean() *100
    
    nli_scores = nli_scores_for_lists(data["transformed_sentence"].tolist(), data["predicted_sentence"].tolist())
    return exact_match_accuracy, bert_score_mean, nli_scores
 
negated_sentence_experiments = [e for e in experiments if e["name"] == "negated_sentence"]

for experiment in negated_sentence_experiments:
    model_name = experiment["model"]
    prompt_type = experiment["prompt_type"]
    df = experiment["df"]

    exact_match_accuracy, bert_score_mean, nli_scores = evaluate_find_sentence(df)
    iou_mean = df.apply(lambda row: calculate_iou_char_level(row["transformed_answer"], [row["transformed_sentence"]], [row["predicted_sentence"]]), axis=1).mean() * 100
    print(f"{model_name}: {exact_match_accuracy:.2f}%, iou_mean {iou_mean} bert_mean:{bert_score_mean}%, nli_scores: {nli_scores} for num_samples {len(df)}")

Llama-3.2-1B-Instruct: 0.00%, iou_mean 0.05459839844697889 bert_mean:0.0%, nli_scores: 0 for num_samples 634
Llama-3.2-3B-Instruct: 2.37%, iou_mean 1.9827478657612858 bert_mean:0.0%, nli_scores: 0 for num_samples 634
Llama-3.3-70B-Instruct: 69.24%, iou_mean 60.925416526719765 bert_mean:0.0%, nli_scores: 0 for num_samples 634
gemma-3-12b-it: 44.32%, iou_mean 44.51313047820609 bert_mean:0.0%, nli_scores: 0 for num_samples 634
gemma-3-1b-it: 13.25%, iou_mean 12.881552517880573 bert_mean:0.0%, nli_scores: 0 for num_samples 634
gemma-3-27b-it: 60.73%, iou_mean 56.38200495427698 bert_mean:0.0%, nli_scores: 0 for num_samples 634
gemma-3-4b-it: 36.28%, iou_mean 33.307192390593734 bert_mean:0.0%, nli_scores: 0 for num_samples 634


In [None]:
#baseline comparison 

#get metadata df 

metadata_path = "data/final_metadata.jsonl"
metadata_df = pd.read_json(metadata_path, lines=True)

#only_swaps 
metadata_df = metadata_df[metadata_df["tag_type"]=="swap"]
# Ensure intermediate_results exists and is subscriptable
metadata_df = metadata_df[metadata_df["intermediate_results"].apply(lambda x: isinstance(x, dict) and "new_entities" in x)]

#get metadat for entities
metadata_df["original_entities"] = metadata_df.apply(lambda row: row["intermediate_results"]["entities"],axis=1)
metadata_df["transformed_entities"] = metadata_df.apply(lambda row: row["intermediate_results"]["new_entities"],axis=1)

print(len(metadata_df))
metadata_df["sentence_similarity"] = metadata_df.apply(lambda x: bert_score_for_lists(x["original_entities"],x["transformed_entities"]),axis=1)
bert_score_mean = metadata_df["sentence_similarity"].mean() *100
print(f"Bert Score Mean of Original and Transformed Entities {bert_score_mean:.2f}")