In [None]:
import os
import sys
import json

from modelendpoints import query
import openai
import pandas as pd
from tqdm import tqdm
from scipy.stats import mode

In [None]:
# Load the table of decomposed claims that the rest of the notebook scores.
decompose_df = pd.read_parquet("INPUT_FILE")
print(decompose_df.shape)
# head() must be the cell's final expression: a bare expression in the middle
# of a cell is evaluated and discarded, so the preview would never render.
decompose_df.head()

In [None]:
# Prompt template for the LLM judge. It is filled with str.format() and has
# exactly two placeholders:
#   {claim_list}   - the existing claims, one per line
#   {atomic_claim} - the single new claim being judged
# The model is asked to answer with a single digit: 0 (no exact overlap with
# the majority of the list) or 1 (clearly shares information with the majority).
# NOTE(review): the wording "evaluate to whether" and the "Example:" heading in
# front of the real inputs look unintentional; they are left untouched here
# because editing the prompt text changes what the model sees — confirm before fixing.
cluster_missing_prompt_llm_judge="""
You will be given a list of claims about a topic.

You will be given an additional claim about the same topic. The new claim may be semantically similar to the claims in the list. 

Your task is to evaluate to whether this additional claim belongs in the list or not based on conveying some piece of information that is exactly in common with the majority of claims in the list. 

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

0: The New Claim may be related or very semantically similar to the claims in the Claim List, but doesn’t clearly convey a piece of information EXACTLY in common with the majority of sentences in the Claim List 
1: The New Claim CLEARLY conveys some information EXACTLY in common with the MAJORITY of the sentences in the Claim List. Be strict about checking that the New Claim has information which is entailed by those sentences.

Evaluation Steps:

1. Read the Claim List carefully.
2. Read the New Claim carefully. 
3. Assign a score for the New Claim based on the Evaluation Criteria and the notes provided above. Please only respond with a single digit indicating the score.

Example:

Claim List: 

{claim_list}

New Claim: 

{atomic_claim}

Evaluation Form (score ONLY): -"""


# Build one chat request per (row, missing claim) pair.
#
# KEYS_TO_MESSAGES maps a 1-based string key ("1", "2", ...) to the message list
# for that request. Keys are handed out in row order, then in claim order within
# a row; the parsing cell walks the frame in the same order to match responses
# back to claims, so this ordering must not change.
KEYS_TO_MESSAGES = {}
KEYS_COUNTER = 1      # next key to hand out (1-based)
NUM_DECISIONS = 8     # completions requested per prompt (passed as n= to the batch call)
ROLE = "system"       # the whole prompt is sent as a single message with this role

l = decompose_df.shape[0]
for i in range(l):
    row = decompose_df.iloc[i]
    # The claims the new claim is compared against, one stripped claim per line.
    claim_list = "\n".join(claim.strip() for claim in row["Cluster Text (For Missing)"])
    for claim in row["Missing Text"]:
        prompt_updated = cluster_missing_prompt_llm_judge.format(
            claim_list=claim_list,
            atomic_claim=claim.strip(),
        )
        KEYS_TO_MESSAGES[str(KEYS_COUNTER)] = [{"role": ROLE, "content": prompt_updated}]
        # The original line read `KEYS_COUNTER +=1)`; the stray ")" was a
        # SyntaxError that prevented the entire cell from running.
        KEYS_COUNTER += 1
# Number of prompts, and total completions that will be requested.
print(len(KEYS_TO_MESSAGES),len(KEYS_TO_MESSAGES)*NUM_DECISIONS)


# Submit every prompt to the batch endpoint and persist the raw responses.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
# Never print the key: notebook outputs are saved with the file and would leak
# the secret. Fail fast with a clear message instead of sending unauthenticated
# requests.
if not OPENAI_KEY:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")
client = openai.OpenAI(api_key=OPENAI_KEY)

# n=NUM_DECISIONS asks for several completions per prompt; the parsing cell
# takes a majority vote over them.
decompose_output=query.openai_batch(client,keys_to_messages=KEYS_TO_MESSAGES,model="gpt-5",
                                reasoning_effort='minimal',
                                temperature=1,
                                top_p=1,
                                frequency_penalty=0,
                                presence_penalty=0,
                                stop=None,
                                n=NUM_DECISIONS
                                )

# Requests sent vs. responses received.
print(len(KEYS_TO_MESSAGES),len(decompose_output))
# Sanity check: the returned keys must be the contiguous range "1".."len(output)".
# This does not require a response for every request; the parsing cell tolerates
# missing keys.
for i in range(len(decompose_output)):
    assert str(i+1) in decompose_output, f"missing response for request key {i+1}"

# Keep the raw responses on disk so the parsing step can be re-run offline.
with open("OUTPUT_FILE","w") as f:
    json.dump(decompose_output,f,indent=True)

In [None]:
# Turn the raw judge responses into one majority-vote rating per missing claim
# and store them, per row, in the "GEVAL_GPT5_8" column.
#
# Responses are matched to claims purely by order: request keys were assigned
# "1", "2", ... while walking rows and then each row's "Missing Text", so the
# same walk is repeated here with a fresh counter.
KEYS_COUNTER = 1
l = decompose_df.shape[0]
ill_formatted = 0        # individual completions that were not a bare integer
ill_formatted_rows = 0   # rows for which no claim produced a usable rating
COUNT_ZERO=0
COUNT_ONE=0
geval_ratings = []       # one entry per row, in positional order

for i in range(l):
    row = decompose_df.iloc[i]
    claim_rating_row = []
    for _ in row["Missing Text"]:
        response=decompose_output.get(str(KEYS_COUNTER),{}).get('text',[])
        KEYS_COUNTER +=1
        claim_rating_llm=[]
        for raw_vote in response:
            raw_vote = raw_vote.strip()
            try:
                claim_rating_llm.append(int(raw_vote))
            except ValueError:
                # The model answered with something other than a bare integer.
                ill_formatted +=1
        if claim_rating_llm:
            # Majority vote over the parsed completions.
            # NOTE(review): on a tie scipy.stats.mode is expected to return the
            # smallest value, i.e. an even split resolves to 0 — confirm for the
            # installed SciPy version.
            cl = int(mode(claim_rating_llm).mode)
            if cl:
                COUNT_ONE+=1
            else:
                COUNT_ZERO+=1
            claim_rating_row.append(cl)
        # NOTE(review): a claim with no parseable vote is skipped rather than
        # given a placeholder, so a row's rating list can be shorter than its
        # "Missing Text" and positions may no longer line up — confirm that
        # downstream consumers expect this.
    if claim_rating_row:
        geval_ratings.append(claim_rating_row)
    else:
        ill_formatted_rows+=1
        geval_ratings.append(-1)   # sentinel: no usable rating for this row

# Rows were read positionally (iloc), so the results are attached positionally
# too by aligning them on the frame's own index. Writing with .at[i, ...] would
# treat i as an index *label* and hit the wrong row (or raise) whenever the
# frame does not carry a default 0..n-1 index.
decompose_df["GEVAL_GPT5_8"] = pd.Series(geval_ratings, index=decompose_df.index, dtype=object)

print("ILL FORMATTED OUTPUT",ill_formatted)
print("ILL FORMATTED ROWS",ill_formatted_rows)
print("0: ", COUNT_ZERO, "1: ", COUNT_ONE)

decompose_df.to_parquet("LLM_judge_OUTPUT_FILE")
# Last expression of the cell so the preview is actually rendered.
decompose_df.head()