In [1]:
import os
import jsonlines
import numpy as np
from utils import *
import pandas as pd
import re
from openai import OpenAI

In [2]:
def parse_statements(input_string):
    """
    Parse a structured multi-statement string into a dictionary.

    Each statement is expected to follow this layout::

        STATEMENT <id>:
        Criteria: <text>
        Supporting Evidence: <text>
        Score: <number>

    Args:
        input_string (str): Text containing zero or more statements in the layout above.

    Returns:
        dict: Maps ``"Statement<id>"`` to a dict with the keys ``"Criteria"`` (str),
        ``"Supporting Evidence"`` (str) and ``"Score"`` (float). Statements whose
        criteria text is shorter than 3 characters are skipped. If the same id
        occurs more than once, the last occurrence wins.
    """
    # The score group accepts exactly one decimal number ("1", "1.", "1.0", ".5").
    # A looser character class such as [\d.]+ would also swallow trailing
    # punctuation ("0.5.") and make float() raise for the whole input.
    pattern = (
        r"STATEMENT (\d+):\n"
        r"Criteria: (.*?)\n"
        r"Supporting Evidence: (.*?)\n"
        r"Score: (\d+(?:\.\d*)?|\.\d+)"
    )

    # DOTALL lets criteria / evidence text span several lines.
    matches = re.findall(pattern, input_string, re.DOTALL)

    statements_dict = {}
    for raw_id, raw_criteria, raw_evidence, raw_score in matches:
        criteria = raw_criteria.strip()
        # Drop statements with an empty or near-empty criteria text.
        if len(criteria) < 3:
            continue
        statements_dict[f'Statement{int(raw_id)}'] = {
            "Criteria": criteria,
            "Supporting Evidence": raw_evidence.strip(),
            "Score": float(raw_score),
        }

    return statements_dict

In [3]:
# Load the annotation sheet; missing cells are replaced with '' before parsing.
df = pd.read_csv('examples_to_annotate.csv').fillna('')
# No NaN should remain after fillna(''); kept as a safeguard.
df = df.replace({np.nan: None})
trulens_results = df['Trulens_gpt-4o_reasons'].values.tolist()

# Parse every reasons string. List position == row position in `df`, so the
# loop stops at the first failure instead of skipping it (skipping would shift
# every later index). After a failure the list is shorter than `df`.
parsed_trulens_results = []
for idx, input_string in enumerate(trulens_results):
    try:
        parsed_trulens_results.append(parse_statements(input_string))
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C / kernel interrupts
        # still propagate, and show what actually went wrong.
        print(f"Failed to parse row {idx}: {err!r}")
        print(idx)
        print(input_string)
        print(df.iloc[idx])
        break


In [4]:
# Spot check: show the raw reasons string stored at index 151.
trulens_results[151]

'STATEMENT 0:\nCriteria: Militants armed with guns and grenades attacked a hospital in Afghanistan, killing all four attackers after a two-hour standoff with commandos.\nSupporting Evidence: Militants armed with guns and grenades gained entry after one detonated explosives at a hospital gate and then opened fire on staff and patients. Commandos who landed on the Sardar Daud hospital roof killed all four attackers after several hours of fighting.\nScore: 0.6666666666666666\nSTATEMENT 1:\nCriteria: The attack, claimed by the Islamic State (IS) group, resulted in over 50 injuries.\nSupporting Evidence: The source states, "The so-called Islamic State (IS) group has claimed the attack. More than 50 people were also wounded, the defence ministry said."\nScore: 1.0\nSTATEMENT 2:\nCriteria: The hospital attack marks a shift in IS\'s approach, engaging directly with security forces in the capital for the first time.\nSupporting Evidence: The source states, "The hospital attack marks a change in

In [5]:
# Spot check: show the parsed statement dictionary stored at index 151.
parsed_trulens_results[151]

{'Statement0': {'Criteria': 'Militants armed with guns and grenades attacked a hospital in Afghanistan, killing all four attackers after a two-hour standoff with commandos.',
  'Supporting Evidence': 'Militants armed with guns and grenades gained entry after one detonated explosives at a hospital gate and then opened fire on staff and patients. Commandos who landed on the Sardar Daud hospital roof killed all four attackers after several hours of fighting.',
  'Score': 0.6666666666666666},
 'Statement1': {'Criteria': 'The attack, claimed by the Islamic State (IS) group, resulted in over 50 injuries.',
  'Supporting Evidence': 'The source states, "The so-called Islamic State (IS) group has claimed the attack. More than 50 people were also wounded, the defence ministry said."',
  'Score': 1.0},
 'Statement2': {'Criteria': "The hospital attack marks a shift in IS's approach, engaging directly with security forces in the capital for the first time.",
  'Supporting Evidence': 'The source sta

In [6]:
# Module-level OpenAI client used by call_gpt.
client = OpenAI()
# Model name passed to call_gpt when mapping claims to sentences.
MODEL='gpt-4o'

# Aggrefact prompt
# (earlier yes/no prompt pair, kept for reference; not used)
# system = """Determine whether the provided claim can be inferred from the given sentence, using the sentence's context as a reference. A claim is considered to belong to a sentence if it can be logically derived from the sentence while accounting for its contextual meaning. Note that the claim must correspond to a single sentence within the provided context."""
# user = "Contenxt: {summary}\nSentence: {summary_sent}\nClaim: {claim}\nAnswer (yes or no):"
# Active prompt pair: the model is asked to answer with the index of the sentence
# that supports the claim. `user` is filled via str.format with `claim` and
# `summary_sents`.
# NOTE(review): the system prompt refers to "a context", but the user template
# has no context placeholder - confirm this is intended.
system = """Given a context and a list of sentences (each with an associated index), determine which sentence supports the derivation of the provided claim from the given context. Your response should be the index of the relevant sentence."""
user = "Claim: {claim}\nSentence List: {summary_sents}\nAnswer (sentence index, an integer):"

def call_gpt(system_prompt, user_prompt, model='gpt-4', temperature=0):
    """Send one system/user message pair to the chat completions endpoint.

    Args:
        system_prompt (str): Content of the system message.
        user_prompt (str): Content of the user message.
        model (str): Model name forwarded to the API call.
        temperature (float): Sampling temperature forwarded to the API call.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    chat_messages = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=user_prompt),
    ]
    response = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=temperature,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [7]:
# from batch ID to the corresponding sample IDs
# from batch ID to the corresponding sample IDs
# Samples listed here are skipped when the batch annotation file is read
# (the IDs are converted to strings before being passed on).
skip_samples = {
    5: range(40, 50), 
    10: range(10,20),
    11: range(10),
    12: range(20, 30), 
    15: range(40, 50)
}

# From batch ID to the annotator names selected for that batch.
# The trailing comments list other annotator names that appear to have been
# considered for the batch - TODO confirm.
annotator_list = {
    7: ['yujia', 'manveer'],
    8: ['miaoran', 'chenyu'],
    10: ['erana', 'vivek', 'manveer'],
    11: ['rogger', 'matt'], #,'matt', , 'new', 'yujia'
    13: ['erana', 'miaoran'], #['erana', 'weisi', 'miaoran']
    16: ['miaoran', 'matt'] #['miaoran', 'yujia', 'matt', 'weisi', 'new']
}
# batch 16
# Per-pair numbers recorded for batch 16 (presumably agreement scores - TODO confirm):
# ['yujia', 'matt']: 0.299	0.267	0.209	0.294	0.244	0.340	0.330
# ['yujia', 'weisi']: -0.417	-0.376	-0.333	-0.384	-0.320	-0.391	-0.387
# ['matt', 'weisi']: -0.138	-0.117	-0.074	-0.104	-0.065	-0.147	-0.092
# Batch IDs in this list are skipped entirely when annotations are loaded.
exclude_batch = []
# exclude_batch = [11,13,16]

In [8]:
# Merge the sentence-level labels of every batch annotation file into one dict.
sent_level_labels = {}
result_path = 'batch_5_src_no_sports/results'
for batch_id in range(1, 17):
    if batch_id in exclude_batch:
        continue
    file_path = os.path.join(result_path, f"batch_{batch_id}_annotation.json")

    # Sample IDs to skip are handed to read_annotation as strings.
    if batch_id in skip_samples:
        skip_sample_ids = list(map(str, skip_samples[batch_id]))
        print (f"Skipping samples {skip_sample_ids}")
    else:
        skip_sample_ids = []

    # there is an unexpected "new" annotator in batch 7
    # NOTE(review): selected_annotators is computed here but is not passed to
    # read_annotation below - confirm whether annotator filtering is intended.
    selected_annotators = annotator_list.get(batch_id)

    # Only the fourth return value (sentence-level labels) is used.
    _, _, _, batch_sent_level_labels = read_annotation(
        file_path,
        summary_sent_file='summary_sent_list.jsonl',
        skip_sample_ids=skip_sample_ids,
    )
    sent_level_labels.update(batch_sent_level_labels)
    

Skipping samples ['40', '41', '42', '43', '44', '45', '46', '47', '48', '49']
Skipping samples ['10', '11', '12', '13', '14', '15', '16', '17', '18', '19']
Skipping samples ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Skipping samples ['20', '21', '22', '23', '24', '25', '26', '27', '28', '29']
Skipping samples ['40', '41', '42', '43', '44', '45', '46', '47', '48', '49']


In [None]:
# Map every parsed claim ("Criteria") to the summary sentence it came from by
# asking the model for a sentence index, then append one JSON line per meta_id.
output_path = 'processed_trulens_claim_level_preds.jsonl'

# Resume support: collect the meta_ids already written by a previous run.
existing_meta_ids = []
if os.path.exists(output_path):
    with open(output_path) as r:
        for p in jsonlines.Reader(r):
            existing_meta_ids.append(p['meta_id'])
print(existing_meta_ids)
# Set copy for O(1) membership tests; the list keeps file order for the printout.
processed_meta_ids = set(existing_meta_ids)

for meta_id in sent_level_labels:
    if meta_id in processed_meta_ids:
        continue
    print('='*30)
    print(meta_id)

    # meta_id is used as a positional index into the parsed results list.
    trulens_result = parsed_trulens_results[meta_id]

    # Sentence order defines the indices shown to the model, so build it once.
    sentences = list(sent_level_labels[meta_id].keys())
    sent_lst = [f"sentence idx: {idx}\n{sent}" for idx, sent in enumerate(sentences)]

    item = {
        'meta_id': meta_id,
        'results': {
            sent: {'labels': sent_labels, 'claims': [], 'claim_preds': []}
            for sent, sent_labels in sent_level_labels[meta_id].items()
        },
    }

    for subresult in trulens_result.values():
        print('-'*20)
        print(subresult['Criteria'])

        result = call_gpt(system, user.format(claim=subresult['Criteria'],summary_sents=sent_lst), model=MODEL)
        print(result)

        # The model is expected to answer with a bare integer index. Anything
        # else is skipped, but reported so that dropped claims are visible.
        try:
            sent_idx = int(result)
        except (TypeError, ValueError):
            print(f"Skipping claim: response is not an integer index: {result!r}")
            continue
        # Reject out-of-range answers explicitly. Without this check a negative
        # answer such as -1 would silently select the last sentence.
        if not 0 <= sent_idx < len(sentences):
            print(f"Skipping claim: sentence index {sent_idx} is out of range")
            continue

        sent = sentences[sent_idx]
        print(sent)
        item['results'][sent]['claims'].append(subresult['Criteria'])
        item['results'][sent]['claim_preds'].append(subresult['Score'])

    print(item)
    # Append mode creates the file when it does not exist yet, so results are
    # persisted after every meta_id and an interrupted run can be resumed.
    with jsonlines.open(output_path, mode='a') as writer:
        writer.write(item)

[15, 130, 245, 360, 475, 590, 705, 820, 965, 1050, 6, 121, 236, 351, 466, 581, 696, 811, 990, 1041, 112, 227, 342, 457, 572, 687, 802, 917, 929, 1147, 113, 228, 343, 458, 573, 688, 803, 918, 930, 1148, 114, 229, 344, 459, 574, 689, 804, 919, 931, 1149, 7, 122, 237, 352, 467, 582, 697, 812, 989, 1042, 9, 124, 239, 354, 469, 584, 699, 814, 968, 1044, 10, 125, 240, 355, 470, 585, 700, 815, 926, 1045, 11, 126, 241, 356, 471, 586, 701, 816, 986, 1046, 34, 149, 264, 379, 494, 609, 724, 839, 982, 1069, 4, 119, 234, 349, 464, 579, 694, 809, 974, 1039, 20, 135, 250, 365, 480, 595, 710, 825, 951, 1055, 40, 155, 270, 385, 500, 615, 730, 845, 954, 1075, 19, 134, 249, 364, 479, 594, 709, 824, 948, 1054, 18, 133, 248, 363, 478, 593, 708, 823, 940, 1053, 26, 141, 256, 371, 486, 601, 716, 831, 973, 1061, 2, 117, 232, 347, 462, 577, 692, 807, 927, 1037, 8, 123, 238, 353, 468, 583, 698, 813, 947, 1043, 30, 145, 260, 375, 490, 605, 720, 835, 972, 1065, 12, 127, 242, 357, 472, 587, 702, 817, 932, 1047, 29