In [13]:
# use spacy to split sentences
import spacy

# Load the `en_core_web_sm` spaCy pipeline once at module level;
# sentence_splitter() reuses this object on every call.
nlp = spacy.load("en_core_web_sm")

# Prefixes for the two halves of each record's "content" string built by
# construct_nli_dataset_paragraphs().
LABEL_1 = "Passage"
LABEL_2 = "Claim"

def sentence_splitter(text):
    """Segment ``text`` into sentences using the module-level spaCy pipeline.

    Args:
        text: The string to segment.

    Returns:
        A list holding the text of each sentence span, in document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text)
    return sentences


def _sentence_pair_record(first, second, sample_id, intervention, label):
    """Build one sentence-level record pairing ``first`` with ``second``."""
    return {
        "content": f"Sentence 1: {first} \n\n Sentence 2: {second}",
        "sample": sample_id,
        "intervention": intervention,
        "label": label,
    }


def _unique_sentence_pairs(first_sentences, second_sentences, skip_identical):
    """Yield (first, second) pairs, emitting each unordered pair only once."""
    # A set gives constant-time membership checks; the pair bookkeeping is
    # otherwise the same as appending both orientations to a list.
    seen = set()
    for first in first_sentences:
        for second in second_sentences:
            if skip_identical and first == second:
                continue
            if (first, second) in seen:
                continue
            seen.add((first, second))
            seen.add((second, first))
            yield first, second


def construct_nli_dataset(sample, intervention, max_ground_truth=4):
    """Build sentence-level NLI records for one sample.

    Both passages (``sample['subject_prompt']`` and ``sample['coupled_prompt']``)
    are stripped, have newlines replaced by spaces and are split into sentences
    with ``sentence_splitter``. Records are then emitted in this order:

    1. ``new_fact_and_main_passage``: the new fact vs. each main-passage sentence.
    2. ``new_fact_and_related_passage``: the new fact vs. each related-passage
       sentence.
    3. ``ground_truth_and_main_passage``: each subject ground-truth fact vs. each
       main-passage sentence.
    4. ``ground_truth_and_related_passage``: each related-entity ground-truth
       fact vs. each related-passage sentence.
    5. ``main_passage_consistency``: every unordered pair of distinct
       main-passage sentences.
    6. ``related_passage_consistency``: every unordered pair of distinct
       related-passage sentences.
    7. ``main_passage_and_related_passage_consistency``: every main sentence vs.
       every related sentence, each unordered pair once.

    Args:
        sample: Mapping with the keys read below (``dependancies``,
            ``requested_rewrite``, ``subject_prompt``, ``coupled_prompt``).
        intervention: Value stored unchanged in each record's ``intervention``
            field.
        max_ground_truth: Maximum number of ground-truth facts kept per entity
            (default 4, the previously hard-coded cap). ``None`` keeps all.

    Returns:
        A list of dicts with keys ``content``, ``sample``, ``intervention`` and
        ``label``.
    """
    subject_ground_truth = sample['dependancies']['subject_entity']['ground_truth']
    subject_ground_truth_string = [
        f"{sample['requested_rewrite']['subject']} {key} {', '.join(value)}"
        for key, value in subject_ground_truth.items()
    ][:max_ground_truth]
    # Only the first coupled entity is used.
    related_entity_ground_truth = sample['dependancies']['coupled_entities'][0]['ground_truth']
    # NOTE(review): these facts are prefixed with the *edit subject*, not with the
    # coupled entity they were read from -- confirm this is intended.
    related_entity_ground_truth_string = [
        f"{sample['requested_rewrite']['subject']} {key} {', '.join(value)}"
        for key, value in related_entity_ground_truth.items()
    ][:max_ground_truth]

    new_fact = sample["requested_rewrite"]["prompt"].format(
            sample["requested_rewrite"]['subject']
        ) + " " + sample["requested_rewrite"]['target_new']['str']
    passage_of_text_about_subject_of_edit = sample['subject_prompt'].strip().replace('\n', ' ')
    passage_of_text_about_related_entity = sample['coupled_prompt'].strip().replace('\n', ' ')
    main_text_segmented = sentence_splitter(passage_of_text_about_subject_of_edit)
    related_text_segmented = sentence_splitter(passage_of_text_about_related_entity)

    # Computed once per sample instead of once per record.
    # Assumes get_sample_id() returns the same value on every call -- TODO confirm.
    sample_id = get_sample_id(sample)

    sample_dataset_records = []

    # New fact against every sentence of each passage.
    sample_dataset_records.extend(
        _sentence_pair_record(new_fact, sent, sample_id, intervention, "new_fact_and_main_passage")
        for sent in main_text_segmented
    )
    sample_dataset_records.extend(
        _sentence_pair_record(new_fact, sent, sample_id, intervention, "new_fact_and_related_passage")
        for sent in related_text_segmented
    )

    # Ground-truth facts against every sentence (sentence is the outer loop).
    sample_dataset_records.extend(
        _sentence_pair_record(ground_truth, sent, sample_id, intervention, "ground_truth_and_main_passage")
        for sent in main_text_segmented
        for ground_truth in subject_ground_truth_string
    )
    sample_dataset_records.extend(
        _sentence_pair_record(ground_truth, sent, sample_id, intervention, "ground_truth_and_related_passage")
        for sent in related_text_segmented
        for ground_truth in related_entity_ground_truth_string
    )

    # Sentence-vs-sentence consistency within and across the passages.
    sample_dataset_records.extend(
        _sentence_pair_record(sent_1, sent_2, sample_id, intervention, "main_passage_consistency")
        for sent_1, sent_2 in _unique_sentence_pairs(
            main_text_segmented, main_text_segmented, skip_identical=True
        )
    )
    sample_dataset_records.extend(
        _sentence_pair_record(sent_1, sent_2, sample_id, intervention, "related_passage_consistency")
        for sent_1, sent_2 in _unique_sentence_pairs(
            related_text_segmented, related_text_segmented, skip_identical=True
        )
    )
    # Across passages an identical sentence on both sides is still paired.
    sample_dataset_records.extend(
        _sentence_pair_record(
            sent_1, sent_2, sample_id, intervention,
            "main_passage_and_related_passage_consistency",
        )
        for sent_1, sent_2 in _unique_sentence_pairs(
            main_text_segmented, related_text_segmented, skip_identical=False
        )
    )

    return sample_dataset_records



def construct_nli_dataset_paragraphs(sample, intervention):
    """Build paragraph-level passage/claim records for one sample.

    Each record pairs a whole passage (the subject passage or the related-entity
    passage, stripped and with newlines replaced by spaces) with a single claim:
    the new fact, the old fact, or one ground-truth fact.

    Args:
        sample: Mapping with the keys read below (``dependancies``,
            ``requested_rewrite``, ``subject_prompt``, ``coupled_prompt``).
        intervention: Value stored unchanged in each record's ``intervention``
            field.

    Returns:
        A list of dicts with keys ``content``, ``sample``, ``intervention`` and
        ``label``, ordered: new fact (main, related), old fact (main, related),
        subject ground-truth facts, related-entity ground-truth facts.
    """
    def as_claims(facts):
        # One "<subject> <key> <comma-joined values>" string per entry.
        return [
            f"{sample['requested_rewrite']['subject']} {relation} {', '.join(objects)}"
            for relation, objects in facts.items()
        ]

    main_claims = as_claims(sample['dependancies']['subject_entity']['ground_truth'])
    # Only the first coupled entity is used.
    related_claims = as_claims(sample['dependancies']['coupled_entities'][0]['ground_truth'])

    rewrite = sample["requested_rewrite"]
    # Both facts share the same filled-in prompt and differ only in the target.
    filled_prompt = rewrite["prompt"].format(rewrite['subject'])
    new_fact = filled_prompt + " " + rewrite['target_new']['str']
    old_fact = filled_prompt + " " + rewrite['target_true']['str']

    main_passage = sample['subject_prompt'].strip().replace('\n', ' ')
    related_passage = sample['coupled_prompt'].strip().replace('\n', ' ')

    sample_id = get_sample_id(sample)

    def make_record(passage, claim, label):
        return {
            "content": f"{LABEL_1}: {passage} \n\n {LABEL_2}: {claim}",
            "sample": sample_id,
            "intervention": intervention,
            "label": label,
        }

    records = [
        make_record(main_passage, new_fact, "new_fact_and_main_passage"),
        make_record(related_passage, new_fact, "new_fact_and_related_passage"),
        make_record(main_passage, old_fact, "old_fact_and_main_passage"),
        make_record(related_passage, old_fact, "old_fact_and_related_passage"),
    ]
    records += [
        make_record(main_passage, claim, "ground_truth_and_main_passage")
        for claim in main_claims
    ]
    records += [
        make_record(related_passage, claim, "ground_truth_and_related_passage")
        for claim in related_claims
    ]
    return records

In [14]:
import os
import json
import sys

import numpy as np

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.utils import get_sample_id
from src.construct_samples import (
    NEW_FACT_TEMPLATE,
    RELATED_ENTITY_TEMPLATE,
    MAIN_PASSAGE_TEMPLATE_WITHOUT,
    OLD_FACTS_SUBJECT_TEMPLATE,
    RELATED_PASSAGE_TEMPLATE_WITHOUT,
    OLD_FACTS_RELATED_TEMPLATE,
    get_sample_text
)

from src.prompts import (
    INSTRUCTION_PROMPT,
    SURVEY_EXAMPLES,
    SURVEY_ITEMS
)

# Instruction prompt followed by every value of SURVEY_EXAMPLES, concatenated
# in the mapping's iteration order.
# NOTE(review): neither survey_header nor survey_footer is referenced in the
# cells visible here -- confirm they are still needed.
survey_header = INSTRUCTION_PROMPT + "".join(SURVEY_EXAMPLES.values())
# Every value of SURVEY_ITEMS concatenated in the mapping's iteration order.
survey_footer = "".join(SURVEY_ITEMS.values())



def get_json_files(path):
    """Recursively load every ``.json`` file found under ``path``.

    Args:
        path: Directory to search; sub-directories are walked as well.

    Returns:
        A list with the parsed content of each file whose name ends in
        ``.json``. Directories and file names are visited in sorted order so
        the result (and anything built from it, such as DataFrame row order)
        does not depend on the filesystem's listing order. A missing directory
        yields an empty list because ``os.walk`` ignores listing errors by
        default.

    Raises:
        json.JSONDecodeError: If a matching file does not contain valid JSON.
    """
    samples = []
    for root, dirs, files in os.walk(path):
        # Sorting in place steers os.walk's top-down traversal, which otherwise
        # follows whatever order the filesystem reports.
        dirs.sort()
        for file in sorted(files):
            if file.endswith(".json"):
                # Read as UTF-8 explicitly rather than relying on the locale's
                # default encoding.
                with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                    samples.append(json.load(f))
    return samples

# Directories holding the survey sample JSON files, one per intervention.
rome_edit_dir = '../data/survey_samples/rome'
no_edit_dir = '../data/survey_samples/no_edit'
human_dir = '../data/survey_samples/human'

# Parsed JSON samples for each directory, loaded in the order listed above.
rome_edit_files, no_edit_files, human_files = (
    get_json_files(directory)
    for directory in (rome_edit_dir, no_edit_dir, human_dir)
)

# Paragraph-level records for every sample, grouped rome -> no_edit -> human.
samples = []
for intervention, loaded_samples in (
    ('rome', rome_edit_files),
    ('no_edit', no_edit_files),
    ('human', human_files),
):
    for file in loaded_samples:
        samples.extend(construct_nli_dataset_paragraphs(file, intervention))




In [15]:
import pandas as pd
# One row per record dict; the columns come from the record keys
# (content, sample, intervention, label).
test_df = pd.DataFrame(samples)

In [19]:
# Display the assembled records (the rendered output is truncated with "...").
test_df 

Unnamed: 0,content,sample,intervention,label
0,Passage: ASEAN is an intergovernmental organiz...,ad41a34db7e3975d6b83d2ddedb19d9f,rome,new_fact_and_main_passage
1,"Passage: Philippines, an island nation located...",ad41a34db7e3975d6b83d2ddedb19d9f,rome,new_fact_and_related_passage
2,Passage: ASEAN is an intergovernmental organiz...,ad41a34db7e3975d6b83d2ddedb19d9f,rome,old_fact_and_main_passage
3,"Passage: Philippines, an island nation located...",ad41a34db7e3975d6b83d2ddedb19d9f,rome,old_fact_and_related_passage
4,Passage: ASEAN is an intergovernmental organiz...,ad41a34db7e3975d6b83d2ddedb19d9f,rome,ground_truth_and_main_passage
...,...,...,...,...
721,Passage: Revealing is a jazz album by Ornette ...,857c595296caaeab532251cf9d8f3979,human,old_fact_and_related_passage
722,Passage: Harmolodics is a musical philosophy t...,857c595296caaeab532251cf9d8f3979,human,ground_truth_and_main_passage
723,Passage: Harmolodics is a musical philosophy t...,857c595296caaeab532251cf9d8f3979,human,ground_truth_and_main_passage
724,Passage: Harmolodics is a musical philosophy t...,857c595296caaeab532251cf9d8f3979,human,ground_truth_and_main_passage


135.0

In [26]:

# Sample IDs left out of the annotation set.
# NOTE(review): why these three are excluded is not recorded here -- TODO document.
# Stored under its own name so the `samples` record list that `test_df` is built
# from is not overwritten; otherwise re-running the DataFrame cell would build a
# frame from these ID strings instead of the records.
excluded_sample_ids = [
    'ad41a34db7e3975d6b83d2ddedb19d9f',
    '857c595296caaeab532251cf9d8f3979',
    'a41ba08ffb8af6eb5ecf70c7a52a6289'
]
# Preview the rows that remain after dropping the excluded samples.
test_df.loc[
    ~test_df['sample'].isin(excluded_sample_ids)
]

Unnamed: 0,content,sample,intervention,label
62,Passage: Adolph Kolping (1879-1945) was a Germ...,ed7e1c4359063598dbeb73d4da2869c1,rome,new_fact_and_main_passage
63,Passage: Adolph Kolping Memoria is located in ...,ed7e1c4359063598dbeb73d4da2869c1,rome,new_fact_and_related_passage
64,Passage: Adolph Kolping (1879-1945) was a Germ...,ed7e1c4359063598dbeb73d4da2869c1,rome,old_fact_and_main_passage
65,Passage: Adolph Kolping Memoria is located in ...,ed7e1c4359063598dbeb73d4da2869c1,rome,old_fact_and_related_passage
66,Passage: Adolph Kolping (1879-1945) was a Germ...,ed7e1c4359063598dbeb73d4da2869c1,rome,ground_truth_and_main_passage
...,...,...,...,...
690,Passage: Adolph Kolping was a Catholic priest ...,ed7e1c4359063598dbeb73d4da2869c1,human,ground_truth_and_main_passage
691,Passage: The Adolph Kolping Memoria is a memor...,ed7e1c4359063598dbeb73d4da2869c1,human,ground_truth_and_related_passage
692,Passage: The Adolph Kolping Memoria is a memor...,ed7e1c4359063598dbeb73d4da2869c1,human,ground_truth_and_related_passage
693,Passage: The Adolph Kolping Memoria is a memor...,ed7e1c4359063598dbeb73d4da2869c1,human,ground_truth_and_related_passage


In [27]:
# Sample IDs left out of the exported annotation file.
# NOTE(review): why these three are excluded is not recorded here -- TODO document.
# Stored under its own name so the `samples` record list that `test_df` is built
# from is not overwritten; otherwise re-running the DataFrame cell would build a
# frame from these ID strings instead of the records.
excluded_sample_ids = [
    'ad41a34db7e3975d6b83d2ddedb19d9f',
    '857c595296caaeab532251cf9d8f3979',
    'a41ba08ffb8af6eb5ecf70c7a52a6289'
]
# Write the remaining rows to CSV without the DataFrame index column.
test_df.loc[
    ~test_df['sample'].isin(excluded_sample_ids)
].to_csv('../data/annotation_data/edit_consistency_paragraph_level_final_annotations.csv', index=False)

In [28]:
# Sample IDs left out of the count below.
# NOTE(review): why these three are excluded is not recorded here -- TODO document.
# Stored under its own name so the `samples` record list that `test_df` is built
# from is not overwritten; otherwise re-running the DataFrame cell would build a
# frame from these ID strings instead of the records.
excluded_sample_ids = [
    'ad41a34db7e3975d6b83d2ddedb19d9f',
    '857c595296caaeab532251cf9d8f3979',
    'a41ba08ffb8af6eb5ecf70c7a52a6289'
]
# Number of remaining rows, halved with integer division.
# NOTE(review): the reason for dividing by 2 is not stated here -- TODO document.
len(test_df.loc[
    ~test_df['sample'].isin(excluded_sample_ids)
]) // 2

270