# In [46]:
import json
import scipy.stats as stats
import numpy as np
import pandas as pd
import sys
sys.path.append('../')
from src.construct_samples import MAIN_PASSAGE_TEMPLATE, NEW_FACT_TEMPLATE, OLD_FACTS_RELATED_TEMPLATE, OLD_FACTS_SUBJECT_TEMPLATE, RELATED_PASSAGE_TEMPLATE, get_sample_text
from src.utils import get_sample_id

import os
def get_samples_from_dir(dir_path):
    """Load every JSON file found directly inside ``dir_path``.

    Files are read in sorted file-name order so the returned list is
    reproducible; ``os.listdir`` alone returns entries in arbitrary order.
    Entries that are not regular files (for example the ``.ipynb_checkpoints``
    folder Jupyter may create) are skipped instead of crashing ``open``.

    Args:
        dir_path: Path of a directory holding one JSON document per file.

    Returns:
        A list with the parsed content of each file, ordered by file name.

    Raises:
        FileNotFoundError: If ``dir_path`` does not exist.
        json.JSONDecodeError: If one of the files is not valid JSON.
    """
    samples = []
    for file_name in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be parsed as JSON documents.
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            samples.append(json.load(f))
    return samples

# Load the data.
# Each entry is the name of one results file to read; the first
# underscore-separated token ('gptj' / 'llama2') is later used as the base
# model name.
methods = [
    'gptj_ft_edit',
    'gptj_ike_edit',
    'gptj_rome_edit',
    'gptj_no_edit',
    'llama2_chat_ft_edit',
    'llama2_chat_ike_edit',
    'llama2_chat_rome_edit',
    'llama2_chat_no_edit'
]

# Suffix of the results files to read (presumably the model that produced the
# survey ratings -- TODO confirm).
model = 'gpt-3.5-turbo-0613'

# method name -> parsed content of that method's results file.
# Each file is opened in a `with` block so its handle is closed right away
# instead of being left for the garbage collector.
analysis_data = {}
for method_name in methods:
    with open(f'../results/broken_out_survey_{method_name}_{model}.json') as f:
        analysis_data[method_name] = json.load(f)

with open('../data/counterfact_with_dependancies_sample.json') as f:
    samples = json.load(f)

# Directory holding the generated samples of each method. Directory names
# follow the pattern
# model_<model>_no_edit_<flag>_use_sampling_True_token_length_1024_method_<METHOD>;
# the two "no_edit" entries read the no_edit_TRUE directories of the ROME runs.
_GENERATED_SAMPLE_DIRS = {
    "gptj_ft_edit": "../data/generated_samples/model_gptj_no_edit_False_use_sampling_True_token_length_1024_method_FT",
    "gptj_rome_edit": "../data/generated_samples/model_gptj_no_edit_False_use_sampling_True_token_length_1024_method_ROME",
    "gptj_ike_edit": "../data/generated_samples/model_gptj_no_edit_False_use_sampling_True_token_length_1024_method_IKE",
    "gptj_no_edit": "../data/generated_samples/model_gptj_no_edit_TRUE_use_sampling_True_token_length_1024_method_ROME",
    "llama2_chat_ft_edit": "../data/generated_samples/model_llama2-chat_no_edit_False_use_sampling_True_token_length_1024_method_FT",
    "llama2_chat_rome_edit": "../data/generated_samples/model_llama2-chat_no_edit_False_use_sampling_True_token_length_1024_method_ROME",
    "llama2_chat_ike_edit": "../data/generated_samples/model_llama2-chat_no_edit_False_use_sampling_True_token_length_1024_method_IKE",
    "llama2_chat_no_edit": "../data/generated_samples/model_llama2-chat_no_edit_TRUE_use_sampling_True_token_length_1024_method_ROME",
}

# method name -> {sample id -> sample}, built by loading every file of the
# method's directory and keying each loaded sample by get_sample_id.
sample_id_map = {
    method_name: {
        get_sample_id(generated): generated
        for generated in get_samples_from_dir(samples_dir)
    }
    for method_name, samples_dir in _GENERATED_SAMPLE_DIRS.items()
}

# Survey item label -> templates handed to get_sample_text (as
# `templates_to_use`) when rendering the text shown for that item.
SURVEY_ITEM_TO_SAMPLES_TEMPLATE = {
    # Items comparing the new fact with a passage.
    "new_fact_main_passage": [NEW_FACT_TEMPLATE, MAIN_PASSAGE_TEMPLATE],
    "new_fact_related_passage": [NEW_FACT_TEMPLATE, RELATED_PASSAGE_TEMPLATE],
    # Items comparing a passage with the old facts.
    "main_passage_old_facts": [MAIN_PASSAGE_TEMPLATE, OLD_FACTS_SUBJECT_TEMPLATE],
    "related_passage_old_facts": [RELATED_PASSAGE_TEMPLATE, OLD_FACTS_RELATED_TEMPLATE],
    # Items looking at a single passage on its own.
    "main_passage_consistency": [MAIN_PASSAGE_TEMPLATE],
    "related_passage_consistency": [RELATED_PASSAGE_TEMPLATE],
    # Items looking at both passages together.
    "cross_passage_consistency": [MAIN_PASSAGE_TEMPLATE, RELATED_PASSAGE_TEMPLATE],
    "topicality": [MAIN_PASSAGE_TEMPLATE, RELATED_PASSAGE_TEMPLATE],
    "fluency": [MAIN_PASSAGE_TEMPLATE, RELATED_PASSAGE_TEMPLATE],
}
# Raw human survey responses; the columns read below are 'sample_id',
# 'method', 'question' and 'response'.
human_df = pd.read_csv('../results/human_survey_responses.csv')
human_df['split'] = 'human'

# Directory of the samples shown to human raters, per survey method.
_SURVEY_SAMPLE_DIRS = {
    "human": "../data/survey_samples/human",
    "rome": "../data/survey_samples/rome",
    "no_edit": "../data/survey_samples/no_edit",
}

# survey method -> {sample id -> sample}
human_rated_samples = {
    survey_method: {
        get_sample_id(rated): rated
        for rated in get_samples_from_dir(samples_dir)
    }
    for survey_method, samples_dir in _SURVEY_SAMPLE_DIRS.items()
}

# Full wording of each survey question -> short survey item label
# (the labels are the keys of SURVEY_ITEM_TO_SAMPLES_TEMPLATE).
question_to_label = {
    'The main passage is written as if the new fact is true': 'new_fact_main_passage',
    'The related passage does not contradict the new fact': 'new_fact_related_passage',
    'Ignoring the new fact, most of the old facts are still true in the main passage.': 'main_passage_old_facts',
    'Ignoring the new fact, most of the old facts are still true in the related passage.': 'related_passage_old_facts',
    'Ignoring the old and new facts, the main passage does not contradict itself.': 'main_passage_consistency',
    'Ignoring the old and new facts, the related passage does not contradict itself.': 'related_passage_consistency',
    'Ignoring the old and new facts, the main passage and the related passage do not contradict each other.': 'cross_passage_consistency',
    'The main passage is focused on the subject and the related passage is focused on the related entity': 'topicality',
    'Both passages are natural sounding text close to what a human would write.': 'fluency'
}


def _human_survey_record(response_row):
    """Turn one row of the human survey CSV into a ratings-dataset record."""
    rated_id = response_row['sample_id']
    survey_method = response_row['method']
    rated_sample = human_rated_samples[survey_method][rated_id]
    item_label = question_to_label[response_row['question']]
    rendered_text = get_sample_text(
        rated_sample,
        templates_to_use=SURVEY_ITEM_TO_SAMPLES_TEMPLATE[item_label]
    )
    return {
        'label': item_label,
        'score': response_row['response'],
        'content': rendered_text,
        'intervention': survey_method,
        'model': 'human',
        'sample_id': rated_id,
        'split': 'human'
    }


# One record per human response, in CSV row order.
human_samples = [_human_survey_record(row) for _, row in human_df.iterrows()]

# Replaces the raw CSV frame (including the 'split' column added above) with
# the normalised records. fillna(4) fills missing values in every column;
# presumably 4 is the neutral midpoint of the rating scale -- TODO confirm.
human_df = pd.DataFrame(human_samples).fillna(4)

# Flatten the model-rated survey results into one record per individual score.
# analysis_data is nested as: method -> sample id -> survey item label -> scores.
# NOTE: despite its name, survey_dfs is a plain list of dicts, not DataFrames.
survey_dfs = []
for method, sample in analysis_data.items():
    # First underscore-separated token of the method name, e.g. 'gptj' or 'llama2'.
    base_model = method.split('_')[0]
    # Look the method's samples up once instead of once per score.
    generated_samples = sample_id_map[method]
    for sample_id, data in sample.items():
        for label, scores in data.items():
            templates_to_use = SURVEY_ITEM_TO_SAMPLES_TEMPLATE[label]
            for score in scores:
                # NOTE(review): the rendered text does not depend on `score`;
                # if get_sample_text is deterministic it could be computed
                # once per (sample, label) -- confirm before hoisting.
                survey_dfs.append({
                    'label': label,
                    'score': score,
                    'content': get_sample_text(
                        generated_samples[sample_id],
                        templates_to_use=templates_to_use
                    ),
                    'intervention': method,
                    'model': base_model,
                    'sample_id': sample_id,
                    'split': 'generated'
                })


# Human records first, then generated ones. fillna(4) fills missing values in
# every column; presumably 4 is the neutral midpoint of the rating scale --
# TODO confirm.
survey_df = pd.DataFrame(human_samples + survey_dfs).fillna(4)
# `index` is documented as a bool; pass False explicitly rather than relying on
# None being treated as falsy.
survey_df.to_csv('../results/survey_ratings_dataset.csv', index=False)

# In [57]:
import json
import re

# Locations of the exported annotation files.
final_path = "../data/annotation_data/longform_eval_final_results_annotations.json"
pretest_path = "../data/annotation_data/longform_eval_first_3_samples_paragraph_annotations (2).json"

# Each file is read exactly once, inside a `with` block so the handle is
# closed. `final` and `pretest` hold the parsed JSON; both are later indexed
# with ['examples'].
with open(final_path) as f:
    final = json.load(f)
with open(pretest_path) as f:
    pretest = json.load(f)

# Not used in the visible code; kept so later cells that may reference it
# still find the name.
pretest_df = []

# Short intervention name -> pretest annotation name.
intervention_map = {
    'human': 'llama2_chat_human_edit_pretest_annotation',
    'no_edit': 'llama2_chat_no_edit_pretest_annotation',
    'rome': 'llama2_chat_rome_edit_pretest_annotation',
}

# Question types paired with the annotation labels that belong to them, in the
# order the labels should appear in anno_question_type.
_LABELS_BY_QUESTION_TYPE = (
    ('New fact is true', ('new_fact_and_main_passage', 'new_fact_and_related_passage')),
    ('Old fact is true', ('old_fact_and_main_passage', 'old_fact_and_related_passage')),
    ('Ground truth is true', ('ground_truth_and_related_passage', 'ground_truth_and_main_passage')),
)

# Annotation label -> question type.
anno_question_type = {
    label_name: question_type
    for question_type, label_names in _LABELS_BY_QUESTION_TYPE
    for label_name in label_names
}


# Rating class name -> numeric value.
rating_to_number = {'supports': 2, 'neutral': 1, 'contradicts': 0}

# Extracts the text following "Claim: " (up to the end of that line) from an
# example's content.
_CLAIM_PATTERN = re.compile(r'Claim: (.*)')


def _rating_row(example, content, classification):
    """Build one annotations-dataset row for ``example``.

    Args:
        example: One entry of an annotation file's 'examples' list.
        content: Text stored in the row's 'content' column.
        classification: Tag or class name stored in 'classification'.

    Returns:
        A dict with the columns of the annotations dataset.
    """
    metadata = example['metadata']
    label = metadata['label']
    return {
        "content": content,
        "sample_id": metadata['sample'],
        "example_id": example['example_id'],
        "classification": classification,
        "method": metadata['intervention'],
        "label": label,
        'question_type': anno_question_type[label],
    }


def _collect_ratings(examples):
    """Flatten annotation-file examples into dataset rows.

    Every span annotation yields a row whose content is the annotated passage
    plus the example's claim; every (classification, rater) pair yields a row
    whose content is the example's full content.

    Args:
        examples: The 'examples' list of a parsed annotation file.

    Returns:
        A list of row dicts, in file order.

    Raises:
        ValueError: If an example's content contains no "Claim: " part.
    """
    rows = []
    for example in examples:
        claim_match = _CLAIM_PATTERN.search(example['content'])
        if claim_match is None:
            raise ValueError(
                f"No 'Claim: ' found in content of example {example.get('example_id')!r}"
            )
        claim = claim_match.group(1)

        for annotation in example['annotations']:
            # value, tag
            passage_and_claim = f"Passage: {annotation['value']}\nClaim: {claim}".strip()
            rows.append(_rating_row(example, passage_and_claim, annotation['tag']))

        for rating in example['classifications']:
            # One row per rater who chose this class, matching the number of
            # rows the per-rater loop has always produced.
            for _rater in rating['classified_by']:
                rows.append(_rating_row(example, example['content'], rating['classname']))
    return rows


# Pretest examples first, then the final ones.
ratings = _collect_ratings(pretest['examples']) + _collect_ratings(final['examples'])

annos_df = pd.DataFrame(ratings)
# Keeps only the first row for each distinct content.
# NOTE(review): this also discards differing classifications of the same
# content -- confirm that is intended.
annos_df.drop_duplicates(subset=["content"], inplace=True)
# `index` is documented as a bool; pass False explicitly rather than None.
annos_df.to_csv('../results/annotations_dataset.csv', index=False)
