In [2]:
# System message for the rating model (sent with role "system" in the
# ChatCompletion request below). It defines the 1-7 agreement scale, explains
# the six rating criteria, and fixes the "Label: Number" reply format that
# _parse_scores later splits on ':'.
# NOTE(review): the wording contains typos (e.g. "particpant") and the
# "Edit Consistency" definition overlaps with "Internal Consistency". The text
# is left untouched here because any change alters what the model is asked.
INSTRUCTION_PROMPT = """
You are a particpant in a survey about natural language generation systems. Please enter the survey to the best of your ability. 

For each survey item: Answer a number between 1 and 7
1 = Strongly Disagree
2 = Disagree
3 = Somewhat Disagree
4 = Neither Agree nor Disagree
5 = Somewhat Agree
6 = Agree
7 = Strongly Agree

The task is to evaluate passages of text after an edit was made to the knowledge of the natural language generation (edit made).
The natural language generation system should generate text that is consistent with the edit made regardless of whether it is factual or not.

Definitions:
Edit Consistency: 
Is the passage consistent with the edit?
Are all the statements in the passage consistent with itself?
Are there any statements that agree with or disagree with the edit?
Internal Consistency: 
Are the passages consistent with themselves?
Regardless of consistency with edit, does the passage contradict itself?
Cross Passage Consistency:
Do the passages contradict each other?
Cohesion/Topicality:
Are the passages about the subject or related entity?
Naturalness: 
Is the passage natural sounding text a native speaker would produce?
Factuality: 
How close is the generated statement to the ground truth provided?

Answer in the following format
Edit Consistency: Number
Internal Consistency: Number
Cross Passage Consistency: Number
Cohesion/Topicality: Number
Naturalness: Number
Factuality: Number

Do not provide any commentary
"""

# Survey items appended after every sample by get_survey_prompt; together with
# the sample text they form the "user" message. There is one statement per
# criterion, each to be rated with a number between 1 and 7.
# NOTE(review): the Naturalness item reads "natural sounding natural text",
# likely a duplicated word. It is left unchanged because it is part of the
# prompt sent to the model.
SURVEY_PROMPT = """
Survey:
Edit Consistency: The passages are consistent with the edit made.
Answer a number between 1 and 7

Internal Consistency: The passages are consistent with themselves, regardless of the edit made.
Answer a number between 1 and 7

Cross Passage Consistency: The passages are consistent with each other, regardless of the edit made.
Answer a number between 1 and 7

Cohesion/Topicality: The passages are about the subject or related entity. They do not veer off topic
Answer a number between 1 and 7

Naturalness: The passage is natural sounding natural text a native speaker would produce.
Answer a number between 1 and 7

Factuality: The generated statement is completely consistent with the provided ground truth.
Answer a number between 1 and 7
"""

In [12]:
import openai
import os
import time

# Take the API key from the OPENAI_API_KEY environment variable rather than
# typing it into the notebook, so the secret is never saved or committed with
# the file. Falls back to the previous empty-string value when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Directory that holds the markdown sample files, relative to this notebook.
EVAL_DIR = '../data/evaluation_samples/'
# Evaluation condition name -> file name of its samples inside EVAL_DIR.
EVAL_FILES = {
    'greedy_no_edit': 'greedy_no_edit_samples.md',
    'greedy': 'greedy_samples.md',
    'sampled_no_edit': 'sampled_no_edit_samples.md',
    'sampled': 'sampled_samples.md',
}

def get_survey_prompt(sample):
    """Return the user message: the sample text followed by the survey items."""
    prompt_parts = (sample, SURVEY_PROMPT)
    return "".join(prompt_parts)
    
def _parse_scores(answer_text):
    score_dict = {}
    # answer text consists of
    # label: value
    # label: value
    # ...

    for line in answer_text.split('\n'):
        if line == '':
            continue
        label, value = line.split(':')
        score_dict[label] = int(value)

    return score_dict
    
# Seconds to wait after each request (presumably to stay under the API rate
# limit; confirm the value against the account's quota).
REQUEST_DELAY_SECONDS = 10
# Marker that introduces every sample in the markdown files.
SAMPLE_HEADING = '## '

# eval_type -> {survey label -> list of integer scores, one entry per sample}
scores_by_type = {}
for eval_type, file_name in EVAL_FILES.items():
    # Read the whole file and cut it into samples at each "## " heading.
    with open(os.path.join(EVAL_DIR, file_name), 'r', encoding='utf-8') as f:
        samples = f.read().split('\n' + SAMPLE_HEADING)
    overall_scores = {}
    for sample in samples:
        # The split removes the heading marker from every chunk except one at
        # the very start of the file; drop it there too so that re-adding it
        # below never produces "## ## ...".
        if sample.startswith(SAMPLE_HEADING):
            sample = sample[len(SAMPLE_HEADING):]
        if sample.strip() == '':
            continue
        # Build the user message: restored heading + sample + survey items.
        prompt = get_survey_prompt(SAMPLE_HEADING + sample)
        # Ask the model to rate the sample.
        # NOTE(review): temperature=1 means ratings are sampled, so repeated
        # runs can give different averages.
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                "role": "system",
                "content": INSTRUCTION_PROMPT
                },
                {
                "role": "user",
                "content": prompt
                },
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # Extract the reply text and turn it into {label: score}.
        answer_text = response['choices'][0]['message']['content']
        scores = _parse_scores(answer_text)
        # Collect every score under its label.
        for label, score in scores.items():
            overall_scores.setdefault(label, []).append(score)
        # Pause before the next request.
        time.sleep(REQUEST_DELAY_SECONDS)
    # Report the mean score per label for this evaluation condition.
    print(eval_type)
    for label, label_scores in overall_scores.items():
        print(label, sum(label_scores) / len(label_scores))
    print()
    scores_by_type[eval_type] = overall_scores
    



greedy_no_edit
Edit Consistency 1.6666666666666667
Internal Consistency 6.555555555555555
Cross Passage Consistency 4.444444444444445
Cohesion/Topicality 6.888888888888889
Fluency 7.0
Factuality 1.8888888888888888

greedy
Edit Consistency 2.3333333333333335
Internal Consistency 4.444444444444445
Cross Passage Consistency 1.8888888888888888
Cohesion/Topicality 4.555555555555555
Fluency 7.0
Factuality 1.2222222222222223

sampled_no_edit
Edit Consistency 1.7777777777777777
Internal Consistency 6.666666666666667
Cross Passage Consistency 5.111111111111111
Cohesion/Topicality 7.0
Fluency 7.0
Factuality 1.8888888888888888

sampled
Edit Consistency 3.4444444444444446
Internal Consistency 4.555555555555555
Cross Passage Consistency 3.5555555555555554
Cohesion/Topicality 6.555555555555555
Fluency 6.888888888888889
Factuality 1.4444444444444444

