In [1]:
# !pip install inspect-ai

In [151]:
# imports

import json
import os
import pandas as pd
import re
import numpy as np
from inspect_ai import Task, task
from inspect_ai import eval as evaluate_model
from inspect_ai.dataset import json_dataset, Sample
from inspect_ai.scorer import includes,pattern, answer, Score, accuracy, stderr, scorer, Scorer, Target
from inspect_ai.solver import generate, chain_of_thought,prompt_template
from inspect_ai.log import list_eval_logs, read_eval_log
from inspect_ai.solver._task_state import TaskState

from statsmodels.stats.proportion import proportion_confint, test_proportions_2indep
from statsmodels.stats.contingency_tables import mcnemar
np.random.seed(0)

# Dataset
## Source: [MoreHopQA final](https://github.com/Alab-NII/morehopqa/blob/main/datasets/files/morehopqa_final.json), letter-only
I loaded the data from the GitHub repo and kept only the questions with a “letter” answer type, so that answers could be parsed consistently. This resulted in 43 multi-hop questions.
I also parsed the provided “context” for each question into the following format:
- {topic title}
	- {topic sentence}
	- {topic sentence}

Note: I was originally hoping to use the MoreHopQA dataset without the context, but the model seemed unable to answer any of the questions correctly due to factual recall errors.

In [78]:
# load data
demo_path = "demo_data_letter.json"
og_data_path = os.path.abspath(os.path.join("..", "morehopqa","datasets","files","morehopqa_final.json"))
# Build the filtered file only once; later runs reuse the JSON already on disk.
if not os.path.exists(demo_path):
    with open(og_data_path, 'r', encoding='utf-8') as f:
        all_records = json.load(f)
    # Keep every question whose answer type is "letter" (no sub-sampling).
    demo_data = [s for s in all_records if s['answer_type'] == 'letter']
    # Print a one-line summary instead of dumping the whole dataset into the cell output.
    print(f"Writing {len(demo_data)} of {len(all_records)} records to {demo_path}")
    with open(demo_path, 'w', encoding='utf-8') as f:
        json.dump(demo_data, f, indent=2)

# load into inspect framework
def process_morehopqa(record):
    """Turn one raw MoreHopQA record into a Sample for the full question.

    The record's ``context`` (an iterable of ``(title, chunks)`` pairs) is
    flattened into text: each topic is ``title:`` followed by its stripped
    chunks, joined with ``"\\n\\t- "``; topics are joined with ``"\\n- "``.

    Args:
        record: mapping with the keys ``_id``, ``question``, ``answer``,
            ``context``, ``previous_question``, ``previous_answer``,
            ``no_of_hops`` and ``answer_type``.

    Returns:
        A ``Sample`` whose input is the question, whose target is the answer,
        and whose metadata carries the rendered context plus bookkeeping fields.
    """
    question = record['question']

    # One text block per topic: the title line, then its sentences as sub-bullets.
    topic_blocks = []
    for title, chunks in record['context']:
        topic_lines = [title + ':']
        topic_lines.extend(chunk.strip() for chunk in chunks)
        topic_blocks.append('\n\t- '.join(topic_lines))
    rendered_context = '\n- '.join(topic_blocks)

    expected = record['answer']

    carried_keys = ('previous_question', 'previous_answer', 'no_of_hops', 'answer_type')
    metadata = dict((key, record[key]) for key in carried_keys)
    metadata.update(
        context=rendered_context,
        question=question,  # for prompt template
        answer=expected,
    )
    return Sample(id=record["_id"], input=question, target=expected, metadata=metadata)
data = json_dataset(demo_path,
                    sample_fields=process_morehopqa)

# display data
print(f"Dataset has {len(data)} samples.")
# Index the dataset and read the public `metadata` attribute instead of reaching into `__dict__`.
print("Example sample: ", json.dumps(data[0].metadata, indent=2))

Dataset has 43 samples.
Example sample:  {
  "previous_question": "Who was the producer of the 1994 American animated epic musical film which Mel Shaw animated?",
  "previous_answer": "Don Hahn",
  "no_of_hops": 2,
  "answer_type": "letter",
  "context": "Mel Shaw:\n\t- Mel Shaw (December 19, 1914 \u2013 November 22, 2012) was an American animator, design artist, writer, and artist.\n\t- Shaw was involved in the animation, story design, and visual development of numerous Disney animated films, beginning with \"Bambi\", which was released in 1942.\n\t- His other animated film credits, usually involving animation design or the story, included \"The Rescuers\" in 1977, \"The Fox and the Hound\" in 1981, \"The Black Cauldron\" in 1985, \"The Great Mouse Detective\" in 1986, \"Beauty and the Beast\" in 1991, and \"The Lion King\" in 1994.\n\t- He was named a Disney Legend in 2004 for his contributions to the Walt Disney Company.\n- The Lion King:\n\t- The Lion King is a 1994 American animat

In [117]:
# get previous question (1 hop)
def get_prev(record):
    """Turn one raw MoreHopQA record into a Sample for its *previous* question.

    Uses ``previous_question`` / ``previous_answer`` as input and target, and
    stores ``no_of_hops`` reduced by one. The context is rendered the same way
    as for the full question: ``title:`` plus stripped chunks joined with
    ``"\\n\\t- "``, topics joined with ``"\\n- "``.

    Args:
        record: mapping with the keys ``_id``, ``context``,
            ``previous_question``, ``previous_answer``, ``no_of_hops`` and
            ``answer_type``.

    Returns:
        A ``Sample`` for the previous question, with the rendered context and
        bookkeeping fields in its metadata.
    """
    question = record['previous_question']
    expected = record['previous_answer']

    metadata = {}
    for key in ('previous_question', 'previous_answer', 'no_of_hops', 'answer_type'):
        metadata[key] = record[key]
    metadata['no_of_hops'] -= 1  # the previous question needs one hop fewer

    # One text block per topic: the title line, then its sentences as sub-bullets.
    rendered_topics = []
    for title, chunks in record['context']:
        stripped_chunks = [chunk.strip() for chunk in chunks]
        rendered_topics.append('\n\t- '.join([title + ':'] + stripped_chunks))
    metadata['context'] = '\n- '.join(rendered_topics)

    metadata['question'] = question  # for prompt template
    metadata['answer'] = expected
    return Sample(id=record["_id"], input=question, target=expected, metadata=metadata)
data_prev = json_dataset(demo_path,
                         sample_fields=get_prev)
print(f"Dataset has {len(data_prev)} samples.")
# Index the dataset and read the public `metadata` attribute instead of reaching into `__dict__`.
print("Example sample: ", json.dumps(data_prev[0].metadata, indent=2))

Dataset has 43 samples.
Example sample:  {
  "previous_question": "Who was the producer of the 1994 American animated epic musical film which Mel Shaw animated?",
  "previous_answer": "Don Hahn",
  "no_of_hops": 1,
  "answer_type": "letter",
  "context": "Mel Shaw:\n\t- Mel Shaw (December 19, 1914 \u2013 November 22, 2012) was an American animator, design artist, writer, and artist.\n\t- Shaw was involved in the animation, story design, and visual development of numerous Disney animated films, beginning with \"Bambi\", which was released in 1942.\n\t- His other animated film credits, usually involving animation design or the story, included \"The Rescuers\" in 1977, \"The Fox and the Hound\" in 1981, \"The Black Cauldron\" in 1985, \"The Great Mouse Detective\" in 1986, \"Beauty and the Beast\" in 1991, and \"The Lion King\" in 1994.\n\t- He was named a Disney Legend in 2004 for his contributions to the Walt Disney Company.\n- The Lion King:\n\t- The Lion King is a 1994 American animat

# Eval
I evaluated OpenAI’s "gpt-4o-mini-2024-07-18" model with temperature = 0, using the Inspect framework.

In [170]:
# custom scorer, to handle letter answers (2-hop) or multi-word answers (1-hop)
@scorer(metrics=[accuracy(), stderr()])
def acc_scorer(type='letter'):
    """Build the scorer that checks the model's final answer.

    Args:
        type: ``'letter'`` delegates to the built-in ``answer('letter')``
            scorer. Any other value selects the substring scorer defined
            below, intended for potentially multi-word answers. (The name
            shadows the builtin ``type``; it is kept so existing callers
            keep working.)

    Returns:
        A scorer. The substring scorer returns a ``Score`` whose value is 1
        when the lower-cased target text occurs in the extracted answer and 0
        otherwise, and whose ``answer`` is the extracted text.
    """
    if type == 'letter': # answer scorer for letter answers
        return answer('letter')

    async def score(state: TaskState, target: Target) -> Score:
        # answer scorer for potentially multi-word answers
        # Strip first: a completion ending in "\n" would otherwise make the
        # "last line" fallback below an empty string, which can never match.
        completion = state.output.completion.lower().strip()
        if 'answer:' in completion:
            # Text after the last 'answer:' marker.
            candidate = completion.split('answer:')[-1]
        else:
            # No marker: fall back to the last line of the completion.
            candidate = completion.split('\n')[-1]
        match = int(target.text.lower() in candidate)
        return Score(value=match, answer=candidate.strip())

    return score

In [173]:
# define inspect task, both with and without chain of thought (cot)
# Prompt template file, plus the instruction text for each prompting condition.
template = "template.txt"

# Instruction without chain of thought: answer directly in the "ANSWER: ..." format.
no_cot_prompt = (
    "Answer the QUESTION, using the CONTEXT for assistance. "
    'Provide your answer at the end on its own line in the form "ANSWER: $ANSWER" '
    "(without quotes) where $ANSWER is the answer to the question."
)

# Chain-of-thought instruction: same answer format, but asks for step-by-step reasoning first.
cot_prompt = (
    "Answer the QUESTION, using the CONTEXT for assistance. "
    "Start your response with \"Let's think step by step\" and then provide your answer "
    'at the end on its own line in the form "ANSWER: $ANSWER" '
    "(without quotes) where $ANSWER is the answer to the question.\n"
)

@task
def multihop_task(n_hops=2, cot=False):
    """Build the Inspect task for one (hop count, prompting style) condition.

    Args:
        n_hops: 2 selects the full questions (``data``, letter answers);
            1 selects the previous questions (``data_prev``, multi-word answers).
        cot: when true, use the chain-of-thought instruction; otherwise use
            the direct-answer instruction.

    Returns:
        A ``Task`` that fills the prompt template, generates a completion, and
        scores it with both ``includes()`` and ``acc_scorer``.

    Raises:
        ValueError: if ``n_hops`` is neither 1 nor 2. Previously any value
            other than 2 silently ran as the 1-hop task.
    """
    if n_hops == 2:
        dataset, answer_kind = data, 'letter'
    elif n_hops == 1:
        dataset, answer_kind = data_prev, 'multi-word'
    else:
        raise ValueError(f"n_hops must be 1 or 2, got {n_hops!r}")

    prompt = cot_prompt if cot else no_cot_prompt
    return Task(
        dataset = dataset,
        solver = [prompt_template(template, instruction=prompt),
                  generate()
                  ],
        scorer = [includes(), # checks if answer is anywhere in the output
                  acc_scorer(answer_kind) # checks if answer appears after the 'ANSWER:' flag
                  ],
    )

In [174]:
# run evaluation
# Conditions in run order: 1-hop (no CoT, CoT), then 2-hop (no CoT, CoT).
conditions = [(hops, use_cot) for hops in (1, 2) for use_cot in (False, True)]
tasks = [multihop_task(hops, cot=use_cot) for hops, use_cot in conditions]
print(tasks)
evaluate_model(tasks, model="openai/gpt-4o-mini-2024-07-18", temperature=0)

[<inspect_ai._eval.task.task.Task object at 0x000001F553EB7750>, <inspect_ai._eval.task.task.Task object at 0x000001F55584B110>, <inspect_ai._eval.task.task.Task object at 0x000001F555849450>, <inspect_ai._eval.task.task.Task object at 0x000001F555848DD0>]


Output()

Output()

Output()

Output()

# Results

In [185]:
# Collect the per-sample records of every eval log into one DataFrame,
# tagging each row with the task arguments (`cot`, `n_hops`) of its run.
# NOTE(review): list_eval_logs() is called without arguments; if the log
# directory also holds logs from earlier runs they would presumably be
# included here too. Confirm before re-running the evaluation.
logs = list_eval_logs()
frames = []
for log in logs:
    # Read each log file once and reuse it for both the samples and the task arguments.
    eval_log = read_eval_log(log)
    task_args = eval_log.eval.task_args
    df = pd.DataFrame([s.__dict__ for s in eval_log.samples])
    df['cot'] = task_args['cot']
    df['n_hops'] = task_args['n_hops']
    frames.append(df)
# Concatenate once at the end instead of growing the frame inside the loop;
# fall back to an empty frame when there are no logs (pd.concat rejects an empty list).
results = pd.concat(frames) if frames else pd.DataFrame()
results.head(1)

Unnamed: 0,id,epoch,input,choices,target,sandbox,files,setup,messages,output,scores,metadata,store,events,model_usage,error,attachments,limit,cot,n_hops
0,10cd47420baf11ebab90acde48001122_0,1,What is the first letter of the first name of ...,,g,,,,[content='CONTEXT:\n- Thomas-Pierre-Joseph Tas...,model='gpt-4o-mini-2024-07-18' choices=[ChatCo...,{'includes': value=1 answer='Let\'s think step...,{'previous_question': 'Who is Joseph-André Tas...,{},"[timestamp=datetime.datetime(2025, 1, 25, 22, ...",{'openai/gpt-4o-mini-2024-07-18': input_tokens...,,{'4ed799301fa5b64d307a64bf9b418ff3': 'What is ...,,True,2


In [193]:
def get_includes_score(record):
    """Return True when the `includes` scorer marked this sample correct.

    Args:
        record: a results row (or any mapping) whose ``'scores'`` entry maps
            scorer names to objects with a ``value`` attribute.

    Returns:
        True when the stored value is the letter grade ``'C'`` or the numeric
        ``1``, False otherwise. Both forms are accepted, matching
        ``get_answer_score``; checking only ``'C'`` marks a numeric ``1`` as
        incorrect.
    """
    return record['scores']['includes'].value in ('C', 1)
def get_answer_score(record):
    """Return True when the `acc_scorer` score marks this sample correct.

    Args:
        record: a results row (or any mapping) whose ``'scores'`` entry maps
            scorer names to objects with a ``value`` attribute.

    Returns:
        True when the stored value is ``'C'`` or ``1``, False otherwise.
    """
    correct_values = ('C', 1)
    stored_value = record['scores']['acc_scorer'].value
    return stored_value in correct_values
# Add one boolean column per scorer, computed row by row from the `scores` dict.
for column_name, extract_score in (('includes', get_includes_score),
                                   ('accuracy', get_answer_score)):
    results[column_name] = results.apply(extract_score, axis=1)

In [198]:
# accuracy scores
# Mean accuracy (in percent) per prompting condition (rows) and hop count (columns).
# Labels are mapped from the actual `cot` / `n_hops` values rather than assigned by
# position, so they cannot be mislabelled if a condition is missing or reordered.
acc = (
    results
    .pivot_table(index='cot', columns='n_hops', values='accuracy', aggfunc='mean')
    .mul(100)
    .rename(index={False: 'no_cot', True: 'cot'},
            columns={1: '1_hop', 2: '2_hop'})
    .rename_axis(index=None, columns=None)
)
acc

Unnamed: 0,1_hop,2_hop
no_cot,95.348837,86.046512
cot,95.348837,93.023256


The accuracy scores for gpt-4o-mini on 1-hop questions were exactly the same with and without CoT, so I won't do any further stats on those.

### Statistical Analysis

In [85]:
# 95% binomial confidence intervals for accuracy (2-hop)
two_hop_accuracy = results[results['n_hops'] == 2].groupby('cot')['accuracy']
# Use each condition's own sample count as `nobs`, rather than assuming the
# 2-hop rows are split evenly between the two conditions.
proportion_confint(two_hop_accuracy.sum(),
                   nobs=two_hop_accuracy.count(),
                   method='binom_test',
                   alpha=0.05)

(cot
 False    0.722968
 True     0.815017
 dtype: float64,
 cot
 False    0.937444
 True     0.980715
 dtype: float64)

In [89]:
# mcnemar test on accuracy (2-hop)
# Pair each question's no-CoT and CoT outcomes: one row per sample id,
# one column per `cot` value (False is no_cot, True is cot).
two_hop_results = results[results['n_hops'] == 2]
pivoted = two_hop_results.pivot(index='id', columns='cot', values='accuracy')

# 2x2 table indexed as contingency[no_cot correct][cot correct].
contingency = [[0, 0], [0, 0]]
for no_cot_correct, cot_correct in zip(pivoted[False], pivoted[True]):
    contingency[int(no_cot_correct)][int(cot_correct)] += 1
print('Contingency table:', contingency)
print('McNemar test results:')
print(mcnemar(contingency, exact=True))

Contingency table: [[3, 3], [0, 37]]
McNemar test results:
pvalue      0.25
statistic   0.0


The difference in 2-hop accuracy between the CoT and no-CoT conditions is **not statistically significant** (exact McNemar test, p = 0.25).