In [1]:
# !pip install inspect-ai

In [222]:
# imports

import json
import os
import pandas as pd
import re
import numpy as np
from inspect_ai import Task, task
from inspect_ai import eval as evaluate_model
from inspect_ai.dataset import json_dataset, Sample
from inspect_ai.scorer import includes, answer, Score, accuracy, stderr, scorer
from inspect_ai.solver import generate, prompt_template
from inspect_ai.log import list_eval_logs, read_eval_log

from statsmodels.stats.contingency_tables import mcnemar
import statsmodels.api as sm
np.random.seed(0)

# Dataset
## Source: [MoreHopQA final](https://github.com/Alab-NII/morehopqa/blob/main/datasets/files/morehopqa_final.json), letter-only
I loaded the data from the github repo, filtered to questions which had a “letter” answer type, in order to have consistent answer parsing. This resulted in 43 multi-hop questions.
I also parsed the provided “context” for each question into the following format:
- {topic title}
	- {topic sentence}
	- {topic sentence}

Note: I originally hoped to use the MoreHopQA dataset without the context, but the model seemed unable to answer any of the questions correctly due to factual recall errors.

In [78]:
# load data
demo_path = "demo_data_letter.json"
og_data_path = os.path.abspath(os.path.join("..", "morehopqa","datasets","files","morehopqa_final.json"))
if not os.path.exists(demo_path):
    # Build the cached subset once: keep every record whose answer type is
    # 'letter' so that answers can be parsed consistently.
    with open(og_data_path, 'r', encoding='utf-8') as f:
        all_records = json.load(f)
    demo_data = [rec for rec in all_records if rec['answer_type'] == 'letter']
    with open(demo_path, 'w', encoding='utf-8') as f:
        json.dump(demo_data, f, indent=2)
    # Report a short summary instead of dumping every record into the cell output.
    print(f"Wrote {len(demo_data)} of {len(all_records)} records to {demo_path}")

# load into inspect framework
def process_morehopqa(record):
    """Turn one raw MoreHopQA record into an Inspect ``Sample`` for the full question.

    The record's ``context`` (pairs of title and sentence list) is flattened
    into a bulleted text block: each title followed by ':' and then one
    tab-indented '- ' bullet per stripped sentence; topics are separated by
    a newline and '- '.

    Args:
        record: dict-like record providing the keys '_id', 'question',
            'answer', 'context', 'previous_question', 'previous_answer',
            'no_of_hops' and 'answer_type'.

    Returns:
        A ``Sample`` whose input is the question, whose target is the answer,
        and whose metadata carries the fields above plus the formatted context.
    """
    question = record['question']

    # One text block per topic: "<title>:" followed by its sentences as sub-bullets.
    topic_blocks = []
    for title, chunks in record['context']:
        lines = [title + ':']
        lines.extend(chunk.strip() for chunk in chunks)
        topic_blocks.append('\n\t- '.join(lines))
    context = '\n- '.join(topic_blocks)

    answer_text = record['answer']

    metadata = {}
    for key in ('previous_question', 'previous_answer', 'no_of_hops', 'answer_type'):
        metadata[key] = record[key]
    metadata['context'] = context
    metadata['question'] = question  # for prompt template
    metadata['answer'] = answer_text

    return Sample(
        id=record["_id"],
        input=question,
        target=answer_text,
        metadata=metadata,
    )
# Build the 2-hop dataset: every record in the cached JSON goes through process_morehopqa.
data = json_dataset(demo_path,
                    sample_fields = process_morehopqa)

# display data
print(f"Dataset has {len(data)} samples.")
# Use the dataset's public indexing and the Sample's `metadata` attribute
# rather than reaching into `__dict__` internals.
print("Example sample: ", json.dumps(data[0].metadata, indent=2))

Dataset has 43 samples.
Example sample:  {
  "previous_question": "Who was the producer of the 1994 American animated epic musical film which Mel Shaw animated?",
  "previous_answer": "Don Hahn",
  "no_of_hops": 2,
  "answer_type": "letter",
  "context": "Mel Shaw:\n\t- Mel Shaw (December 19, 1914 \u2013 November 22, 2012) was an American animator, design artist, writer, and artist.\n\t- Shaw was involved in the animation, story design, and visual development of numerous Disney animated films, beginning with \"Bambi\", which was released in 1942.\n\t- His other animated film credits, usually involving animation design or the story, included \"The Rescuers\" in 1977, \"The Fox and the Hound\" in 1981, \"The Black Cauldron\" in 1985, \"The Great Mouse Detective\" in 1986, \"Beauty and the Beast\" in 1991, and \"The Lion King\" in 1994.\n\t- He was named a Disney Legend in 2004 for his contributions to the Walt Disney Company.\n- The Lion King:\n\t- The Lion King is a 1994 American animat

In [117]:
# get previous question (1 hop)
def get_prev(record):
    """Turn one raw MoreHopQA record into an Inspect ``Sample`` for its previous (one-hop-shorter) question.

    The sample asks ``previous_question`` and expects ``previous_answer``.
    The stored ``no_of_hops`` is reduced by one, and the record's context is
    flattened into the same bulleted text layout used for the full question.

    Args:
        record: dict-like record providing the keys '_id', 'context',
            'previous_question', 'previous_answer', 'no_of_hops' and
            'answer_type'.

    Returns:
        A ``Sample`` whose input/target are the previous question/answer and
        whose metadata carries the adjusted hop count and formatted context.
    """
    question = record['previous_question']
    answer_text = record['previous_answer']

    metadata = {}
    for key in ('previous_question', 'previous_answer', 'no_of_hops', 'answer_type'):
        metadata[key] = record[key]
    # The previous question needs one hop fewer than the full question.
    metadata['no_of_hops'] -= 1

    # One text block per topic: "<title>:" followed by its sentences as sub-bullets.
    topic_blocks = []
    for title, chunks in record['context']:
        lines = [title + ':']
        lines.extend(chunk.strip() for chunk in chunks)
        topic_blocks.append('\n\t- '.join(lines))
    metadata['context'] = '\n- '.join(topic_blocks)

    metadata['question'] = question  # for prompt template
    metadata['answer'] = answer_text

    return Sample(
        id=record["_id"],
        input=question,
        target=answer_text,
        metadata=metadata,
    )
# Build the 1-hop dataset from the same cached JSON, using the previous question/answer.
data_prev = json_dataset(demo_path,
                         get_prev)
print(f"Dataset has {len(data_prev)} samples.")
# Use the dataset's public indexing and the Sample's `metadata` attribute
# rather than reaching into `__dict__` internals.
print("Example sample: ", json.dumps(data_prev[0].metadata, indent=2))

Dataset has 43 samples.
Example sample:  {
  "previous_question": "Who was the producer of the 1994 American animated epic musical film which Mel Shaw animated?",
  "previous_answer": "Don Hahn",
  "no_of_hops": 1,
  "answer_type": "letter",
  "context": "Mel Shaw:\n\t- Mel Shaw (December 19, 1914 \u2013 November 22, 2012) was an American animator, design artist, writer, and artist.\n\t- Shaw was involved in the animation, story design, and visual development of numerous Disney animated films, beginning with \"Bambi\", which was released in 1942.\n\t- His other animated film credits, usually involving animation design or the story, included \"The Rescuers\" in 1977, \"The Fox and the Hound\" in 1981, \"The Black Cauldron\" in 1985, \"The Great Mouse Detective\" in 1986, \"Beauty and the Beast\" in 1991, and \"The Lion King\" in 1994.\n\t- He was named a Disney Legend in 2004 for his contributions to the Walt Disney Company.\n- The Lion King:\n\t- The Lion King is a 1994 American animat

# Eval
I evaluated OpenAI’s "gpt-4o-mini-2024-07-18" model with temperature = 0, using the Inspect framework.

In [170]:
# custom scorer, to handle letter answers (2-hop) or multi-word answers (1-hop)
@scorer(metrics=[accuracy(), stderr()])
def acc_scorer(type='letter'):
    """Return the scorer to use for the given answer type.

    Args:
        type: 'letter' selects the built-in ``answer('letter')`` scorer; any
            other value selects the substring scorer defined below, intended
            for potentially multi-word answers.

    Returns:
        A scorer. The custom scorer yields ``Score(value=1)`` when the
        lower-cased target text occurs in the extracted answer text and
        ``Score(value=0)`` otherwise; the extracted text is stored as the
        score's ``answer``.
    """

    async def score(state, target):
        completion = state.output.completion.lower()
        if 'answer:' in completion:
            # Keep only what follows the last 'answer:' marker.
            candidate = completion.rpartition('answer:')[2]
        else:
            # No marker: fall back to the final line of the completion.
            candidate = completion.rpartition('\n')[2]
        found = target.text.lower() in candidate
        return Score(value=int(found), answer=candidate)

    if type == 'letter':  # answer scorer for letter answers
        return answer('letter')
    return score

In [200]:
# define inspect task, both with and without chain of thought (cot)

@task
def multihop_task(n_hops=2, context=True, cot=False):
    """Build the Inspect task for one (hops, context, chain-of-thought) condition.

    Args:
        n_hops: 2 selects the full-question dataset (`data`) with letter
            scoring; any other value selects the previous-question dataset
            (`data_prev`) with multi-word scoring.
        context: if truthy, use "template.txt" and instructions that refer to
            the CONTEXT; otherwise use "template_no_context.txt".
        cot: if truthy, use the instruction that asks the model to start with
            "Let's think step by step".

    Returns:
        A ``Task`` that fills the prompt template, generates a completion and
        scores it with both ``includes()`` and ``acc_scorer``.
    """
    # Dataset and the matching answer-scoring mode.
    if n_hops == 2:
        dataset, scorer_mode = data, 'letter'
    else:
        dataset, scorer_mode = data_prev, 'multi-word'

    # Template file and instruction text for the requested condition.
    if context:
        template = "template.txt"
        if cot:
            prompt = "Answer the QUESTION, using the CONTEXT for assistance. Start your response with \"Let's think step by step\" and then provide your answer at the end on its own line in the form \"ANSWER: $ANSWER\" (without quotes) where $ANSWER is the answer to the question.\n"
        else:
            prompt = "Answer the QUESTION, using the CONTEXT for assistance. Provide your answer at the end on its own line in the form \"ANSWER: $ANSWER\" (without quotes) where $ANSWER is the answer to the question."
    else:
        template = "template_no_context.txt"
        if cot:
            prompt = "Answer the QUESTION. Start your response with \"Let's think step by step\" and then provide your answer at the end on its own line in the form \"ANSWER: $ANSWER\" (without quotes) where $ANSWER is the answer to the question.\n"
        else:
            prompt = "Answer the QUESTION. Provide your answer at the end on its own line in the form \"ANSWER: $ANSWER\" (without quotes) where $ANSWER is the answer to the question."

    solver_steps = [
        prompt_template(template, instruction=prompt),
        generate(),
    ]
    scorers = [
        includes(),  # checks if answer is anywhere in the output
        acc_scorer(scorer_mode),  # checks if answer appears after the 'ANSWER:' flag
    ]
    return Task(
        dataset=dataset,
        solver=solver_steps,
        scorer=scorers,
    )

In [202]:
# run evaluation
# Enumerate the conditions first, then build one task per condition.
# NOTE(review): this grid only covers context=False; the with-context numbers
# reported further down presumably come from logs of an earlier run - confirm.
task_grid = [(n_hops, context, cot) for n_hops in (1, 2) for context in (False,) for cot in (False, True)]
tasks = [multihop_task(hops, context=with_context, cot=with_cot)
         for hops, with_context, with_cot in task_grid]
print(tasks)
evaluate_model(tasks, model="openai/gpt-4o-mini-2024-07-18", temperature=0)

Output()

[<inspect_ai._eval.task.task.Task object at 0x000001F552A3E110>, <inspect_ai._eval.task.task.Task object at 0x000001F553D8E8D0>, <inspect_ai._eval.task.task.Task object at 0x000001F555AA46D0>, <inspect_ai._eval.task.task.Task object at 0x000001F555AA40D0>]


Output()

Output()

Output()

# Results

In [204]:
# Collect every eval log into one long table: one row per scored sample,
# tagged with the task arguments (cot / n_hops / context) of its run.
logs = list_eval_logs()
frames = []
for log in logs:
    eval_log = read_eval_log(log)  # parse each log file once and reuse it below
    if not eval_log.samples:
        # Nothing to tabulate for this log (no samples recorded).
        continue
    log_args = eval_log.eval.task_args
    df = pd.DataFrame([s.__dict__ for s in eval_log.samples])
    df['cot'] = log_args['cot']
    df['n_hops'] = log_args['n_hops']
    # Logs whose task args carry no 'context' entry are treated as with-context runs.
    df['has_context'] = log_args.get('context', True)
    frames.append(df)
# Concatenate once at the end instead of growing the frame inside the loop.
results = pd.concat(frames) if frames else pd.DataFrame()
results.head(1)

Unnamed: 0,id,epoch,input,choices,target,sandbox,files,setup,messages,output,...,metadata,store,events,model_usage,error,attachments,limit,cot,n_hops,has_context
0,10cd47420baf11ebab90acde48001122_0,1,What is the first letter of the first name of ...,,g,,,,[content='QUESTION: What is the first letter o...,model='gpt-4o-mini-2024-07-18' choices=[ChatCo...,...,{'previous_question': 'Who is Joseph-André Tas...,{},"[timestamp=datetime.datetime(2025, 1, 25, 23, ...",{'openai/gpt-4o-mini-2024-07-18': input_tokens...,,{'4ed799301fa5b64d307a64bf9b418ff3': 'What is ...,,True,2,False


In [205]:
def get_includes_score(record):
    """Return True when the record's 'includes' score has the value 'C'."""
    includes_score = record['scores']['includes']
    return includes_score.value == 'C'
def get_answer_score(record):
    """Return True when the record's 'acc_scorer' score is 'C' or 1.

    Both forms are accepted because the score value may be either the letter
    grade 'C' or the integer 1, depending on which scorer produced it.
    """
    value = record['scores']['acc_scorer'].value
    return value in ('C', 1)
# Add one boolean column per scorer, computed row by row from the 'scores' entry.
results = results.assign(
    includes=lambda frame: frame.apply(get_includes_score, axis=1),
    accuracy=lambda frame: frame.apply(get_answer_score, axis=1),
)

In [None]:
# accuracy scores
# One table per context setting: rows = CoT off/on, columns = number of hops,
# cells = mean accuracy in percent.
for has_context in [True, False]:
    subset = results[results['has_context'] == has_context]
    acc = subset.pivot_table(index=['cot'], columns=['n_hops'], values=['accuracy'], aggfunc='mean') * 100
    # Derive the labels from the pivot itself so a missing condition cannot
    # shift or mislabel rows/columns (the fixed two-element lists would).
    acc.index = ['cot' if used_cot else 'no_cot' for used_cot in acc.index]
    acc.columns = [f'{hops}_hop' for hops in acc.columns.get_level_values(-1)]
    print(f'Accuracy with{"out" if not has_context else ""} context:')
    display(acc)

Accuracy with context:


Unnamed: 0,1_hop,2_hop
no_cot,95.348837,86.046512
cot,95.348837,93.023256


Accuracy without context:


Unnamed: 0,1_hop,2_hop
no_cot,18.604651,27.906977
cot,20.930233,30.232558


The accuracy scores for gpt-4o-mini on 1-hop questions with context were exactly the same with and without CoT, so I won't do any further stats on those.

### Statistical Analysis - Wald Test

In [217]:
# Aggregate to one row per condition: mean accuracy and number of samples (n).
condition_groups = results.groupby(['n_hops', 'has_context', 'cot'])
res_agg = condition_groups.agg(
    accuracy=('accuracy', 'mean'),
    n=('id', 'count'),
).reset_index()
res_agg

Unnamed: 0,n_hops,has_context,cot,accuracy,n
0,1,False,False,0.186047,43
1,1,False,True,0.209302,43
2,1,True,False,0.953488,43
3,1,True,True,0.953488,43
4,2,False,False,0.27907,43
5,2,False,True,0.302326,43
6,2,True,False,0.860465,43
7,2,True,True,0.930233,43


In [220]:
# This Wald Test p-value function wrapper is borrowed from my previous work (also written by me).
def p_value(data, target, comparisons=None, hypothesis=None, categories=['n_hops', 'has_context', 'cot'], verbose=True):
    """
    Calculate p-value(s) for the effect of categorical features on a target rate.

    Fits a binomial GLM (statsmodels' Binomial family) to the one-hot encoded
    categorical features, weighting each row by its number of trials, and runs
    Wald tests on the fitted coefficients.

    Parameters:
    - data: table of aggregated results. Must contain a column 'n' with the
      number of trials per observation, the `target` column, and every column
      named in `categories`.
    - target: name of the column to fit to (numeric). If its maximum exceeds 1
      the values are treated as percents and divided by 100.
    - comparisons: list of (category column name, value1, value2) tuples. Each
      one is tested as the hypothesis '{category}_{value1} = {category}_{value2}'.
    - hypothesis: a single, more complicated Wald test specification. If
      hypothesis is provided, comparisons is ignored.
    - categories: list of categorical features to include in the regression.
    - verbose: if True, print the model summary and each Wald test result.

    Returns:
    - dict mapping each hypothesis (string) to a tuple
      (Wald test p-value, fitted model parameters).

    Raises:
    - ValueError: if neither `comparisons` nor `hypothesis` is given.

    NOTE(review): every dummy level is kept (drop_first=False) in addition to
    the constant, so the design matrix is rank-deficient; individual
    coefficients should not be interpreted on their own, only contrasts such
    as the ones tested here.
    """
    if hypothesis is None and comparisons is None:
        # Fail early with a clear message instead of an opaque
        # "'NoneType' object is not iterable" after the model has been fitted.
        raise ValueError("p_value requires either `comparisons` or `hypothesis`.")

    # Remove unnecessary columns and one-hot encode the categorical features
    category_columns = list(categories)
    numericals = ['n', target]
    data_encoded = pd.get_dummies(data[category_columns + numericals], columns=category_columns, drop_first=False)

    # Format features to fit the model
    X = data_encoded.drop(numericals, axis=1).astype(int)
    y = data_encoded[target].astype(float)
    if max(y) > 1: # convert percents to decimals
        y = y/100

    X = sm.add_constant(X)
    # Sample weights indicating the number of trials for each observation
    sample_weight = data_encoded['n'].astype(int)
    # Fit the binomial GLM using the aggregated data
    model = sm.GLM(y, X, family=sm.families.Binomial(), freq_weights=sample_weight)
    # Named `fit` (not `results`) so it cannot be confused with the notebook-level `results` table.
    fit = model.fit()
    # Print the summary of the model
    if verbose:
        print(fit.summary())

    # Perform Wald test(s) to see if the coefficients are significantly different
    p_values = {}
    if hypothesis is None: # no specific hypothesis provided - use comparisons
        for category, value1, value2 in comparisons:
            hypothesis_string = f'{category}_{value1} = {category}_{value2}'
            wald_test_result = fit.wald_test(hypothesis_string, scalar=False)
            if verbose:
                print(wald_test_result)
            p_values[hypothesis_string] = wald_test_result.pvalue, fit.params
    else: # use specific hypothesis
        wald_test_result = fit.wald_test(hypothesis, scalar=False)
        if verbose:
            print(wald_test_result)
        p_values[hypothesis] = wald_test_result.pvalue, fit.params
    return p_values

In [223]:
# One Wald test per factor: 1 vs 2 hops, with vs without context, with vs without CoT.
p_value(
    res_agg,
    'accuracy',
    comparisons=[
        ('n_hops', 1, 2),
        ('has_context', True, False),
        ('cot', True, False),
    ],
)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               accuracy   No. Observations:                    8
Model:                            GLM   Df Residuals:                      340
Model Family:                Binomial   Df Model:                            3
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -99.106
Date:                Sat, 25 Jan 2025   Deviance:                       4.6445
Time:                        23:40:48   Pearson chi2:                     4.79
No. Iterations:                     5   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                 0.2754      0.06

{'n_hops_1 = n_hops_2': (array(0.65000263),
  const                0.275419
  n_hops_1             0.068961
  n_hops_2             0.206458
  has_context_False   -1.685267
  has_context_True     1.960686
  cot_False            0.023101
  cot_True             0.252318
  dtype: float64),
 'has_context_True = has_context_False': (array(8.22382235e-27),
  const                0.275419
  n_hops_1             0.068961
  n_hops_2             0.206458
  has_context_False   -1.685267
  has_context_True     1.960686
  cot_False            0.023101
  cot_True             0.252318
  dtype: float64),
 'cot_True = cot_False': (array(0.45012518),
  const                0.275419
  n_hops_1             0.068961
  n_hops_2             0.206458
  has_context_False   -1.685267
  has_context_True     1.960686
  cot_False            0.023101
  cot_True             0.252318
  dtype: float64)}

Only **having the context in the prompt** has a statistically significant effect on accuracy (Wald test, p ≈ 8e-27). Neither the number of hops (p ≈ 0.65) nor the use of chain of thought (p ≈ 0.45) is statistically significant.