# Self-Ask Exploration

The goal is to test out self-ask on complex questions from QAMPARI and RoMQA.

We'll start with QAMPARI, getting all of the complex questions from the dev set.

In [1]:
import jsonlines
import numpy as np
import random

**Load & Sort QAMPARI Data**

In [2]:
qmp_dev_data_path = "/scratch/ddr8143/multiqa/downloads/data/qampari/dev_data.jsonl"
qmp_train_data_path = "/scratch/ddr8143/multiqa/downloads/data/qampari/train_data.jsonl"

In [3]:
qmp_comp_devd = []
with open(qmp_dev_data_path) as f:
    qmp_devd_iter = jsonlines.Reader(f)
    for d in qmp_devd_iter:
        if 'wikitables' in d['qid'] or 'wikidata_simple' in d['qid']:
            continue
        qmp_comp_devd.append(d)

In [4]:
print("Num Dev Non-Simple Wikidata Qs:", len(qmp_comp_devd))

Num Dev Non-Simple Wikidata Qs: 400


In [5]:
qmp_dev = []
with open(qmp_dev_data_path) as f:
    qmp_devd_iter = jsonlines.Reader(f)
    for d in qmp_devd_iter:
        if 'wikitables' in d['qid']:
            continue
        qmp_dev.append(d)

In [6]:
print("Num Dev Wikidata Qs:", len(qmp_dev))

Num Dev Wikidata Qs: 815


In [7]:
qmp_train = []
with open(qmp_train_data_path) as f:
    qmp_traind_iter = jsonlines.Reader(f)
    for d in qmp_traind_iter:
        if 'wikitables' in d['qid']:
            continue
        qmp_train.append(d)

In [8]:
print("Num Train Wikidata Qs:", len(qmp_train))

Num Train Wikidata Qs: 56075


In [9]:
for i in range(30):
    print(i, qmp_comp_devd[i]['question_text'])

0 Harmony Korine was both screenwriter and director of what movie?
1 Where did administrators of the UN Development Programme attend school?
2 Who directed a film that had P. Balachandran as a screenwriter?
3 Where was a Bishop of Bradford taught?
4 For which movie did Mani Ratnam work on the script and serve as producer?
5 From which country did Seattle Storm make draft selections?
6 who are alumnus of both Upper Canada College and Trinity College?
7 Where did a First-Circuit Appeals Court Judge of the United States attend college?
8 Who directed the TV show whose screenplay was written by B. J. Novak?
9 Which movie was both directed and screenwritten by Kamal Haasan?
10 Arsenal F.C. competed in and won which competition?
11 Joe Pasternak produced a motion picture that was directed by who?
12 Who won a Coke Zero Sugar 400 competition?
13 What team emerged victorious in the competition that included the Detroit Tigers?
14 What film was directed by Radha Mohan and produced by Prakash Ra

In [10]:
for i in range(10):
    print(i, qmp_train[i]['question_text'])

0 Which movie, clip, TV show etc. had Chezhiyan as director of photography?
1 Which movie, clip, TV show etc. had Andrew Droz Palermo as director of photography?
2 Which movie, clip, TV show etc. had Nizar Shafi as director of photography?
3 Which movie, clip, TV show etc. had Steven Soderbergh as director of photography?
4 Which movie, clip, TV show etc. had Haris Savides as director of photography?
5 Which movie, clip, TV show etc. had Philip H. Lathrop as director of photography?
6 Which movie, clip, TV show etc. had Claude Lelouch as director of photography?
7 Which movie, clip, TV show etc. had Russ Meyer as director of photography?
8 Which movie, clip, TV show etc. had Herman Schopp as director of photography?
9 Which movie, clip, TV show etc. had Peter Hyams as director of photography?


## Create Final Prompts with Train Data

**Sample equal number of samples from each type for prompt**

In [11]:
qtype_id_lists = {
    "wikidata_simple": [],
    "wikidata_comp": [], 
    "wikidata_intersection": [],
}
for i, qd in enumerate(qmp_train):
    for qtype in qtype_id_lists.keys():
        if qtype in qd['qid']:
            qtype_id_lists[qtype].append(i)
            
for k, v in qtype_id_lists.items():
    print(f"{k}: {len(v)}")

wikidata_simple: 28574
wikidata_comp: 25200
wikidata_intersection: 2301


In [104]:
for i in qtype_id_lists['wikidata_comp'][:1000]:
    print(i, qmp_train[i]['question_text'])
    

28574 What are the publication dates of movie, clip, TV show etc. that  had Chezhiyan as director of photography?
28575 What are the publication dates of movie, clip, TV show etc. that  had George Albert Smith as director of photography?
28576 What are the publication dates of movie, clip, TV show etc. that  had Arthur Crabtree as director of photography?
28577 What are the publication dates of movie, clip, TV show etc. that  had Adam Arkapaw as director of photography?
28578 What are the publication dates of movie, clip, TV show etc. that  had Amal Neerad as director of photography?
28579 Who are the cast members of movie, clip, TV show etc. that  had Arthur Crabtree as director of photography?
28580 Who are the actors in a media where Sameer Thahir is the director of photography?
28581 Who are the cast members of movie, clip, TV show etc. that  had Adam Arkapaw as director of photography?
28582 Who are the cast members of movie, clip, TV show etc. that  had Amal Neerad as director of

In [21]:
ninds = 10
inds_in_list_to_use = {
    k: list(np.random.randint(0, high=len(qdata), size=ninds)) for k, qdata in qtype_id_lists.items()
}
for k, ilist in inds_in_list_to_use.items():
    inds_to_use[k] = [qtype_id_lists[k][i] for i in ilist]
    
for k, v in inds_to_use.items():
    print(k, v)

wikidata_simple [16043, 4484, 11212, 7078, 22726, 22999, 13441, 26081, 8947, 25243]
wikidata_comp [49871, 53056, 40308, 36854, 44983, 40413, 44919, 51530, 45160, 38873]
wikidata_intersection [55083, 54453, 54839, 53887, 55449, 55315, 55284, 54935, 54931, 55950]


In [69]:
def sort_question_types(input_data):
    qtype_id_lists = {
        "wikidata_simple": [],
        "wikidata_comp": [], 
        "wikidata_intersection": [],
    }
    for i, qd in enumerate(input_data):
        for qtype in qtype_id_lists.keys():
            if qtype in qd['qid']:
                qtype_id_lists[qtype].append(i)

    for k, v in qtype_id_lists.items():
        print(f"{k}: {len(v)}")
        
    return qtype_id_lists

In [72]:
def sample_n_ids_each(input_data, ninds, qtype_lists):
    inds_to_use = {}
    inds_in_list_to_use = {
        k: list(np.random.randint(0, high=len(qdata), size=ninds)) for k, qdata in qtype_lists.items()
    }
    for k, ilist in inds_in_list_to_use.items():
        inds_to_use[k] = [qtype_lists[k][i] for i in ilist]

    for k, v in inds_to_use.items():
        print(k, v)
    return inds_to_use

In [None]:
# Fix the inds for train data to make prompts
inds_to_use = {
    "wikidata_simple": [16043, 4484, 11212, 7078, 22726, 22999, 13441, 26081, 8947, 25243],
    "wikidata_comp": [49871, 53056, 40308, 36854, 44983, 40413, 44919, 51530, 45160, 38873],
    "wikidata_intersection": [55083, 54453, 54839, 53887, 55449, 55315, 55284, 54935, 54931, 55950],
}

In [73]:
# But sample new dev data inds
dev_qtype_id_lists = sort_question_types(qmp_dev)
print()
dev_inds_to_use = sample_n_ids_each(qmp_dev, 20, dev_qtype_id_lists)

wikidata_simple: 415
wikidata_comp: 200
wikidata_intersection: 200

wikidata_simple [152, 189, 76, 536, 591, 197, 76, 478, 258, 463, 573, 501, 648, 591, 401, 392, 404, 158, 790, 238]
wikidata_comp [515, 522, 723, 154, 678, 671, 202, 79, 343, 719, 584, 62, 767, 107, 802, 662, 379, 488, 618, 569]
wikidata_intersection [735, 380, 582, 92, 71, 178, 365, 410, 596, 677, 553, 633, 600, 1, 780, 611, 652, 673, 63, 222]


In [11]:
dev_inds_to_use = {
    "wikidata_simple": [152, 189, 76, 536, 591, 197, 76, 478, 258, 463, 573, 501, 648, 591, 401, 392, 404, 158, 790, 238],
    "wikidata_comp": [515, 522, 723, 154, 678, 671, 202, 79, 343, 719, 584, 62, 767, 107, 802, 662, 379, 488, 618, 569],
    "wikidata_intersection": [735, 380, 582, 92, 71, 178, 365, 410, 596, 677, 553, 633, 600, 1, 780, 611, 652, 673, 63, 222],
}

## Create Prompt from the first n indices

In [80]:
for i in inds_to_use["wikidata_simple"]:
    print(i, qmp_train[i]['question_text'])

16043 Who played for the Stirling Albion F.C.?
4484 Which software, art, etc. has Sean Maguire as performer?
11212 Which spatial entity is located in Gmina Skąpe?
7078 Which software, art, etc. has Don Broco as performer?
22726 Which film has Saritha as a member of its cast
22999 Which film has Jimmy Hanley as a member of its cast
13441 Which spatial entity is located in Saldus Municipality?
26081 Who was born in Rossington?
8947 Which spatial entity is located in Zhuzhou?
25243 Who was born in Leeds?


In [30]:
for i in inds_to_use["wikidata_comp"]:
    print(i, qmp_train[i]['question_text'])

49871 What are the dates of death of persons that were a member of the political party Australian Labor Party (Anti-Communist)?
53056 What are the dates of birth of persons that were influenced by Ion Luca Caragiale?
40308 What are the locations of buildings that were designed by John S. Van Bergen?
36854 Who are the cast members of film that  had Stephen Sommers as screenwriter?
44983 What are the dates of birth of persons that received the award Kumar Suvarna Chandrak?
40413 What are the locations of entities that were operated by Okinawa Prefecture?
44919 What are the dates of birth of persons that received the award National Book Award?
51530 What are the publication dates of songs that had its lyrics written by Claude Kelly?
45160 What are the dates of birth of persons that are a Chino Hills High School alumni?
38873 What are the performers of songs that had its lyrics written by John Fogerty?


In [31]:
for i in inds_to_use["wikidata_intersection"]:
    print(i, qmp_train[i]['question_text'])

55083 Which film has Anna Neagle as a member of its cast and was directed by Herbert Wilcox?
54453 Which film has Ilaiyaraaja as performer and was directed by G. N. Rangarajan?
54839 Which film has M. G. Ramachandran as a member of its cast and has J. Jayalalithaa as a member of its cast
53887 Who played for the Hartford Whalers and played for the Columbus Blue Jackets?
55449 Which film has Mohanlal as a member of its cast and was produced by Mohanlal?
55315 Which film has Fearless Nadia as a member of its cast and was directed by Homi Wadia?
55284 Which film has Curly Howard as a member of its cast and was directed by Del Lord?
54935 Which film has Urmila Matondkar as a member of its cast and was directed by Ram Gopal Varma?
54931 Which film has Ramya Krishnan as a member of its cast and was directed by K. Raghavendra Rao?
55950 Which infrastructure is a part of Muni Metro and What entities were operated by San Francisco Municipal Railway?


In [41]:
official_train_decomp = {
    "wikidata_simple": {
        i: {"question_type": "simple", "question_text": qmp_train[i]['question_text']} 
        for i in [16043, 4484, 11212, 7078, 22726, 22999, 13441, 26081, 8947, 25243]
    },
    "wikidata_comp": {
        49871: {
            'question_text': 'What are the dates of death of persons that were a member of the political party Australian Labor Party (Anti-Communist)?',
            'question_type': 'composition',
            'subquestions': [
                'Who was a member of the political party Australian Labor Party (Anti-Communist)?',
                'What is the date of death of [ANS1]?',
            ]
        },
        53056: {
            'question_text': 'What are the dates of birth of persons that were influenced by Ion Luca Caragiale?',
            'question_type': 'composition',
            'subquestions': [
                'Who was influenced by Ion Luca Caragiale?',
                'What was the date of birth of [ANS1]?',
            ]
        },
        40308: {
            'question_text': 'What are the locations of buildings that were designed by John S. Van Bergen?',
            'question_type': 'composition',
            'subquestions': [
                'What buildings were designed by John S. Van Bergen?',
                'What is the location of [ANS1]?',
            ]
        },
        36854: {
            'question_text': 'Who are the cast members of film that  had Stephen Sommers as screenwriter?',
            'question_type': 'composition',
            'subquestions': [
                'What films had Stephen Sommers as screenwriter?',
                'Who are the cast members of [ANS1]?',
            ]
        },
        44983: {
            'question_text': 'What are the dates of birth of persons that received the award Kumar Suvarna Chandrak?',
            'question_type': 'composition',
            'subquestions': [
                'Who received the award Kumar Suvarna Chandrak?',
                'What is the date of birth of [ANS1]?',
            ]
        },
        40413: {
            'question_text': 'What are the locations of entities that were operated by Okinawa Prefecture?',
            'question_type': 'composition',
            'subquestions': [
                'What entities were operated by Okinawa Prefecture?',
                'What is the location of [ANS1]?',
            ]
        },
        44919: {
            'question_text': 'What are the dates of birth of persons that received the award National Book Award?',
            'question_type': 'composition',
            'subquestions': [
                'Who received the award National Book Award?',
                'What is the date of birth of [ANS1]?',
            ]
        },
        51530: {
            'question_text': 'What are the publication dates of songs that had its lyrics written by Claude Kelly?',
            'question_type': 'composition',
            'subquestions': [
                'What songs had their lyrics written by Claude Kelly?',
                'What is the publication date of [ANS1]?',
            ]
        },
        45160: {
            'question_text': 'What are the dates of birth of persons that are a Chino Hills High School alumni?',
            'question_type': 'composition',
            'subquestions': [
                'Who are Chino Hills High School alumni?',
                'What is the date of birth of [ANS1]?',
            ]
        },
        38873: {
            'question_text': 'What are the performers of songs that had its lyrics written by John Fogerty?',
            'question_type': 'composition',
            'subquestions': [
                'What songs had their lyrics written by John Fogerty?',
                'Who is the performer of [ANS1]?',
            ]
        },
    },
    "wikidata_intersection": {
        55083: {
            'question_text': 'Which film has Anna Neagle as a member of its cast and was directed by Herbert Wilcox?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Anna Neagle as a member of its cast?',
                'Which film was directed by Herbert Wilcox?',
            ]
        },
        54453: {
            'question_text': 'Which film has Ilaiyaraaja as performer and was directed by G. N. Rangarajan?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Ilaiyaraaja as performer?',
                'Which film was directed by G. N. Rangarajan?',
            ]
        },
        54839: {
            'question_text': 'Which film has M. G. Ramachandran as a member of its cast and has J. Jayalalithaa as a member of its cast',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has M. G. Ramachandran as a member of its cast?',
                'Which film has J. Jayalalithaa as a member of its cast?',
            ]
        },
        53887: {
            'question_text': 'Who played for the Hartford Whalers and played for the Columbus Blue Jackets?',
            'question_type': 'intersection',
            'subquestions': [
                'Who played for the Hartford Whalers?',
                'Who played for the Columbus Blue Jackets?',
            ]
        },
        55449: {
            'question_text': 'Which film has Mohanlal as a member of its cast and was produced by Mohanlal?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Mohanlal as a member of its cast?',
                'Which film was produced by Mohanlal?',
            ]
        },
        55315: {
            'question_text': 'Which film has Fearless Nadia as a member of its cast and was directed by Homi Wadia?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Fearless Nadia as a member of its cast?',
                'Which film was directed by Homi Wadia?',
            ]
        },
        55284: {
            'question_text': 'Which film has Curly Howard as a member of its cast and was directed by Del Lord?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Curly Howard as a member of its cast?',
                'Which film was directed by Del Lord?',
            ]
        },
        54935: {
            'question_text': 'Which film has Urmila Matondkar as a member of its cast and was directed by Ram Gopal Varma?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Urmila Matondkar as a member of its cast?',
                'Which film was directed by Ram Gopal Varma?',
            ]
        },
        54931: {
            'question_text': 'Which film has Ramya Krishnan as a member of its cast and was directed by K. Raghavendra Rao?',
            'question_type': 'intersection',
            'subquestions': [
                'Which film has Ramya Krishnan as a member of its cast?',
                'Which film was directed by K. Raghavendra Rao?',
            ]
        },
        55950: {
            'question_text': 'Which infrastructure is a part of Muni Metro and What entities were operated by San Francisco Municipal Railway?',
            'question_type': 'intersection',
            'subquestions': [
                'Which infrastructure is a part of Muni Metro?',
                'What entities were operated by San Francisco Municipal Railway?',
            ]
        },
    },
}


In [57]:
def qdata_to_print_prompt(all_qdata, qdecompose, num_each=3, include_simple=False, shuffle=True):
    prompt_list = []
    if include_simple:
        i = 0 
        for qtid, decomp in qdecompose['wikidata_simple'].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Can this be decomposed: No.""".format(
                init_q=decomp['question_text'],
            ))
            i += 1
        
    for qtype in ['wikidata_comp', 'wikidata_intersection']:
        i = 0
        for qtid, decomp in qdecompose[qtype].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Can this be decomposed: Yes.
Is this a composition or intersection question: {qtype}.
Question 1: {subqs1}
Question 2: {subqs2}
So the final answers are: {answer_list}.""".format(
                init_q=all_qdata[qtid]['question_text'],
                qtype=decomp['question_type'],
                subqs1=decomp['subquestions'][0],
                subqs2=decomp['subquestions'][1],
                answer_list=", ".join(list(set([a['answer_text'] for a in all_qdata[qtid]['answer_list']]))[:5]),
                
            ))
            i += 1

    if shuffle:
        random.shuffle(prompt_list)
    for p in prompt_list:
        print(p)

In [87]:
def qdata_to_print_prompt_v2(all_qdata, qdecompose, num_each=3, include_simple=False, shuffle=True):
    prompt_list = []
    if include_simple:
        i = 0 
        for qtid, decomp in qdecompose['wikidata_simple'].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Is this a simple, composition or intersection question: {qtype}.""".format(
                init_q=decomp['question_text'],
                qtype=decomp['question_type'],
            ))
            i += 1
        
    for qtype in ['wikidata_comp', 'wikidata_intersection']:
        i = 0
        for qtid, decomp in qdecompose[qtype].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Is this a simple, composition or intersection question: {qtype}.""".format(
                init_q=all_qdata[qtid]['question_text'],
                qtype=decomp['question_type'],
                subqs1=decomp['subquestions'][0],
                subqs2=decomp['subquestions'][1],
                answer_list=", ".join(list(set([a['answer_text'] for a in all_qdata[qtid]['answer_list']]))[:5]),
                
            ))
            i += 1

    if shuffle:
        random.shuffle(prompt_list)
    for p in prompt_list:
        print(p)

In [101]:
def qdata_to_print_prompt_v3(all_qdata, qdecompose, num_each=3, include_simple=False, shuffle=True):
    prompt_list = []
    if include_simple:
        i = 0 
        for qtid, decomp in qdecompose['wikidata_simple'].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Can this be decomposed: No.""".format(
                init_q=decomp['question_text'],
            ))
            i += 1
        
    for qtype in ['wikidata_comp', 'wikidata_intersection']:
        i = 0
        for qtid, decomp in qdecompose[qtype].items():
            if i == num_each:
                break
            prompt_list.append("""
Question: {init_q}
Question Type: {qtype}.
Question 1: {subqs1}
Question 2: {subqs2}""".format(
                init_q=all_qdata[qtid]['question_text'],
                qtype=decomp['question_type'],
                subqs1=decomp['subquestions'][0],
                subqs2=decomp['subquestions'][1],
                answer_list=", ".join(list(set([a['answer_text'] for a in all_qdata[qtid]['answer_list']]))[:5]),
                
            ))
            i += 1

    if shuffle:
        random.shuffle(prompt_list)
    for p in prompt_list:
        print(p)

In [102]:
qdata_to_print_prompt_v3(qmp_train, official_train_decomp, num_each=5, include_simple=False, shuffle=True)


Question: Which film has M. G. Ramachandran as a member of its cast and has J. Jayalalithaa as a member of its cast
Question Type: intersection.
Question 1: Which film has M. G. Ramachandran as a member of its cast?
Question 2: Which film has J. Jayalalithaa as a member of its cast?

Question: Which film has Anna Neagle as a member of its cast and was directed by Herbert Wilcox?
Question Type: intersection.
Question 1: Which film has Anna Neagle as a member of its cast?
Question 2: Which film was directed by Herbert Wilcox?

Question: Who are the cast members of film that  had Stephen Sommers as screenwriter?
Question Type: composition.
Question 1: What films had Stephen Sommers as screenwriter?
Question 2: Who are the cast members of [ANS1]?

Question: Which film has Ilaiyaraaja as performer and was directed by G. N. Rangarajan?
Question Type: intersection.
Question 1: Which film has Ilaiyaraaja as performer?
Question 2: Which film was directed by G. N. Rangarajan?

Question: Which f

In [94]:
qdata_to_print_prompt_v2(qmp_train, official_train_decomp, num_each=3, include_simple=True, shuffle=True)


Question: What are the dates of birth of persons that were influenced by Ion Luca Caragiale?
Is this a simple, composition or intersection question: composition.

Question: What are the locations of buildings that were designed by John S. Van Bergen?
Is this a simple, composition or intersection question: composition.

Question: Which software, art, etc. has Sean Maguire as performer?
Is this a simple, composition or intersection question: simple.

Question: What are the dates of death of persons that were a member of the political party Australian Labor Party (Anti-Communist)?
Is this a simple, composition or intersection question: composition.

Question: Which film has Ilaiyaraaja as performer and was directed by G. N. Rangarajan?
Is this a simple, composition or intersection question: intersection.

Question: Which spatial entity is located in Gmina Skąpe?
Is this a simple, composition or intersection question: simple.

Question: Which film has Anna Neagle as a member of its cast a

In [15]:
for k, ilist in dev_inds_to_use.items():
    #if k != "wikidata_comp":
    #if k != "wikidata_intersection":
    if k != "wikidata_simple":
        continue
    print(k)
    print()
    for i in ilist:
        print("Question:", qmp_dev[i]['question_text'])
        #print("Can this be decomposed:")
        print("Question Type:")
        print()
        #print(i, qmp_dev[i]['question_text'])
        #input()
    print()

wikidata_simple

Question: Who is a member of the Pakistan Air Force?
Question Type:

Question: What ships were Type VII submarines?
Question Type:

Question: Martha Graham choreographed what dance?
Question Type:

Question: What buildings are owned and maintained by Din l-Art Ħelwa?
Question Type:

Question: What is the name of an athlete drafted by the Oakland Athletics?
Question Type:

Question: What written work did Jonathan Strahan edit?
Question Type:

Question: Martha Graham choreographed what dance?
Question Type:

Question: What vehicle was constructed by J-BUS?
Question Type:

Question: Who was Mayor of Boston?
Question Type:

Question: Who is a member of the Union Theological Seminary staff member?
Question Type:

Question: What is the name of the people who graduated from Ypsilanti High School?
Question Type:

Question: Which natural wonder can be found within the Florida Keys National Marine Sanctuary?
Question Type:

Question: Which musical scores did Albert Von Tilzer co

In [79]:
# Is this really a simple question?
qmp_dev[76]

{'entities': [{'entity_url': 'https://en.wikipedia.org/wiki/Martha_Graham',
   'entity_text': 'Martha Graham',
   'aliases': ['Martha Graham', 'Martha Grehem']}],
 'question_text': 'Martha Graham choreographed what dance?',
 'answer_list': [{'answer_text': 'Appalachian Spring',
   'aid': '707__wikidata_simple__dev__0',
   'aliases': ['Appalachian Spring'],
   'answer_url': 'https://en.wikipedia.org/wiki/Appalachian_Spring',
   'proof': [{'proof_text': 'appalachian spring is a musical composition by aaron copland that was premiered in 1944 and has achieved widespread and enduring popularity as an orchestral suite. the music, scored for a thirteen-member chamber orchestra, was created upon commission of the choreographer and dancer martha graham with funds from the coolidge foundation.',
     'found_in_url': 'https://en.wikipedia.org/wiki/Appalachian_Spring',
     'pid': '707__wikidata_simple__dev__0__0'}]},
  {'answer_text': 'Judith',
   'aid': '707__wikidata_simple__dev__1',
   'aliase

## Explore Prompt Formats Using Dev Data

**Directly Use 4-shot Self-Ask Prompt**

In [None]:
# Taken from: https://github.com/ofirpress/self-ask/blob/main/self-ask_plus_search-engine_demo.ipynb
base_prompt = ['''Question: Who lived longer, Muhammad Ali or Alan Turing?
Are follow up questions needed here: Yes.
Follow up: How old was Muhammad Ali when he died?
Intermediate answer: Muhammad Ali was 74 years old when he died.
Follow up: How old was Alan Turing when he died?
Intermediate answer: Alan Turing was 41 years old when he died.
So the final answer is: Muhammad Ali 

Question: When was the founder of craigslist born?
Are follow up questions needed here: Yes.
Follow up: Who was the founder of craigslist?
Intermediate answer: Craigslist was founded by Craig Newmark.
Follow up: When was Craig Newmark born?
Intermediate answer: Craig Newmark was born on December 6, 1952.
So the final answer is: December 6, 1952

Question: Who was the maternal grandfather of George Washington?
Are follow up questions needed here: Yes.
Follow up: Who was the mother of George Washington?
Intermediate answer: The mother of George Washington was Mary Ball Washington.
Follow up: Who was the father of Mary Ball Washington?
Intermediate answer: The father of Mary Ball Washington was Joseph Ball.
So the final answer is: Joseph Ball 

Question: Are both the directors of Jaws and Casino Royale from the same country? 
Are follow up questions needed here: Yes. 
Follow up: Who is the director of Jaws? 
Intermediate Answer: The director of Jaws is Steven Spielberg. 
Follow up: Where is Steven Spielberg from? 
Intermediate Answer: The United States. 
Follow up: Who is the director of Casino Royale? 
Intermediate Answer: The director of Casino Royale is Martin Campbell. 
Follow up: Where is Martin Campbell from? 
Intermediate Answer: New Zealand. 
So the final answer is: No

Question: ''', 
'''
Are follow up questions needed here:''', ]

In [None]:
# Also taken from: https://github.com/ofirpress/self-ask/blob/main/self-ask_plus_search-engine_demo.ipynb
# But then modified

#def promptf(question, prompt, intermediate = "\nIntermediate answer:", followup = "Follow up:", finalans= '\nSo the final answer is:'):
INTERMEDIATE = "\nIntermediate answer:"
FOLLOWUP = "Follow up:"
FINALANS = "\nSo the final answer is:"
def printprompt(qid, devqs, prompt):
    question = devqs[qid]['question_text']
    cur_prompt = prompt[0] +  question + prompt[1]

    print(cur_prompt, end ='')

    """
    ret_text = call_gpt(cur_prompt, intermediate)

    while followup in get_last_line(ret_text):

      
      cur_prompt += ret_text
      question = extract_question(ret_text)
      external_answer = get_answer(question)

      if external_answer is not None:
        cur_prompt += intermediate + ' ' + external_answer + '.'
        print(intermediate + ' ' + yellowfy(external_answer) + '.', end='' )
        ret_text = call_gpt(cur_prompt, intermediate)
      else:
        #We only get here in the very rare case that Google returns no answer.
        cur_prompt += intermediate
        print(intermediate + ' ')
        gpt_answer = call_gpt(cur_prompt, ['\n'+followup, finalans])
        cur_prompt += gpt_answer

    
    if finalans not in ret_text:
      cur_prompt += finalans
      print(finalans, end = '')
      ret_text = call_gpt(cur_prompt, '\n')

    return cur_prompt + ret_text
    """

In [None]:
printprompt(0, qmp_comp_devd, base_prompt)

**Now Choose Subset to Try and Try Them**

In [None]:
for i in range(30):
    offset = 200
    if "wikidata_comp" not in qmp_comp_devd[i+offset]['qid']:
        continue
    print(i+offset, qmp_comp_devd[i+offset]['question_text'])

In [None]:
inds_to_try = [
    0,  # 0. screenwriter and director
    3,  # 1. where taught
    10, # 2. competed and won
    27, # 3. film steinbeck wrote
    32, # 4. institiution where educated
    44, # 5. graduated from two places
    59, # 6. which one did peerson win
    232,# 7. company produced wirtten
    255,# 8. composer for movie produced
    347,# 9. objects person designed depicted what
]

In [None]:
for i in inds_to_try:
    print(qmp_comp_devd[i]['question_text'])

In [None]:
test_ind = 0
print(qmp_comp_devd[inds_to_try[test_ind]]['question_text'])
print([a['answer_text'] for a in qmp_comp_devd[inds_to_try[test_ind]]['answer_list']])

print()

printprompt(inds_to_try[0], qmp_comp_devd, base_prompt)

In [6]:
inds_to_type = {
    "composition": [228, 229, 230, 231, 23, 24, 27, 28, 50, 51, 53, 54, 57],
    "intersection": [222, 226, 20, 21, 22, 25, 26, 55, 56, 58],
    "filter": [223, 225, 227, 29, 52, 59]
}

**Make a better prompt**

In [7]:
# Can we make a prompt out of these?
for ii in sorted(inds_to_type['filter']):#range(10):
    i = ii + 0
    print(i, qmp_comp_devd[i]['qid'])
    print(qmp_comp_devd[i]['question_text'])
    print([a['answer_text'] for a in qmp_comp_devd[i]['answer_list']])
    print()

29 374__wikidata_intersection__dev
Which PGA Champsionship did Jack Nicklaus win?
['1980 PGA Championship', '1975 PGA Championship', '1971 PGA Championship', '1973 PGA Championship', '1963 PGA Championship']

52 418__wikidata_intersection__dev
Which Monaco Grand Prix was won by Michael Schumacher?
['1999 Monaco Grand Prix', '1994 Monaco Grand Prix', '1997 Monaco Grand Prix', '2001 Monaco Grand Prix', '1995 Monaco Grand Prix']

59 103__wikidata_intersection__dev
Which Monaco Grand Prix did Ayrton Senna win?
['1993 Monaco Grand Prix', '1989 Monaco Grand Prix', '1987 Monaco Grand Prix', '1990 Monaco Grand Prix', '1991 Monaco Grand Prix', '1992 Monaco Grand Prix']

223 129__wikidata_intersection__dev
In which FA Cup Final did Blackburn Rovers F.C. compete?
['1882 FA Cup Final', '1960 FA Cup Final', '1891 FA Cup Final', '1890 FA Cup Final', '1884 FA Cup Final', '1885 FA Cup Final', '1928 FA Cup Final', '1886 FA Cup Final']

225 329__wikidata_intersection__dev
Which Super Bowl did the San Fr

In [54]:
"""
[{
        'question_text': '',
        'question_type': '',
        'subquestions': [
            '',
            '',
        ]
    }],
"""
decomposed_qs_intersection = {
    0: [{
        'question_text': 'Harmony Korine was both screenwriter and director of what movie?',
        'question_type': 'intersection',
        'subquestions': [
            'Harmony Korine was the screenwriter of what movie?',
            'Harmony Korine was the director of what movie?',
        ]
    }],
    22: [{ # intersection
        "question_text": "Who was both a graduate from Ananda College and University of Ceylon?",
        "question_type": "intersection",
        "subquestions": [
            "Who graduated from Ananda College?",
            "Who graduated from University of Ceylon?",
        ],
    }],
    25: [{
        'question_text': 'Which movie had K. S. L. Swamy as its director and Vijaya Bhaskar as its musical composer?',
        'question_type': 'intersection',
        'subquestions': [
            'Which movie has K. S. L. Swamy as its director?',
            'Which movie has Vijaya Bhaskar as its musical composer?',
        ]
    }],
    29: [{
        "question_text": "Which PGA Champsionship did Jack Nicklaus win?",
        "question_type": "intersection",
        "subquestions": [
            "Which PGA Champsionship have been played?",
            "What games has Jack Nicklaus won?"
        ]
    }],
    90: [{ # intersection (but I see filter too)
        "question_text": "What Superbowls did the Washington Football Team play in?",
        "question_type": "intersection",
        "subquestions": [
            "Which Superbowls were played?",
            "Which games did the Washington Football Team play in?"
        ]
    }],
    95: [{
        'question_text': 'What music was composed by Devi Sri Prasad and produced by Dil Raju?',
        'question_type': 'intersection',
        'subquestions': [
            'What music was composed by Devi Sri Prasad?',
            'What music was produced by Dil Raju?',
        ]
    }],
    380: [{ # intersection
        "question_text": "Which competition had Vitória F.C. and S.L. Benfica as participants?",
        "question_type": "intersection",
        "subquestions": [
            "Which competition had Vitória F.C. as a participant?",
            "Which competition had S.L. Benfica as a participant?",

        ],
    }],
    385: [{
        'question_text': 'What movie did Irwin Allen both direct and produce?',
        'question_type': 'intersection',
        'subquestions': [
            'What movie did Irwin Allen direct?',
            'What movie did Irwin Allen produce?',
        ]
    }],
}

"""
[{
            'question_text': '',
            'question_type': '',
            'subquestions': [
                '',
                '',
            ]
        }],
"""
decomposed_qs_composition = {
    11: [{
        'question_text': 'Joe Pasternak produced a motion picture that was directed by who?',
        'question_type': 'composition',
        'subquestions': [
            'What motion pictures did Joe Pasternak produce?',
            'Who directed it?',
        ]
    }],
    110: [{
        'question_text': 'Who directed the TV show that Tom Kauffman worked on as a screenwriter?',
        'question_type': 'composition',
        'subquestions': [
            'What TV shows did Tom Kauffman work on as a screenwriter?',
            'Who directed it?',
        ]
    }],
    120: [{
        'question_text': 'Where did former Bishops of Warrington go to school?',
        'question_type': 'composition',
        'subquestions': [
            'Who were the former Bishops of Warrington?',
            'Where did they go to school?',
        ]
    }],
    150: [{ # compositional
        "question_text": "Who was credited as director for a movie penned by Peter Baynham?",
        "question_type": "composition",
        "subquestions": [
            "What movies were penned by Peter Baynham?",
            "Who was credited the director for it?"
        ],
    }],
    221: [{
        'question_text': 'Where did an employee of University of Pennsylvania Law School receive their education?',
        'question_type': 'composition',
        'subquestions': [
            'Who was an employee of University of Pennsylvania Law School?',
            'Where did they receive their education?',
        ]
    }],
}

# many of these are intersections that are better answered as filters
decomposed_qs_filter = {
    29: [{
        "question_text": "Which PGA Champsionship did Jack Nicklaus win?",
        "question_type": "filter",
        "subquestions": [
            "Which PGA Champsionship have been played?",
            "Did Jack Nicklaus win it?"
        ]
    }],
    52: [{
        "question_text": "Which Monaco Grand Prix was won by Michael Schumacher?",
        "question_type": "filter",
        "subquestions": [
            "Which Monaco Grand Prix have been held?",
            "Did Michael Schumacher win it?"
        ]
    }],
    59: [{
        "question_text": "Which Monaco Grand Prix did Ayrton Senna win?",
        "question_type": "filter",
        "subquestions": [
            "Which Monaco Grand Prix have been held?",
            "Did Ayrton Senna win it?"
        ]
    }],
    90 : [{ # intersection (but I see filter too)
        "question_text": "What Superbowls did the Washington Football Team play in?",
        "question_type": "filter",
        "subquestions": [
            "Which Superbowls have been played?",
            "Did the Washington Football Team play in it?"
        ]
    }],
    155: [{
        'question_text': 'Which FA Cup Final featured the Tottenham Hotspurs as competitors?',
        'question_type': 'filter',
        'subquestions': [
            'Which FA Cup Finals were played?',
            'Did it feature the Tottenham Hotsurs as a competitor?',
        ]
    }],
    223: [{
        "question_text": "In which FA Cup Final did Blackburn Rovers F.C. compete?",
        "question_type": "filter",
        "subquestions": [
            "Which FA Cup Finals have been played?",
            "Did Blackburn Rovers F.C. compete in it?"
        ]
    }],
}

In [None]:
qmp_comp_devd[155]['qid']

In [50]:
# Then, lets focus on pure intersections for now as the first trial q is a pure intersection
"""
Question: Who lived longer, Muhammad Ali or Alan Turing?
Are follow up questions needed here: Yes.
Follow up: How old was Muhammad Ali when he died?
Intermediate answer: Muhammad Ali was 74 years old when he died.
Follow up: How old was Alan Turing when he died?
Intermediate answer: Alan Turing was 41 years old when he died.
So the final answer is: Muhammad Ali 
"""
#Is this a composition, filter or intersection question: {qtype}

def qdata_to_prompt(qdata, qdecompose):
    #Question 2: {subqs2}""".format(
    return """
Question: {init_q}
Can this be decomposed: Yes.
Is this a composition or intersection question: {qtype}.
Question 1: {subqs1}
Question 2: {subqs2}
So the final answers are: {answer_list}.""".format(
        init_q=qdata['question_text'],
        qtype=qdecompose['question_type'],
        subqs1=qdecompose['subquestions'][0],
        subqs2=qdecompose['subquestions'][1],
        answer_list=", ".join(list(set([a['answer_text'] for a in qdata['answer_list']]))[:5]),
    )

In [None]:
qind = 22
print(qdata_to_prompt(qmp_comp_devd[qind], decomposed_qs_intersection[qind][0]))

In [None]:
# Intersection Prompt
test_qind = 385
for qind in [
    0, 
    22, 
    25, 
    95,
    385
]:
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], decomposed_qs_intersection[qind][0]))
print()
print("Question: " + qmp_comp_devd[test_qind]['question_text'])
print("Can this be decomposed:")

In [None]:
# Composition Prompt
test_qind = 11
for qind in [
    11, 
    110, 
    120, 
    150,
    221
]:
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], decomposed_qs_composition[qind][0]))
print()
print("Question: " + qmp_comp_devd[test_qind]['question_text'])
print("Can this be decomposed:")

In [None]:
# Combined: get q type too
test_qind = 25
curr_qs_comp = {**decomposed_qs_composition, **decomposed_qs_intersection}
for qind in [
    0, 
    11, 
    110, 
    22, 
    120, 
    25, 
]:
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], curr_qs_comp[qind][0]))
print()
print("Question: " + qmp_comp_devd[test_qind]['question_text'])
print("Can this be decomposed:")

In [31]:
# Filter prompt
test_qind = 223
curr_qs_comp = {**decomposed_qs_composition, **decomposed_qs_intersection, **decomposed_qs_filter}
for qind in [
    29,
    52,
    90,
    155,
    223,
]:
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], curr_qs_comp[qind][0]))
print()
print("Question: " + qmp_comp_devd[test_qind]['question_text'])
print("Can this be decomposed:")


Question: Which PGA Champsionship did Jack Nicklaus win?
Can this be decomposed: Yes.
Is this a composition, filter or intersection question: filter.
Question 1: Which PGA Champsionship have been played?
Question 2: Did Jack Nicklaus win it?
So the final answers are: 1980 PGA Championship, 1971 PGA Championship, 1975 PGA Championship, 1963 PGA Championship, 1973 PGA Championship.

Question: Which Monaco Grand Prix was won by Michael Schumacher?
Can this be decomposed: Yes.
Is this a composition, filter or intersection question: filter.
Question 1: Which Monaco Grand Prix have been held?
Question 2: Did Michael Schumacher win it?
So the final answers are: 1999 Monaco Grand Prix, 1997 Monaco Grand Prix, 1994 Monaco Grand Prix, 2001 Monaco Grand Prix, 1995 Monaco Grand Prix.

Question: What Superbowls did the Washington Football Team play in?
Can this be decomposed: Yes.
Is this a composition, filter or intersection question: filter.
Question 1: Which Superbowls have been played?
Questio

In [55]:
# Combined: get q type too
test_qind = 40
curr_qs_comp = {**decomposed_qs_composition, **decomposed_qs_intersection}#, **decomposed_qs_filter}
for qind in sorted([
    # Intersection
    0, 
    22, 
    29, # FILTER
    #25, 
    95,
    385,
    # Composition
    11, 
    110, 
    120, 
    150,
    221,
    # Filter
    #29,
    #52,
    #90,
    #155,
    #223,
]):
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], curr_qs_comp[qind][0]))
    
for test_qind in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 52, 74, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 184, 205, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 360, 361, 383]:
    #test_qind = i+40
    #if "int" not in qmp_comp_devd[test_qind]['qid']:
    #    continue
    print()
    print(test_qind)
    print("Question: " + qmp_comp_devd[test_qind]['question_text'])
    print("Can this be decomposed:")


Question: Harmony Korine was both screenwriter and director of what movie?
Can this be decomposed: Yes.
Is this a composition or intersection question: intersection.
Question 1: Harmony Korine was the screenwriter of what movie?
Question 2: Harmony Korine was the director of what movie?
So the final answers are: Gummo, The Beach Bum, Mister Lonely, Spring Breakers, Julien Donkey-Boy.

Question: Joe Pasternak produced a motion picture that was directed by who?
Can this be decomposed: Yes.
Is this a composition or intersection question: composition.
Question 1: What motion pictures did Joe Pasternak produce?
Question 2: Who directed it?
So the final answers are: Richard Wallace, Erich Schönfelder, Norman Taurog, Charles Walters, Richard Thorpe.

Question: Who was both a graduate from Ananda College and University of Ceylon?
Can this be decomposed: Yes.
Is this a composition or intersection question: intersection.
Question 1: Who graduated from Ananda College?
Question 2: Who graduated f

In [None]:
test_qind = 221
for qind in [
    11, 
    110, 
    120, 
    150,
    221
]:
    if qind == test_qind:
        continue
    print(qdata_to_prompt(qmp_comp_devd[qind], decomposed_qs_composition[qind][0]))
print()
print("Question: " + qmp_comp_devd[test_qind]['question_text'])
print("Can this be decomposed:")

In [None]:
def comp_ans(gta_str, preda_str):
    gta = list(set([a.replace(' ', '').lower() for a in gta_str.split(", ")]))
    preda = set(list([a.replace(' ', '').lower() for a in preda_str.split(", ")]))
    print("Num pred answers:", len(preda))
    num_in = 0
    for a in gta:
        if a in preda:
            num_in += 1
            
    print(f"Num GT pred: {num_in} / {len(gta)}")
    print(f"Num pred not GT: {len(preda) - num_in}")

In [None]:
comp_ans(
    gta_str="University of Pennsylvania, University of Oklahoma, New College of Florida, University of Pennsylvania Law School, University of Toronto, Alfred University, Southwestern College, University of Pennsylvania, Stanford Law School, Merton College, St Antony's College, Harvard Law School, Harvard University, Yale University, Reading High School, Smith College, Harvard University, Yale Law School, University of Pennsylvania Law School, University of Chicago Law School, University of Pennsylvania Law School, Lower Merion High School, Massachusetts Institute of Technology, Islamic Azad University, University of Pennsylvania Law School, Northwestern University School of Law, Harvard University, University of California, Harvard Law School, University of North Carolina at Chapel Hill, University of Chicago, Yale Law School, University of Pennsylvania Law School, Yale Law School, Princeton University, Somerville College, Cornell University, University of Oklahoma, University of Michigan Law School, University of Pennsylvania Law School, University of Michigan, University of Pennsylvania Law School, University of Pennsylvania Law School",
    preda_str="Stanford University, Harvard University, Yale University, University of California, Berkeley, University of Virginia, Columbia University, University of Michigan, Duke University, University of Pennsylvania, Georgetown University, Cornell University, Northwestern University",
)