In [1]:
import json
import sh
import os

from pyserini.search.lucene import LuceneSearcher

In [4]:
# Root of the downloaded DPR data; every dataset directory hangs off it.
base_path = "/scratch/ddr8143/repos/DPR/downloads/data"

# One sub-directory per dataset, joined with "/" like the rest of the notebook.
ambigqa_path = "/".join([base_path, "ambigqa"])
ambigqa_light_path = "/".join([base_path, "ambigqa_light"])
nq_path = "/".join([base_path, "retriever"])
qp_path = "/".join([base_path, "qampari"])

In [5]:
# Directory that holds the BM25 run files.
bm25_out_path = "/scratch/ddr8143/repos/pyserini/runs"


def bm25_out_name(dataset, split, hits):
    """Build the path of a BM25 run file.

    Args:
        dataset: Dataset name embedded in the file name (e.g. "ambigqa_light").
        split: Split name embedded in the file name (e.g. "dev").
        hits: Hit count embedded in the file name after the "h" prefix.

    Returns:
        The string "<bm25_out_path>/bm25.<dataset>.<split>.h<hits>.json".
    """
    return f"{bm25_out_path}/bm25.{dataset}.{split}.h{hits}.json"

In [6]:
# Sanity check: render the run-file path for the ambigqa_light dev split at 100 hits.
bm25_out_name("ambigqa_light", "dev", 100)

'/scratch/ddr8143/repos/pyserini/runs/bm25.ambigqa_light.dev.h100.json'

In [7]:
# Goal 1: Inspect the context annotations

In [8]:
def count_no_positive(dataset_path, hits):
    """Print one table row counting examples that have no positive context.

    If a companion file named ``<dataset_path>__w_no_pos`` exists it is read
    instead of ``dataset_path`` itself. The file must contain a JSON list of
    records, each with a ``"positive_ctxs"`` list.

    Args:
        dataset_path: Path of the JSON run file to inspect.
        hits: Hit count used only as the row label (right-aligned, width 4).

    Returns:
        None. The row ``"<hits> | <no_pos>/<total> (<pct>%)"`` is printed.
    """
    no_pos_path = f"{dataset_path}__w_no_pos"
    load_path = no_pos_path if os.path.exists(no_pos_path) else dataset_path
    # Context manager so the file handle is closed as soon as parsing is done.
    with open(load_path) as dataset_file:
        dataset = json.load(dataset_file)
    total_len = len(dataset)
    # Count directly instead of materialising a list only to take its length.
    no_pos_len = sum(1 for d in dataset if len(d["positive_ctxs"]) == 0)
    # An empty dataset reports 0.00% rather than dividing by zero.
    no_pos_pct = no_pos_len * 100 / total_len if total_len else 0.0
    print(f"{hits:4} | {no_pos_len}/{total_len} ({no_pos_pct:0.2f}%)")

In [9]:
# Table header, then one row per retrieval depth for the ambigqa_light dev split.
print("Hits | No Positive Contexts Retrieved")
print("---- | ------------------------------")
for hits in (100, 400, 1000):
    count_no_positive(bm25_out_name("ambigqa_light", "dev", hits), hits)

Hits | No Positive Contexts Retrieved
---- | ------------------------------
 100 | 319/2002 (15.93%)
 400 | 221/2002 (11.04%)
1000 | 170/2002 (8.49%)


In [10]:
# Load the 100-hit BM25 run for the ambigqa_light dev split; the context
# manager closes the file handle once parsing is done.
with open(bm25_out_name("ambigqa_light", "dev", 100)) as dev_file:
    dev_dataset = json.load(dev_file)

In [11]:
# Peek at the first record of the loaded dev run to see its structure.
dev_dataset[0]

{'question': 'Who plays the doctor in dexter season 1?',
 'answers': ['Goldwyn', 'Tony Goldwyn'],
 'positive_ctxs': [{'docid': '13691534',
   'score': '9.432925',
   'text': '"Dexter (season 1)"\nKiller case closed, though Doakes keeps close tabs on Dexter, still suspicious of his actions. Rita discovers evidence that Dexter may have set up Paul. The series pilot was developed by James Manos, Jr. based on Jeff Lindsay\'s novel. Manos served as an executive producer for the pilot along with John Goldwyn and Sara Colleton. The pilot was produced by Dennis Bishop. Steven Brown also served as a producer for the pilot episode. Chad Tomasoski worked as an associate producer. The pilot was directed by Michael Cuesta. Manos, Goldwyn and Colleton returned as executive producers for the first season. Mid-season',
   'has_answer': True}],
 'hard_negative_ctxs': [{'docid': '2499310',
   'score': '12.204408',
   'text': '"Denise Crosby"\n""Key West"". She also appeared in two episodes of the cable 

In [4]:
# Show the ambigqa_light directory path, then list its contents with `sh.ls`.
print(ambigqa_light_path)
sh.ls(ambigqa_light_path)

/scratch/ddr8143/repos/DPR/downloads/data/ambigqa_light


dev.json  train.json

In [5]:
# Load the ambigqa_light training split; the context manager closes the file
# handle once parsing is done.
train_dataset_path = '/'.join([ambigqa_light_path, "train.json"])
with open(train_dataset_path) as train_file:
    train_dataset = json.load(train_file)

In [23]:
# Training examples that carry more than one annotation.
strange_ds = [
    example for example in train_dataset if len(example["annotations"]) > 1
]

In [24]:
# Number of training examples that have more than one annotation.
len(strange_ds)

149

In [29]:
# Inspect one multi-annotation example in full.
strange_ds[2]

{'annotations': [{'type': 'singleAnswer', 'answer': ['New Delhi']},
  {'type': 'singleAnswer',
   'answer': ['New Delhi', 'New Delhi, India', 'Delhi']}],
 'id': '3978528412752837293',
 'question': "India's first ever all india institute of ayurveda has come up in which city?"}

In [33]:
# For the first ten multi-annotation questions, pool the answers from every
# annotation and show them both raw and de-duplicated.
for d in strange_ds[:10]:
    answers = []
    for annotation in d['annotations']:
        if annotation['type'] != 'multipleQAs':
            # Any other annotation type is read from its top-level 'answer' list.
            answers.extend(annotation['answer'])
            continue
        # multipleQAs annotations nest their answers under each QA pair.
        for qa_pair in annotation['qaPairs']:
            answers.extend(qa_pair['answer'])
    print(d["question"])
    print("       A:", answers)
    print("       A:", set(answers))

Dogri language is spoken in which state of india?
       A: ['Jammu and Kashmir, Himachal Pradesh, Punjab', 'Jammu and Kashmir, Himachal Pradesh, Punjab']
       A: {'Jammu and Kashmir, Himachal Pradesh, Punjab'}
Who plays granny on once upon a time?
       A: ['Beverley Elliott', 'Elliott', 'Beverley Elliott']
       A: {'Beverley Elliott', 'Elliott'}
India's first ever all india institute of ayurveda has come up in which city?
       A: ['New Delhi', 'New Delhi', 'New Delhi, India', 'Delhi']
       A: {'Delhi', 'New Delhi, India', 'New Delhi'}
Type of epithelial tissue containing cells that can change shapes as the tissue stretches?
       A: ['Transitional epithelium', 'a type of stratified epithelium', 'Transitional epithelium']
       A: {'a type of stratified epithelium', 'Transitional epithelium'}
When did the ranch season 2 come out?
       A: ['June 16, 2017', 'December 15, 2017', 'June 16, 2017', '2017', 'December 15, 2017', '2017']
       A: {'2017', 'December 15, 2017', 'Ju

In [25]:
# d1: the first training example that has more than one annotation.
d1 = strange_ds[0]

In [26]:
# Display the full d1 record.
d1

{'annotations': [{'type': 'singleAnswer',
   'answer': ['Jammu and Kashmir, Himachal Pradesh, Punjab']},
  {'type': 'singleAnswer',
   'answer': ['Jammu and Kashmir, Himachal Pradesh, Punjab']}],
 'id': '-52876016653618605',
 'question': 'Dogri language is spoken in which state of india?'}

In [13]:
# d2: the training example at index 1, kept as a second sample record.
d2 = train_dataset[1]

In [14]:
# Display the full d2 record.
d2

{'annotations': [{'type': 'singleAnswer', 'answer': ['David Morse']}],
 'id': '4790842463458965203',
 'question': 'Who played george washington in the john adams series?'}

In [20]:
# Scratch check: build a new dict with an extra "answer" key followed by the fields of d2.
{"answer": ['muf', 'in'], **d2}

{'answer': ['muf', 'in'],
 'annotations': [{'type': 'singleAnswer', 'answer': ['David Morse']}],
 'id': '4790842463458965203',
 'question': 'Who played george washington in the john adams series?'}

In [22]:
"muffins.json".endswith('.json')

True

In [19]:
# Flatten the answers of each single-annotation example and print them next
# to its question.
# NOTE(review): d1 is assigned from strange_ds[0] elsewhere in this notebook,
# i.e. from the examples with more than one annotation, so on a fresh
# top-to-bottom run this cell is expected to raise on d1 — confirm which d1
# was intended.
for d in [d1, d2]:
    answers = []
    annotations = d['annotations']
    if len(annotations) != 1:
        # Exactly one annotation is supported; zero is as invalid as several,
        # so the message reports the actual count instead of "too many".
        raise ValueError(
            f"Expected exactly 1 annotation, found {len(annotations)}! "
            "Is this a test set? You haven't thought about how to handle test sets yet..."
        )
    annotation = annotations[0]
    if annotation['type'] == 'multipleQAs':
        # multipleQAs annotations nest their answers under each QA pair.
        for qap in annotation['qaPairs']:
            answers.extend(qap['answer'])
    else:
        answers.extend(annotation['answer'])
    print(d["question"], "A:", answers)

When did the simpsons first air on television? A: ['April 19, 1987', 'December 17, 1989']
Who played george washington in the john adams series? A: ['David Morse']


In [None]:
# Show the answers list left over from the most recent loop iteration.
answers

In [8]:
# Question of whichever example `d` was last bound to by a preceding loop;
# the value therefore depends on the order the cells were executed in.
d["question"]

'When did the simpsons first air on television?'

In [None]:
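# Goal 2: ??? (TODO: name this goal; the cells below inspect positive and hard-negative contexts in the post-processed BM25 train run)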

In [34]:
# Path of the JSON run file inspected below (presumably the post-processed
# BM25 output for the ambigqa_light train split, going by the name — confirm).
postprocessed = "/scratch/ddr8143/repos/pyserini/runs/run.dpr.ambigqa_light.train.bm25.json"

In [35]:
# Parse the post-processed run; the context manager closes the file handle
# once parsing is done.
with open(postprocessed) as pp_file:
    pp_data = json.load(pp_file)

In [38]:
# Keys of the post-processed run, in the mapping's iteration order.
ppk = list(pp_data)

In [41]:
# Contexts retrieved for the first entry of the post-processed run.
first_contexts = pp_data[ppk[0]]['contexts']

# Every context flagged as containing an answer.
positive_contexts = [ctx for ctx in first_contexts if ctx['has_answer']]

# The first context without an answer, or None when every context has one.
hard_negative_context = next(
    (ctx for ctx in first_contexts if not ctx['has_answer']),
    None,
)

In [44]:
# Question text of the first entry in the post-processed run.
pp_data[ppk[0]]['question']

'When did the simpsons first air on television?'

In [45]:
# Answers stored for the first entry in the post-processed run.
pp_data[ppk[0]]['answers']

['April 19, 1987', 'December 17, 1989']

In [42]:
# Contexts of the first entry whose 'has_answer' flag is truthy.
positive_contexts

[{'docid': '4540208',
  'score': '11.211435',
  'text': '"The Simpsons shorts"\nThe Simpsons shorts The Simpsons shorts are an American animated TV series of 48 one-minute shorts that ran on the variety television program ""The Tracey Ullman Show"" for three seasons, before the characters spun off into ""The Simpsons"", their own half-hour prime-time show. It features Homer, Marge, Bart, Lisa, and Maggie. The series was created by Matt Groening, who designed the Simpson family and wrote many of the shorts. The shorts first aired on April 19, 1987 starting with ""Good Night"". The final short to air was ""TV Simpsons"", originally airing on May 14, 1989. ""The Simpsons"" later debuted on',
  'has_answer': True},
 {'docid': '9852167',
  'score': '11.157161',
  'text': '"Good Night (The Simpsons)"\nGood Night (The Simpsons) ""Good Night"" (also known as ""Good Night Simpsons"") is the first of forty-eight Simpsons shorts that appeared on the variety show ""The Tracey Ullman Show"". It ori

In [43]:
# First context of the first entry whose 'has_answer' flag is falsy (None if there is none).
hard_negative_context

{'docid': '4949996',
 'score': '12.078153',
 'text': '"Helter Shelter (The Simpsons)"\nHelter Shelter (The Simpsons) ""Helter Shelter"" is the fifth episode of the fourteenth season of the American animated television sitcom ""The Simpsons"". It originally aired on the Fox network in the United States on December 1, 2002. In the episode, the Simpson family has to find temporary residence while their house is fumigated for termites. When they run out of options, they decide to become contestants on a reality show where families live in the manner that people did in 1895. The family is initially miserable, but slowly adapt to their new life, which causes the show to lose ratings.',
 'has_answer': False}