## Load Checkpoint

In [3]:
import transformers
import torch
import sh

In [4]:
sh.ls("/scratch/ddr8143/repos/DPR/downloads/checkpoint/retriever/single/nq/bert-base-encoder.cp")

/scratch/ddr8143/repos/DPR/downloads/checkpoint/retriever/single/nq/bert-base-encoder.cp

In [6]:
# Read the serialized checkpoint from disk, mapping all tensors onto the CPU
# (map_location) so no GPU is needed for inspection. Despite its name, the result
# is a dict of checkpoint fields (see the keys listed in the next cell), not a model.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints obtained from a trusted source.
the_model = torch.load("/scratch/ddr8143/repos/DPR/downloads/checkpoint/retriever/single/nq/bert-base-encoder.cp", map_location=torch.device('cpu'))

In [7]:
the_model.keys()

odict_keys(['model_dict', 'optimizer_dict', 'scheduler_dict', 'offset', 'epoch', 'encoder_params'])

In [13]:
model_dict = the_model["model_dict"]

In [14]:
# Distinct first components of the dotted parameter names, i.e. the top-level
# sub-modules stored in the checkpoint's state dict.
top_levels = {param_name.partition(".")[0] for param_name in model_dict}

In [15]:
top_levels

{'ctx_model', 'question_model'}

In [21]:
# Embedding-layer parameter names, split by which encoder they belong to.
# Filter once on 'embeddings', then partition by encoder prefix; order is preserved.
embedding_param_names = [name for name in model_dict if 'embeddings' in name]
q_model_ks = [name for name in embedding_param_names if 'question_model' in name]
c_model_ks = [name for name in embedding_param_names if 'ctx_model' in name]

In [22]:
q_model_ks

['question_model.embeddings.word_embeddings.weight',
 'question_model.embeddings.position_embeddings.weight',
 'question_model.embeddings.token_type_embeddings.weight',
 'question_model.embeddings.LayerNorm.weight',
 'question_model.embeddings.LayerNorm.bias']

In [23]:
c_model_ks

['ctx_model.embeddings.word_embeddings.weight',
 'ctx_model.embeddings.position_embeddings.weight',
 'ctx_model.embeddings.token_type_embeddings.weight',
 'ctx_model.embeddings.LayerNorm.weight',
 'ctx_model.embeddings.LayerNorm.bias']

In [None]:
# Imported in this cell because it sits above the notebook's own
# `from transformers import BertModel, BertConfig` cell; without it a top-to-bottom
# run on a fresh kernel fails with NameError on BertConfig.
from transformers import BertConfig

# Optional config name/path override. The original cell read `cfg_name` without any
# visible cell defining it; None selects the default checkpoint name below.
cfg_name = None
cfg = BertConfig.from_pretrained(cfg_name or "bert-base-uncased")

In [24]:
from transformers import BertModel, BertConfig

In [27]:
# Fetch the stock "bert-base-uncased" configuration and weights from transformers.
# The model's state dict is used in the following cells to compare parameter names
# with those stored in the checkpoint.
# NOTE(review): `cfg` is not referenced by any later cell visible in this notebook.
cfg = BertConfig.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
hf_bert = model.state_dict()

In [32]:
# Names of the embedding-layer entries in the Hugging Face BERT state dict,
# for side-by-side comparison with the checkpoint's embedding keys.
model_embed_ks = list(filter(lambda entry_name: "embeddings" in entry_name, hf_bert))
model_embed_ks

['embeddings.position_ids',
 'embeddings.word_embeddings.weight',
 'embeddings.position_embeddings.weight',
 'embeddings.token_type_embeddings.weight',
 'embeddings.LayerNorm.weight',
 'embeddings.LayerNorm.bias']

## Test using BM25 search

In [11]:
from pyserini.search.lucene import LuceneSearcher
from pyserini.index.lucene import IndexReader

In [12]:
#LuceneSearcher.list_prebuilt_indexes()

In [13]:
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr')

Attempting to initialize pre-built index wikipedia-dpr.
/home/ddr8143/.cache/pyserini/indexes/index-wikipedia-dpr-20210120-d1b9e6.c28f3a56b2dfcef25bf3bf755c264d04 already exists, skipping download.
Initializing wikipedia-dpr...


In [4]:
hits = searcher.search('hubble space telescope')

In [5]:
# Print up to ten hits as: rank, docid, score, and the first 100 characters of the
# stored raw document (newlines stripped so each hit stays on one line).
# The loop is bounded by len(hits) so a query that returns fewer than ten results
# no longer raises IndexError.
for i in range(min(10, len(hits))):
    hit = hits[i]
    doc = searcher.doc(hit.docid)
    docstr = doc.raw()[:100].replace("\n", "")
    print(f'{i+1:2} {hit.docid:15} {hit.score:.5f}, {docstr}')

 1 500264          17.42493, {  "id" : "500264",  "contents" : "\"Hubble Space Telescope\"\nHubble Space Telescope The Hubble S
 2 500350          17.02356, {  "id" : "500350",  "contents" : "\"Hubble Space Telescope\"\nThese are often European in origin,
 3 500368          16.58024, {  "id" : "500368",  "contents" : "\"Hubble Space Telescope\"\nvery narrow field—Lucky Cam, for ex
 4 500266          16.45677, {  "id" : "500266",  "contents" : "\"Hubble Space Telescope\"\ndata, while the Goddard Space Fligh
 5 500367          16.37399, {  "id" : "500367",  "contents" : "\"Hubble Space Telescope\"\nthe visible, ultraviolet, and infra
 6 500265          16.27738, {  "id" : "500265",  "contents" : "\"Hubble Space Telescope\"\nultraviolet, visible, and near infr
 7 500362          16.26323, {  "id" : "500362",  "contents" : "\"Hubble Space Telescope\"\n(SCRS). , the Trump Administration 
 8 14244283        16.20809, {  "id" : "14244283",  "contents" : "\"Hubble (film)\"\nHubble (film) Hubbl

In [6]:
print(hits[0].raw)

{
  "id" : "500264",
  "contents" : "\"Hubble Space Telescope\"\nHubble Space Telescope The Hubble Space Telescope (HST) is a space telescope that was launched into low Earth orbit in 1990 and remains in operation. Although not the first space telescope, Hubble is one of the largest and most versatile and is well known as both a vital research tool and a public relations boon for astronomy. The HST is named after the astronomer Edwin Hubble and is one of NASA's Great Observatories, along with the Compton Gamma Ray Observatory, the Chandra X-ray Observatory and the Spitzer Space Telescope. With a mirror, Hubble's four main instruments observe in the near"
}


In [7]:
# python -m pyserini.search.lucene \
#   --index wikipedia-dpr \
#   --topics dpr-nq-test \
#   --output runs/run.dpr.nq-test.bm25.trec

## Then load NQ and AmbigQA and get queries

In [2]:
import sh
import json

In [3]:
# Root of the downloaded DPR data; every dataset directory below hangs off it.
base_path = "/scratch/ddr8143/repos/DPR/downloads/data"

# Per-dataset directories, built by joining the root and the sub-directory name.
ambigqa_path = "/".join([base_path, "ambigqa"])
ambigqa_light_path = "/".join([base_path, "ambigqa_light"])
nq_path = "/".join([base_path, "retriever"])
qp_path = "/".join([base_path, "qampari"])

In [3]:
sh.ls(ambigqa_light_path)

dev.json  train.json

In [4]:
# Load the NQ dev file. The context manager closes the file handle as soon as
# parsing finishes (the bare open() call left it to the garbage collector), and the
# explicit UTF-8 encoding keeps decoding independent of the machine's locale.
with open(f"{nq_path}/nq-dev.json", encoding="utf-8") as nq_file:
    nq_data = json.load(nq_file)

In [6]:
# Peek at the first NQ record: its field names, the question and answers,
# and the first hard-negative context.
first_record = nq_data[0]
print("NQ Keys:", first_record.keys())
print()
for field in ("question", "answers"):
    print(f"{field}: ", first_record[field])
first_hard_negative = first_record["hard_negative_ctxs"][0]
print(first_hard_negative)

NQ Keys: dict_keys(['dataset', 'question', 'answers', 'positive_ctxs', 'negative_ctxs', 'hard_negative_ctxs'])

question:  who sings does he love me with reba
answers:  ['Linda Davis']
{'title': "Why Don't You Love Me (Beyoncé song)", 'text': 'song. According to the lyrics of "Why Don\'t You Love Me", Knowles impersonates a woman who questions her love interest about the reason for which he does not value her fabulousness, convincing him she\'s the best thing for him as she sings: "Why don\'t you love me... when I make me so damn easy to love?... I got beauty... I got class... I got style and I got ass...". The singer further tells her love interest that the decision not to choose her is "entirely foolish". Originally released as a pre-order bonus track on the deluxe edition of "I Am...', 'score': 14.678405, 'title_score': 0, 'passage_id': '14525568'}


In [14]:
nq_data[1]["positive_ctxs"]

[{'title': 'Great Lakes',
  'text': 'Great Lakes The Great Lakes (), also called the Laurentian Great Lakes and the Great Lakes of North America, are a series of interconnected freshwater lakes located primarily in the upper mid-east region of North America, on the Canada–United States border, which connect to the Atlantic Ocean through the Saint Lawrence River. They consist of Lakes Superior, Michigan, Huron, Erie, and Ontario, although hydrologically, there are four lakes, Superior, Erie, Ontario, and Michigan-Huron. The lakes are interconnected by the Great Lakes Waterway. The Great Lakes are the largest group of freshwater lakes on Earth by total area, and second largest',
  'score': 1000,
  'title_score': 1,
  'passage_id': '151960'},
 {'title': 'Great Lakes',
  'text': 'Great Lakes The Great Lakes (), also called the Laurentian Great Lakes and the Great Lakes of North America, are a series of interconnected freshwater lakes located primarily in the upper mid-east region of North 

In [11]:
# Load the AmbigQA-light dev file. The context manager closes the file handle once
# parsing is done, and the explicit UTF-8 encoding avoids locale-dependent decoding.
with open(f"{ambigqa_light_path}/dev.json", encoding="utf-8") as abl_file:
    abl_data = json.load(abl_file)

In [12]:
print("ABL Keys:", abl_data[0].keys())

ABL Keys: dict_keys(['annotations', 'id', 'question'])


In [13]:
# Pretty-print the first ten AmbigQA-light examples, each under a banner line and
# followed by a blank separator line.
for example_idx in range(10):
    banner = f"============================= Example {example_idx} =================================="
    print(banner)
    print(json.dumps(abl_data[example_idx], indent=4))
    print()

{
    "annotations": [
        {
            "type": "singleAnswer",
            "answer": [
                "Tony Goldwyn",
                "Goldwyn"
            ]
        }
    ],
    "id": "-807825952267713091",
    "question": "Who plays the doctor in dexter season 1?"
}

{
    "annotations": [
        {
            "type": "singleAnswer",
            "answer": [
                "usually continues uninterrupted until death"
            ]
        },
        {
            "type": "singleAnswer",
            "answer": [
                "constant",
                "usually continues uninterrupted until death"
            ]
        }
    ],
    "id": "8266116451988110240",
    "question": "How often does spermatogeneis\u2014the production of sperm\u2014occur?"
}

{
    "annotations": [
        {
            "type": "singleAnswer",
            "answer": [
                "1950"
            ]
        },
        {
            "type": "singleAnswer",
            "answer": [
                

In [10]:
abl_data[0]

NameError: name 'abl_data' is not defined

In [8]:
# Load the full AmbigQA dev file. The context manager closes the file handle once
# parsing is done, and the explicit UTF-8 encoding avoids locale-dependent decoding.
with open(f"{ambigqa_path}/dev.json", encoding="utf-8") as ab_file:
    ab_data = json.load(ab_file)

In [9]:
ab_data[0]

{'viewed_doc_titles': ['Dexter (season 1)'],
 'used_queries': [{'query': 'Who plays the doctor in dexter season 1?',
   'results': [{'title': 'Dexter (season 1)',
     'snippet': 'The first <b>season</b> of <b>Dexter</b> is an adaptation of Jeff Lindsay&#39;s first novel in a series \nof the same ... and left taunting clues, leading <b>Dexter</b> to believe that the Killer is \n<b>playing</b> a game with him. ... An unexplained suicide of a wealthy and powerful \nwoman leads <b>Dexter</b> to suspect that her psychologist, <b>Dr</b>. Emmett Meridian, may&nbsp;...'},
    {'title': 'Christian Camargo',
     'snippet': 'Christian Camargo (né Minnick; born July 7, 1971) is an American <b>actor</b>, \nproducer, writer, and director. He is perhaps best known for his roles as Brian \nMoser in the Showtime drama <b>Dexter</b>, Michael Corrigan in the Netflix ... He guest \nstarred as Wade Crocker on the third <b>season</b> of Syfy&#39;s Haven. Camargo portrayed \nthe title&nbsp;...'},
    {'tit

In [44]:
# Summarize the first ten AmbigQA dev examples: the question, every annotation,
# and the original NQ answer.
print("AB Keys:", ab_data[0].keys())
print()
for example_idx in range(10):
    example = ab_data[example_idx]
    print(example_idx)
    print("     q:  ", example["question"])
    for annotation in example["annotations"]:
        if "answer" not in annotation:
            # No top-level "answer": the annotation carries a list of QA pairs instead.
            # NOTE(review): the "t:" label here is followed by the pairs' answers,
            # not by a type as in the single-answer branch — confirm this is intended.
            pair_answers = [qa_pair["answer"] for qa_pair in annotation["qaPairs"]]
            print("         MA| t:", pair_answers)
        else:
            print("         SA| t:", annotation["type"], "a:", annotation["answer"])
    print("     nqa:", example["nq_answer"])

AB Keys: dict_keys(['viewed_doc_titles', 'used_queries', 'annotations', 'nq_answer', 'id', 'nq_doc_title', 'question'])

0
     q:   Who plays the doctor in dexter season 1?
         SA| t: singleAnswer a: ['Tony Goldwyn', 'Goldwyn']
     nqa: ['Tony Goldwyn']
1
     q:   How often does spermatogeneis—the production of sperm—occur?
         SA| t: singleAnswer a: ['usually continues uninterrupted until death']
         SA| t: singleAnswer a: ['constant', 'usually continues uninterrupted until death']
     nqa: ['74 days']
2
     q:   When was the first remote control tv invented?
         SA| t: singleAnswer a: ['1950']
         SA| t: singleAnswer a: ['1950']
     nqa: ['1950']
3
     q:   Why did the st louis cardinals move to arizona?
         SA| t: singleAnswer a: ['mediocrity of the Cardinals,a then-21-year-old stadium,game attendance to dwindle']
         MA| t: [['overall mediocrity of the Cardinals'], ['old stadium'], ['game attendance to dwindle']]
     nqa: ['1988']
4
     q

## And QAMPARI

In [4]:
sh.ls(qp_path)

dev_data.jsonl	test_data.jsonl  train_data.jsonl

In [5]:
# Parse the QAMPARI training file, which holds one JSON object per line.
# Iterating the file inside a context manager closes the handle when done and avoids
# the extra full-file list that readlines() built. Whitespace-only lines (e.g. a
# trailing blank line) are skipped instead of crashing json.loads.
with open(f"{qp_path}/train_data.jsonl", encoding="utf-8") as qp_file:
    qp_data = [json.loads(line) for line in qp_file if line.strip()]

In [6]:
qp_data[0]

{'entities': [{'entity_url': 'https://en.wikipedia.org/wiki/Chezhiyan',
   'entity_text': 'Chezhiyan',
   'aliases': ['Chezhiyan']}],
 'question_text': 'Which movie, clip, TV show etc. had Chezhiyan as director of photography?',
 'answer_list': [{'answer_text': 'To Let',
   'aid': '0__wikidata_simple__train__0',
   'aliases': ['To Let'],
   'answer_url': 'https://en.wikipedia.org/wiki/To_Let_(film)',
   'proof': [{'proof_text': 'To let is a 2017 indian tamil-language drama film written, directed and filmed by chezhiyan.',
     'found_in_url': 'https://en.wikipedia.org/wiki/To_Let_(film)',
     'pid': '0__wikidata_simple__train__0__0'}]},
  {'answer_text': 'Kalloori',
   'aid': '0__wikidata_simple__train__1',
   'aliases': ['Kalloori'],
   'answer_url': 'https://en.wikipedia.org/wiki/Kalloori',
   'proof': [{'proof_text': 'Chezhiyan is an indian filmmaker and director of photography who works primarily in the tamil film industry. He received critical acclaim for his work in "kalloori" (

In [40]:
# Take the proof text of the first proof attached to the last answer of the first
# QAMPARI question; it is used as a retrieval query in the next cell.
first_question = qp_data[0]
last_answer = first_question["answer_list"][-1]
p1 = last_answer["proof"][0]["proof_text"]
p1

' is a japanese manga series written by buronson and illustrated by ryoichi ikegami, published shogakukan\'s "big comic superior" from 1996 to 1998.\nplot.\nmayo is a professional assassin who is hired by the "organization" to kill the mother of a young prostitute, shion.'

In [41]:
# Use the proof text as the search query. This rebinds `hits`, replacing the results
# of the earlier 'hubble space telescope' query.
# NOTE(review): the second positional argument (100) is presumably the maximum number
# of hits to return — confirm against the pyserini API.
hits = searcher.search(p1, 100)

In [43]:
hits[0].raw

'{\n  "id" : "6946343",\n  "contents" : "Buronson\\nare \\"\\"Phantom Burai\\"\\", with art by Kaoru Shintani. Buronson , real name , also known as , is a Japanese manga writer, most known for creating \\"\\"Fist of the North Star\\"\\". In 2002, he shared the Shogakukan Manga Award for general manga for \\"\\"Heat\\"\\" with Ryoichi Ikegami. Buronson was born on June 16, 1947 in Saku, Nagano. He graduated from the Japan Air Self-Defense Forces Training School in 1967 and served as an Air Force radar mechanic. In 1969 he was discharged from the Japan Maritime Self-Defense Force and was soon hired by Hiroshi Motomiya as a manga assistant. He"\n}'

In [42]:
# List every retrieved hit as: rank, docid, score, and a one-line preview made of
# the first 100 characters of the stored raw document.
for position in range(len(hits)):
    hit = hits[position]
    raw_text = searcher.doc(hit.docid).raw()
    preview = raw_text[:100].replace("\n", "")
    print(f'{position + 1:2} {hit.docid:15} {hit.score:.5f}, {preview}')

 1 6946343         39.76038, {  "id" : "6946343",  "contents" : "Buronson\nare \"\"Phantom Burai\"\", with art by Kaoru Shintan
 2 6946341         39.65023, {  "id" : "6946341",  "contents" : "Buronson\nBuronson , real name , also known as , is a Japanese
 3 12548776        35.82513, {  "id" : "12548776",  "contents" : "\"Lord (manga)\"\nLord (manga) Chō-Sangokushi -LORD (超三国志-覇, 
 4 12548777        35.35414, {  "id" : "12548777",  "contents" : "\"Lord (manga)\"\nthe Romance of the Three Kingdoms. A 3 volu
 5 4197649         32.12986, {  "id" : "4197649",  "contents" : "\"Mai, the Psychic Girl\"\nMai, the Psychic Girl Mai, the Psyc
 6 14580850        31.18036, {  "id" : "14580850",  "contents" : "\"Rainbow: Nisha Rokubō no Shichinin\"\nsimulcast distributi
 7 6946342         30.62995, {  "id" : "6946342",  "contents" : "Buronson\nthe script of \"\"\"\" in 1972, drawn by Goro Sakai.
 8 14580847        30.36670, {  "id" : "14580847",  "contents" : "\"Rainbow: Nisha Rokubō no Shichinin\

In [18]:
print(hits[0].raw)

{
  "id" : "12548776",
  "contents" : "\"Lord (manga)\"\nLord (manga) Chō-Sangokushi -LORD (超三国志-覇, lit. \"\"Extraordinary Records of Three Kingdoms -LORD\"\") also known as LORD (覇, \"\"Ha\"\", lit: Conqueror) is the manga drawn by Ryoichi Ikegami (池上 遼一, IKEGAMI Ryoichi) and written by Buronson (武論尊; 史村 翔, FUMIMURA Shō; 岡村 善行, OKAMURA Yoshiyuki). Lord marks the fourth time this creative team have worked together on a manga. They previously collaborated on Strain, Sanctuary, and Heat, the 2002 Shogakukan Manga Award winner. The story is loosely based on the Three Kingdoms period, using both real and original characters and events from the historical period in Chinese history and folklore of"
}


In [19]:
print(hits[1].raw)

{
  "id" : "12548777",
  "contents" : "\"Lord (manga)\"\nthe Romance of the Three Kingdoms. A 3 volume sequel called Soul Lord 2 ended in 2013. Lord (manga) Chō-Sangokushi -LORD (超三国志-覇, lit. \"\"Extraordinary Records of Three Kingdoms -LORD\"\") also known as LORD (覇, \"\"Ha\"\", lit: Conqueror) is the manga drawn by Ryoichi Ikegami (池上 遼一, IKEGAMI Ryoichi) and written by Buronson (武論尊; 史村 翔, FUMIMURA Shō; 岡村 善行, OKAMURA Yoshiyuki). Lord marks the fourth time this creative team have worked together on a manga. They previously collaborated on Strain, Sanctuary, and Heat, the 2002 Shogakukan Manga Award winner. The story is loosely based on the Three Kingdoms period, using"
}
