In [1]:
%load_ext autoreload
%autoreload 2

In [31]:
import csv
from dataclasses import dataclass
import gzip
from io import TextIOWrapper
import os
from pathlib import Path
from pprint import pprint
import re
import sys
from typing import Optional
if '..' not in sys.path: sys.path.append('..')

import numpy as np
import pandas as pd

In [3]:
DATA_PATH = Path(os.path.expandvars('$HOME')) / 'data'
MSMARCO_PATH = DATA_PATH / 'msmarco'
MSMARCO_DOCTRAIN_QUERIES_FPATH = MSMARCO_PATH / 'msmarco-doctrain-queries.tsv.gz'
MSMARCO_DOCTRAIN_QRELS_FPATH = MSMARCO_PATH / 'msmarco-doctrain-qrels.tsv.gz'
MSMARCO_DOCTRAIN_TOP100_FPATH = MSMARCO_PATH / 'msmarco-doctrain-top100.gz'
MSMARCO_DOCDEV_QUERIES_FPATH = MSMARCO_PATH / 'msmarco-docdev-queries.tsv.gz'
MSMARCO_DOCDEV_QRELS_FPATH = MSMARCO_PATH / 'msmarco-docdev-qrels.tsv.gz'
MSMARCO_DOCDEV_TOP100_FPATH = MSMARCO_PATH / 'msmarco-docdev-top100.gz'
MSMARCO_DOCS_FPATH = MSMARCO_PATH / 'msmarco-docs.tsv'
MSMARCO_DOCS_LOOKUP_FPATH = MSMARCO_PATH / 'msmarco-docs-lookup.tsv.gz'

In [4]:
list(fpath.name for fpath in MSMARCO_PATH.iterdir())

['msmarco-doctrain-queries.tsv.gz',
 'msmarco-docdev-qrels.tsv.gz',
 'msmarco-docdev-top100.gz',
 'msmarco-docdev-queries.tsv.gz',
 'msmarco-doctrain-top100.gz',
 'msmarco-docs.tsv',
 'msmarco-doctrain-qrels.tsv.gz',
 'msmarco-docs-lookup.tsv.gz']

In [5]:
fids = []
for fpath in MSMARCO_PATH.iterdir():
    fname, fid = fpath.name, fpath.with_suffix('').with_suffix('').name
    fid = fid.replace('-', '_').upper()
    print(f'{fid}_FPATH = MSMARCO_PATH / \'{fname}\'')
    fids.append(fid)


MSMARCO_DOCTRAIN_QUERIES_FPATH = MSMARCO_PATH / 'msmarco-doctrain-queries.tsv.gz'
MSMARCO_DOCDEV_QRELS_FPATH = MSMARCO_PATH / 'msmarco-docdev-qrels.tsv.gz'
MSMARCO_DOCDEV_TOP100_FPATH = MSMARCO_PATH / 'msmarco-docdev-top100.gz'
MSMARCO_DOCDEV_QUERIES_FPATH = MSMARCO_PATH / 'msmarco-docdev-queries.tsv.gz'
MSMARCO_DOCTRAIN_TOP100_FPATH = MSMARCO_PATH / 'msmarco-doctrain-top100.gz'
MSMARCO_DOCS_FPATH = MSMARCO_PATH / 'msmarco-docs.tsv'
MSMARCO_DOCTRAIN_QRELS_FPATH = MSMARCO_PATH / 'msmarco-doctrain-qrels.tsv.gz'
MSMARCO_DOCS_LOOKUP_FPATH = MSMARCO_PATH / 'msmarco-docs-lookup.tsv.gz'


In [6]:
def read_queries(queries_fpath: Path) -> dict[str, str]:
    res = {}
    with gzip.open(queries_fpath, 'rt', encoding='utf8') as f:
        tsvreader = csv.reader(f, delimiter='\t')
        for topicid, querystring_of_topicid in tsvreader:
            assert topicid not in res
            res[topicid] = querystring_of_topicid
    return res

queries = read_queries(MSMARCO_DOCDEV_QUERIES_FPATH)
print(len(queries))
pprint({k: queries[k] for k in list(queries.keys())[:10]})

5193
{'1090270': 'botulinum definition',
 '1101278': 'do prince harry and william have last names',
 '1101279': 'do physicians pay for insurance from their salaries?',
 '118457': 'define bona fides',
 '174249': 'does xpress bet charge to deposit money in your account',
 '178627': 'effects of detox juice cleanse',
 '201376': 'here there be dragons comic',
 '320792': 'how much is a cost to run disneyland',
 '54544': 'blood diseases that are sexually transmitted',
 '68095': 'can hives be a sign of pregnancy'}


In [7]:
def read_offsets(lookup_fpath: Path) -> dict[str, int]:
    res = {}
    with gzip.open(lookup_fpath, 'rt', encoding='utf8') as f:
        tsvreader = csv.reader(f, delimiter='\t')
        for [docid, _, offset] in tsvreader:
            res[docid] = int(offset)
    return res

offsets = read_offsets(MSMARCO_DOCS_LOOKUP_FPATH)
print(len(offsets))

3213835


In [8]:
def read_qrels(qrels_fpath: Path) -> dict[str, list[str]]:
    res = {}
    with gzip.open(qrels_fpath, 'rt', encoding='utf8') as f:
        for _ in range(10):
            l = f.readline()
            print(l.rstrip(), ' ' in l, '\t' in l)
        tsvreader = csv.reader(f, delimiter=' ')
        for [topicid, _, docid, rel] in tsvreader:
            assert rel == "1"
            docids = res.setdefault(topicid, [])
            docids.append(docid)
    return res

qrels = read_qrels(MSMARCO_DOCDEV_QRELS_FPATH)
# qrels = read_qrels(MSMARCO_DOCTRAIN_QRELS_FPATH)
len(qrels)

2 0 D1650436 1 True False
1215 0 D1202771 1 True False
1288 0 D1547717 1 True False
1576 0 D1313702 1 True False
2235 0 D2113408 1 True False
2798 0 D2830290 1 True False
2962 0 D125453 1 True False
4696 0 D2523421 1 True False
4947 0 D2001134 1 True False
6217 0 D1361055 1 True False


5183

In [9]:
def read_queries_df(queries_fpath: Path) -> pd.DataFrame:
    with gzip.open(queries_fpath, 'rt', encoding='utf8') as f:
        df = pd.read_csv(f, sep='\t', header = None, names=('topicid', 'query'))
        df.set_index('topicid', inplace=True)
    return df

df_qs_dev = read_queries_df(MSMARCO_DOCDEV_QUERIES_FPATH)
df_qs_train = read_queries_df(MSMARCO_DOCTRAIN_QUERIES_FPATH)
df_qs_dev, df_qs_train

(                                                     query
 topicid                                                   
 174249   does xpress bet charge to deposit money in you...
 320792                how much is a cost to run disneyland
 1090270                               botulinum definition
 1101279  do physicians pay for insurance from their sal...
 201376                         here there be dragons comic
 ...                                                    ...
 147073   difference between discrete and process manufa...
 243761                  how long did abraham lincoln serve
 162662        does adult acne rosacea give you blepharitis
 247194                        how long do you bake muffins
 195199                                      glioma meaning
 
 [5193 rows x 1 columns],
                                                      query
 topicid                                                   
 1185869  )what was the immediate impact of the success ...
 1185868  __

In [10]:
def read_offsets_df(lookup_fpath: Path) -> pd.DataFrame:
    with gzip.open(lookup_fpath, 'rt', encoding='utf8') as f:
        df = pd.read_csv(f, sep='\t', header=None, names=('docid', 'off_trec', 'off_tsv'), usecols=('docid', 'off_tsv'))
        df.set_index('docid', inplace=True)
    return df

df_off = read_offsets_df(MSMARCO_DOCS_LOOKUP_FPATH)
print(df_off)

              off_tsv
docid                
D1555982            0
D301595          1852
D1359209         7973
D2147834        23656
D1568809        31104
...               ...
D2027345  22889189901
D3094124  22889193345
D1659457  22889201376
D1633167  22889206934
D1551606  22889207225

[3213835 rows x 1 columns]


In [11]:
def read_qrels_df(qrels_fpath: Path) -> pd.DataFrame:
    with gzip.open(qrels_fpath, 'rt', encoding='utf8') as f:
        df = pd.read_csv(f, sep=' ', header=None, names=('topicid', 'x', 'docid', 'rel'), usecols=('topicid', 'docid', 'rel'))
        df.set_index('topicid', inplace=True)
    assert len(df.index.unique()) == len(df)
    assert (df['rel'] == 1).sum() == len(df)
    return df

df_qrels_dev = read_qrels_df(MSMARCO_DOCDEV_QRELS_FPATH)
df_qrels_train = read_qrels_df(MSMARCO_DOCTRAIN_QRELS_FPATH)
df_qrels_dev, df_qrels_train

(            docid  rel
 topicid               
 2        D1650436    1
 1215     D1202771    1
 1288     D1547717    1
 1576     D1313702    1
 2235     D2113408    1
 ...           ...  ...
 1102330  D3062847    1
 1102335  D2921145    1
 1102351  D2361582    1
 1102390  D1073324    1
 1102400   D677570    1
 
 [5193 rows x 2 columns],
             docid  rel
 topicid               
 3         D312959    1
 5         D140227    1
 12        D213890    1
 15       D1033338    1
 16        D508131    1
 ...           ...  ...
 1185862  D2008201    1
 1185864  D1126522    1
 1185865   D630512    1
 1185868    D59235    1
 1185869    D59219    1
 
 [367013 rows x 2 columns])

In [12]:
def read_top_df(top_fpath: Path) -> pd.DataFrame:
    with gzip.open(top_fpath, 'rt', encoding='utf8') as f:
        df = pd.read_csv(f, sep=' ', header=None, names=('topicid', 'x', 'docid', 'rank', 'score', 'runstring'),
                         usecols=('topicid', 'docid', 'rank', 'score'))
        df.set_index(['topicid', 'docid'], inplace=True)
    return df


df_top_dev = read_top_df(MSMARCO_DOCDEV_TOP100_FPATH)
df_top_train = read_top_df(MSMARCO_DOCTRAIN_TOP100_FPATH)
df_top_dev, df_top_train

(                  rank    score
 topicid docid                  
 174249  D3126539     1 -5.99003
         D978773      2 -6.18444
         D399803      3 -6.20982
         D2204704     4 -6.24312
         D3126541     5 -6.24726
 ...                ...      ...
 195199  D2523452    96 -7.71499
         D2936620    97 -7.71628
         D2630286    98 -7.72922
         D3161177    99 -7.73019
         D2834135   100 -7.73695
 
 [519300 rows x 2 columns],
                   rank    score
 topicid docid                  
 1185869 D59221       1 -4.80433
         D59220       2 -4.92127
         D2192591     3 -5.05215
         D2777518     4 -5.05486
         D2371978     5 -5.07048
 ...                ...      ...
 748176  D2519846    96 -6.27750
         D1660014    97 -6.27775
         D3395520    98 -6.27988
         D2519845    99 -6.28317
         D3083754   100 -6.28339
 
 [36701116 rows x 2 columns])

In [13]:
fid_docs = open(MSMARCO_DOCS_FPATH, 'r', encoding='utf-8')

In [14]:

def cut(s: str, sz: int) -> str:
    if len(s) <= sz:
        return s
    return f'{s[:sz]}...'

class MsmDoc:
    docid: str
    url: str
    title: str
    body: str

    def __init__(self, docid: str, url: str, title: str, body: str) -> None:
        self.docid = docid
        self.url = url
        self.title = title
        self.body = body
    
    def __str__(self) -> str:
        return f'Id: {self.docid}. Title: {cut(self.title, 50)}. Body: {cut(self.body, 100)}. Url: {self.url}'
    
    def __repr__(self) -> str:
        return self.__str__()
    
    @staticmethod
    def from_line(l: str) -> 'MsmDoc':
        docid, url, title, body = l.rstrip().split('\t')
        return MsmDoc(docid=docid, url=url, title=title, body=body)

def get_doc(fid: TextIOWrapper, offset: int) -> MsmDoc:
    fid.seek(offset)
    l = fid.readline().rstrip()
    return MsmDoc.from_line(l)


In [15]:
docid = 'D2777518'
off = df_off.loc[docid]
print(off.item(), type(off.item()))
doc = get_doc(fid_docs, off.item())
doc

19459435257 <class 'int'>


Id: D2777518. Title: An Introduction to the Manhattan Project. Body: "Humanities ›History & Culture An Introduction to the Manhattan Project Share Flipboard Email Print©.... Url: http://history1900s.about.com/od/1940s/a/Manhattan-Project.htm

In [16]:
# fid_docs.close()

In [17]:
df_qs, df_qrels, df_top = df_qs_dev, df_qrels_dev, df_top_dev
# df_qs, df_qrels, df_top = df_qs_train, df_qrels_train, df_top_train

In [18]:
len(df_qs.index.intersection(df_qrels.index)) == len(df_qs) == len(df_qrels) == len(df_qs.index.unique())

True

In [19]:
df_top_ = df_top.reset_index(drop=False)
df_top_agg = df_top_.groupby('topicid').agg({'docid': 'count', 'rank': 'sum', 'score': 'mean'})
df_top_agg

Unnamed: 0_level_0,docid,rank,score
topicid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,100,5050,-6.815962
1215,100,5050,-5.370802
1288,100,5050,-4.371539
1576,100,5050,-9.890031
2235,100,5050,-6.137435
...,...,...,...
1102330,100,5050,-5.838318
1102335,100,5050,-5.705734
1102351,100,5050,-7.040014
1102390,100,5050,-6.259723


In [20]:
len(df_top_agg.index.unique()) == len(df_top_agg), len(df_top.index.unique()) == len(df_top)

(True, True)

In [21]:
df_top_agg[df_top_agg['docid'] != 100], df_top_agg[df_top_agg['rank'] != 5050]

(Empty DataFrame
 Columns: [docid, rank, score]
 Index: [],
 Empty DataFrame
 Columns: [docid, rank, score]
 Index: [])

In [22]:
qs_top_inds = df_qs.index.difference(df_top_agg.index)
top_qs_inds = df_top_agg.index.difference(df_qs.index)
df_qs.loc[qs_top_inds], df_top_agg.loc[top_qs_inds]

(Empty DataFrame
 Columns: [query]
 Index: [],
 Empty DataFrame
 Columns: [docid, rank, score]
 Index: [])

In [23]:
tid_ind = np.random.randint(len(df_top_agg))
tid = df_top_agg.index[tid_ind]
tid, df_top_agg.loc[tid]

(1009527,
 docid     100.00000
 rank     5050.00000
 score      -5.84499
 Name: 1009527, dtype: float64)

In [24]:
df_top_tid = df_top.loc[tid]
print(df_top_tid)
print(df_qs.loc[tid])

          rank    score
docid                  
D3255793     1 -5.30337
D2959602     2 -5.40700
D3473629     3 -5.45224
D777245      4 -5.47914
D1299836     5 -5.54870
...        ...      ...
D307909     96 -6.00863
D2619196    97 -6.01052
D1393090    98 -6.01321
D68832      99 -6.01485
D2577257   100 -6.01620

[100 rows x 2 columns]
query    what's grounding
Name: 1009527, dtype: object


In [25]:
def get_doc_by_did(did: int) -> MsmDoc:
    off = df_off.loc[did].item()
    return get_doc(fid_docs, off)


In [26]:
did = df_top_tid.index[0]
print(df_top_tid.loc[did])
doc = get_doc_by_did(did)
print(doc.title)
print(doc.body[:200])

rank     1.00000
score   -5.30337
Name: D3255793, dtype: float64
4 Grounding Techniques That Will Help You Stay More Calm
Movement & Fitness4 Grounding Techniques That Will Help You Stay More Calm By Lydia Noyes March 28, 2017Do you feel often like life is passing you by? Is it hard to find a minute to calm down and real


In [27]:
doc_rel = get_doc_by_did(df_qrels.loc[tid]['docid'])
doc_rel

Id: D490087. Title: What is the difference between earthing, grounding.... Body: Valentine Chriz, works at Philadelphia, PAUpdated Oct 4, 2017 · Author has 95 answers and 57.7k answ.... Url: https://www.quora.com/What-is-the-difference-between-earthing-grounding-and-neutral

In [28]:
len(df_qs_dev), len(df_qs_train), len(df_qs_dev.index.intersection(df_qs_train.index))

(5193, 367013, 0)

In [29]:
fid_docs.seek(0)
docids = []
while True:
    l = fid_docs.readline()
    if not l:
        break
    docid = l[:l.index('\t')]
    docids.append(docid)

In [30]:
len(docids)

3213835

In [33]:
pat = re.compile(r'^D(\d+)$')
for docid in docids:
    m = pat.match(docid)
    assert m is not None

In [34]:
numdocids = [int(docid[1:]) for docid in docids]

In [43]:
docids_set = set(docids)
numdocids_set = set(numdocids)
len(docids), len(docids) == len(docids_set) == len(numdocids) == len(numdocids_set)

(3213835, True)

In [31]:
fid_docs.close()
del fid_docs

In [47]:
qrels_docids_dev = df_qrels_dev['docid']
qrels_docids_dev_unique = qrels_docids_dev.unique()
print(len(qrels_docids_dev), len(qrels_docids_dev_unique))
qrels_docids_train = df_qrels_train['docid']
qrels_docids_train_unique = qrels_docids_train.unique()
print(len(qrels_docids_train), len(qrels_docids_train_unique))
print(len(np.intersect1d(qrels_docids_train_unique, qrels_docids_dev_unique)))

5193 5185
367013 319927
801


In [51]:
top_docids_dev, top_docids_train = df_top_dev.index.get_level_values(1), df_top_train.index.get_level_values(1)
top_docids_dev_u, top_docids_train_u = top_docids_dev.unique(), top_docids_train.unique()
print(len(top_docids_dev), len(top_docids_dev_u))
print(len(top_docids_train), len(top_docids_train_u))

519300 400737
36701116 2765494


In [53]:
print(len(df_qrels_dev), len(df_top_dev.index.get_level_values(0)))
print(len(df_qrels_train), len(df_top_train.index.get_level_values(0)))

5193 519300
367013 36701116
