# Table 1 - Full Index Runs

This notebook reproduces the ColBERT runs in Table 1 that use the newly generated indices.
BM25 figures are obtained from the [Terrier Data Repository](http://data.terrier.org/msmarco_passage.dataset.html).

In [1]:
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet, so that re-running
# this cell in a live kernel does not call pt.init() a second time.
if not pt.started():
    pt.init(tqdm='notebook')

PyTerrier 0.8.0 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
import torch
# Displayed as the cell output so the saved notebook records the torch build in use.
torch.__version__

'1.10.0+cu113'

## Load the stopwords-removed index

In [3]:
import pyterrier_colbert
# The wildcard import is kept unchanged: other cells may depend on names it brings in.
from pyterrier_colbert.ranking import *

# ColBERT checkpoint, plus the (index root, index name) pair locating the "stops" index.
checkpoint = "http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip"
index = (
    "/nfsindices/staticPrunedIndices/repro_paper_SIGIR2022/",
    "stops",
)
index_root, index_name = index
factory = pyterrier_colbert.ranking.ColBERTFactory(checkpoint, index_root, index_name)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAI

[Feb 15, 21:40:48] #> Loading model checkpoint.
[Feb 15, 21:40:48] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip




[Feb 15, 21:40:59] #> checkpoint['epoch'] = 0
[Feb 15, 21:40:59] #> checkpoint['batch'] = 44500


In [4]:
# NOTE(review): presumably keeps the FAISS ANN index on the CPU instead of the GPU —
# confirm against pyterrier_colbert.
factory.faiss_index_on_gpu = False


In [5]:
# Cell output: first-dimension size of each part's `mmap` array in the re-ranking index.
[shard.mmap.shape[0] for shard in factory._rrm().part_mmap]

Loading reranking index, memtype=mem


Loading index shards to memory:   0%|          | 0/30 [00:00<?, ?shard/s]

[15449104,
 15627119,
 15646684,
 15674365,
 15689361,
 15670286,
 15757845,
 15532714,
 15453631,
 15518354,
 15539412,
 15540951,
 15621620,
 15633490,
 15590560,
 15631383,
 15648003,
 15692722,
 16015586,
 16162576,
 17950180,
 16334153,
 15961213,
 15340614,
 15133805,
 15173588,
 15015812,
 15057601,
 15250116,
 7106533]

## eval setup

In [6]:
from pyterrier.measures import *

# Measures handed to pt.Experiment by do_exp, one per line, in the original order.
MEASURES = [
    AP(rel=2) @ 1000,
    nDCG @ 10,
    nDCG @ 20,
    nDCG @ 100,
    RR(rel=2) @ 10,
    RR(rel=2),
    RR @ 10,
    RR,
    R(rel=2) @ 1000,
    "mrt",
    "num_ret",
]

In [7]:
import pandas as pd
def do_exp(pipelines, topics='2019', names=None, baseline=None, save_dir=None) -> pd.DataFrame:
    """Evaluate ``pipelines`` on one MSMARCO passage topic set with ``pt.Experiment``.

    Parameters
    ----------
    pipelines : list
        Pipelines passed to ``pt.Experiment`` unchanged.
    topics : str
        One of ``'2019'``, ``'2020'`` or ``'dev'``. These select the
        ``msmarco_passage`` variants ``test-2019``, ``test-2020`` and
        ``dev.small`` for both topics and qrels.
    names : list of str, optional
        Names for the pipelines, forwarded to ``pt.Experiment``.
    baseline : int, optional
        Forwarded to ``pt.Experiment``. It was previously accepted but ignored.
    save_dir : str, optional
        ``%``-format pattern that is filled with ``str(topics)``. The resulting
        directory is created if needed, passed to ``pt.Experiment`` and also
        receives ``results_df.pkl``. ``None`` disables saving.

    Returns
    -------
    pd.DataFrame
        The value returned by ``pt.Experiment``.

    Raises
    ------
    KeyError
        If ``topics`` is not one of the supported keys.
    """
    # Imported here because no cell of this notebook imports ``os`` explicitly.
    import os

    topics_map = {'2019': 'test-2019', '2020': 'test-2020', 'dev': 'dev.small'}
    if topics not in topics_map:
        # Same exception type as the plain dict lookup, with a readable message.
        raise KeyError(
            f"unknown topics {topics!r}; expected one of {sorted(topics_map)}"
        )
    variant = topics_map[topics]

    # Fetch the dataset handle once and reuse it for topics and qrels.
    dataset = pt.get_dataset("msmarco_passage")
    topic_df = dataset.get_topics(variant)
    qrels_df = dataset.get_qrels(variant)

    if save_dir is not None:
        save_dir = save_dir % str(topics)
        os.makedirs(save_dir, exist_ok=True)

    df = pt.Experiment(
        pipelines,
        topic_df,
        qrels_df,
        MEASURES,
        batch_size=20,
        filter_by_qrels=True,
        verbose=True,
        names=names,
        baseline=baseline,
        save_dir=save_dir,
    )

    if save_dir is not None:
        # os.path.join avoids a doubled separator when the pattern ends in "/".
        df.to_pickle(os.path.join(save_dir, "results_df.pkl"))

    return df

# Load results from the original index

This corresponds to the set of documents to be re-ranked in the "Orig. Faiss" rows of Table 1.

In [5]:
# One saved run file per topic set, stacked into a single results frame.
_baseline_run_pattern = "/nfs/staticPruning/results/repro/baseline_%s/approx_baseline.res.gz"
src_df = pd.concat(
    [
        pt.io.read_results(_baseline_run_pattern % topic_set)
        for topic_set in ['2019', '2020', 'dev']
    ]
)

## Perform runs using the Stopwords Index

In [9]:
# ANN retrieval cut off at 1000 results per query, followed by scoring against
# this factory's index using the already-encoded query.
_ann_top1000 = factory.ann_retrieve_score() % 1000
baselineApprox = _ann_top1000 >> factory.index_scorer(query_encoded=True)

def _add_docno(df):
    """Return a copy of ``df`` with an integer ``docid`` column parsed from ``docno``.

    Despite the function's name, the column that is added is ``docid``.
    The frame passed in is left unmodified.
    """
    return df.assign(docid=df.docno.astype(int))

# Replay the saved results in src_df, add the integer docid column, then score
# those documents with this factory's index.
_saved_candidates = pt.Transformer.from_df(src_df)
_with_docid = _saved_candidates >> pt.apply.generic(_add_docno)
orig_faiss = _with_docid >> factory.index_scorer()


[Feb 15, 21:44:34] #> Loading the FAISS index from /nfsindices/staticPrunedIndices/repro_paper_SIGIR2022/stops/ivfpq.faiss ..
[Feb 15, 21:44:58] #> Building the emb2pid mapping..
[Feb 15, 21:45:23] len(self.emb2pid) = 461419381


Check that a single topic runs through the `orig_faiss` pipeline.

In [10]:
# Pick a single TREC DL 2019 query and show the top row that orig_faiss returns for it.
_topics_2019 = pt.get_dataset("msmarco_passage").get_topics('test-2019')
test_topic = _topics_2019[_topics_2019.qid == "1037798"]

orig_faiss(test_topic).head(1)[['qid', 'docno', 'score']]

Unnamed: 0,qid,docno,score
45,1037798,1597,13.247594


Calculate the results for the stopwords index.

In [11]:
from IPython.display import display, HTML

# Evaluate both pipelines on each topic set, showing and collecting one table per set.
all_res = []
for topic_set in ['2019', '2020', 'dev']:
    print(topic_set)
    topic_results = do_exp(
        [baselineApprox, orig_faiss],
        topics=topic_set,
        names=['approx_baseline', 'orig_faiss'],
        save_dir='/nfs/staticPruning/results/repro/stopindex_%s/',
    )
    display(topic_results)
    all_res.append(topic_results)

# pd.concat(all_res) would stack the per-topic-set tables into one frame (left disabled).

2019


pt.Experiment:   0%|          | 0/6 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret,mrt
0,approx_baseline,0.399892,0.686446,0.638233,0.572319,0.825581,0.82591,0.928295,0.928295,0.672651,43000.0,225.55268
1,orig_faiss,0.398735,0.686294,0.638128,0.571983,0.830233,0.83041,0.928295,0.928295,0.69536,43000.0,155.356856


2020


pt.Experiment:   0%|          | 0/6 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret
0,approx_baseline,0.440025,0.681037,0.645387,0.591887,0.859899,0.861323,0.916226,0.91765,0.729086,54000.0
1,orig_faiss,0.452856,0.687692,0.653272,0.604778,0.86034,0.861764,0.916667,0.918091,0.750889,54000.0


dev


pt.Experiment:   0%|          | 0/698 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret
0,approx_baseline,0.0,0.386964,0.411081,0.438271,0.0,0.0,0.328412,0.339039,0.0,6980000.0
1,orig_faiss,0.0,0.391132,0.416078,0.444745,0.0,0.0,0.331263,0.342262,0.0,6980000.0


# Perform runs using the IDF_uniform_100 index.



In [10]:
# Drop this notebook's name for the current factory before the next index is loaded.
# This may not free sufficient memory to run the following cells.
# NOTE(review): baselineApprox and orig_faiss were built from this factory and
# presumably still reference it — confirm; restarting the kernel may be needed.
del factory

In [3]:
import pyterrier_colbert
# The wildcard import is kept unchanged: other cells may depend on names it brings in.
from pyterrier_colbert.ranking import *

# ColBERT checkpoint, plus the (index root, index name) pair locating the
# "idf_uniform_100" index.
checkpoint = "http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip"
index = (
    "/nfsindices/staticPrunedIndices/repro_paper_SIGIR2022/",
    "idf_uniform_100",
)
index_root, index_name = index
factory = pyterrier_colbert.ranking.ColBERTFactory(checkpoint, index_root, index_name)
factory.faiss_index_on_gpu = False

# Cell output: first-dimension size of each part's `mmap` array in the re-ranking index.
[shard.mmap.shape[0] for shard in factory._rrm().part_mmap]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing ColBERT: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing ColBERT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ColBERT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAI

[Feb 15, 22:46:59] #> Loading model checkpoint.
[Feb 15, 22:46:59] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/ecir2021-tutorial/colbert_model_checkpoint.zip




[Feb 15, 22:47:08] #> checkpoint['epoch'] = 0
[Feb 15, 22:47:08] #> checkpoint['batch'] = 44500
Loading reranking index, memtype=mem


Loading index shards to memory:   0%|          | 0/30 [00:00<?, ?shard/s]

[12785415,
 12940023,
 12959797,
 12993264,
 12996879,
 12973964,
 13051655,
 12899739,
 12865110,
 12914274,
 12928676,
 12925241,
 12998281,
 13006198,
 12967598,
 13001830,
 13012426,
 13041338,
 13231898,
 13345915,
 14870935,
 13493776,
 13217827,
 12717560,
 12557106,
 12597488,
 12464258,
 12498494,
 12656402,
 5918433]

In [8]:
# ANN retrieval cut off at 1000 results per query, followed by scoring against
# this factory's index using the already-encoded query.
_ann_top1000 = factory.ann_retrieve_score() % 1000
baselineApprox = _ann_top1000 >> factory.index_scorer(query_encoded=True)

def _add_docno(df):
    """Return a copy of ``df`` with an integer ``docid`` column parsed from ``docno``.

    Despite the function's name, the column that is added is ``docid``.
    The frame passed in is left unmodified.
    """
    return df.assign(docid=df.docno.astype(int))

# Replay the saved results in src_df, add the integer docid column, then score
# those documents with this factory's index.
_saved_candidates = pt.Transformer.from_df(src_df)
_with_docid = _saved_candidates >> pt.apply.generic(_add_docno)
orig_faiss = _with_docid >> factory.index_scorer()


In [9]:
from IPython.display import display, HTML

# Evaluate both pipelines on each topic set, showing and collecting one table per set.
all_resIdf = []
for topic_set in ['2019', '2020', 'dev']:
    print(topic_set)
    topic_results = do_exp(
        [baselineApprox, orig_faiss],
        topics=topic_set,
        names=['approx_baseline', 'orig_faiss'],
        save_dir='/nfs/staticPruning/results/repro/idf_uniform_100_index_%s/',
    )
    display(topic_results)
    all_resIdf.append(topic_results)

# pd.concat(all_resIdf) would stack the per-topic-set tables into one frame (left disabled).

2019


pt.Experiment:   0%|          | 0/6 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret,mrt
0,approx_baseline,0.405467,0.687114,0.650281,0.576059,0.844961,0.845303,0.936047,0.936047,0.673932,43000.0,230.568591
1,orig_faiss,0.402628,0.688892,0.650414,0.579728,0.845736,0.845922,0.932171,0.932171,0.69536,43000.0,163.985808


2020


pt.Experiment:   0%|          | 0/6 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret
0,approx_baseline,0.447345,0.682683,0.653719,0.605753,0.847399,0.850625,0.92284,0.926066,0.73576,54000.0
1,orig_faiss,0.457357,0.687942,0.658446,0.612369,0.850044,0.851728,0.925485,0.927169,0.750889,54000.0


dev


pt.Experiment:   0%|          | 0/698 [00:00<?, ?batches/s]

Unnamed: 0,name,AP(rel=2)@1000,nDCG@10,nDCG@20,nDCG@100,RR(rel=2)@10,RR(rel=2),RR@10,RR,R(rel=2)@1000,num_ret
0,approx_baseline,0.0,0.391636,0.416701,0.443613,0.0,0.0,0.333394,0.344127,0.0,6980000.0
1,orig_faiss,0.0,0.394758,0.420366,0.448454,0.0,0.0,0.335727,0.346748,0.0,6980000.0
