In [1]:
import faiss
import pyterrier as pt
import ujson
import numpy as np

import itertools
import threading
import queue

from colbert.modeling.inference import ModelInference
from colbert.evaluation.loaders import load_colbert
from pyterrier_colbert import load_checkpoint
# Monkeypatch: make ColBERT's loaders use pyterrier_colbert's `load_checkpoint`
# (imported above), which the original author describes as the "downloading
# version" -- the checkpoint used in this notebook is given as an HTTP URL.
import colbert.evaluation.loaders

# Rebind the name on the loaders module itself ...
colbert.evaluation.loaders.load_checkpoint = load_checkpoint
# ... and in the globals namespace that `load_model` resolves names from, which
# matters if that function is defined in a different module (TODO confirm).
colbert.evaluation.loaders.load_model.__globals__['load_checkpoint'] = load_checkpoint
from colbert.utils.utils import print_message
import pickle
from colbert.indexing.index_manager import IndexManager
from warnings import warn
from transformers import AutoTokenizer, AutoModelForMaskedLM


In [2]:
# Start PyTerrier (the cell output shows it loading Terrier and terrier-helper).
# Guarded with pt.started() -- the idiom from the PyTerrier docs -- so that this
# cell can be re-run in a live kernel without trying to initialise twice.
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
from pyterrier_colbert.preprocessing import CorpusIteratorPreprocessor

In [4]:
class Object:
    """Empty attribute container; fields are attached to instances after construction."""

In [5]:
# Location of the ColBERT checkpoint handed to both indexers below; the
# "Loading checkpoint" log lines show it being loaded from this URL.
# NOTE(review): model weights are fetched over plain HTTP -- consider an HTTPS
# or locally verified copy; confirm availability with the checkpoint's host.
checkpoint="http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"

In [6]:
# ColBERT settings collected on a bare attribute container.
# NOTE(review): no later cell visible here reads `args`; the indexers below are
# given `checkpoint` directly -- confirm whether this cell is still needed.
args = Object()
vars(args).update(
    similarity='cosine',
    dim=128,
    query_maxlen=32,
    doc_maxlen=180,
    checkpoint=checkpoint,
    mask_punctuation=False,
)

In [7]:
# Handle to PyTerrier's 'vaswani' dataset; later cells pull the corpus iterator,
# index, topics and qrels from it.
dataset = pt.get_dataset('vaswani')

In [8]:
# Hugging Face tokenizer for bert-base-uncased; handed to the corpus
# preprocessor in the next cell.
wordpiece = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

In [9]:
# Wrap the raw corpus iterator in pyterrier_colbert's CorpusIteratorPreprocessor,
# using the tokenizer loaded above. Indexing this stream produces fewer
# embeddings than indexing the raw corpus (467958 vs 581496 in the logs below).
# NOTE(review): the wrapper is built around a single corpus iterator -- if it
# can only be consumed once, rebuild it before re-running the indexing cell;
# confirm against the preprocessor's implementation.
dataset_cleaned = CorpusIteratorPreprocessor(dataset.get_corpus_iter(), wordpiece)

Downloading vaswani corpus to /home/busolin/.pyterrier/corpora/vaswani/corpus


doc-text.trec: 3.33MiB [00:00, 45.8MiB/s]                                                                                                    


In [10]:
import pyterrier_colbert.indexing

In [12]:
# Index the unmodified Vaswani corpus into ./indexes/index.base.vaswani.
# NOTE(review): chunksize=3 and num_partitions=1024 are unexplained magic
# numbers; the 1024 reappears in the FAISS file name (ivfpq.1024.faiss) in the
# log. This cell takes about 2.5 minutes and is not guarded by a cache.
base_indexer = pyterrier_colbert.indexing.ColBERTIndexer(
    checkpoint,
    "./indexes",
    "index.base.vaswani",
    chunksize=3,
    num_partitions=1024,
)
base_indexer.index(dataset.get_corpus_iter())

[Mar 03, 14:55:31] [0] 		 #> Local args.bsize = 128
[Mar 03, 14:55:31] [0] 		 #> args.index_root = ./indexes
[Mar 03, 14:55:31] [0] 		 #> self.possible_subset_sizes = [69905]


Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 03, 14:55:32] #> Loading model checkpoint.
[Mar 03, 14:55:32] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip
[Mar 03, 14:55:40] #> checkpoint['epoch'] = 0
[Mar 03, 14:55:40] #> checkpoint['batch'] = 44500




[Mar 03, 14:55:40] #> Note: Output directory ./indexes already exists




[Mar 03, 14:55:40] #> Creating directory ./indexes/index.base.vaswani 


[Mar 03, 14:57:38] [0] 		 #> Completed batch #0 (starting at passage #0) 		Passages/min: 5.8k (overall),  5.9k (this encoding),  10163.3M (this saving)
[Mar 03, 14:57:38] [0] 		 [NOTE] Done with local share.
[Mar 03, 14:57:38] [0] 		 #> Joining saver thread.
[Mar 03, 14:57:38] [0] 		 #> Saved batch #0 to ./indexes/index.base.vaswani/0.pt 		 Saving Throughput = 2.7M passages per minute.

#> num_embeddings = 581496
[Mar 03, 14:57:38] #> Starting..
[Mar 03, 14:57:38] #> Processing slice #1 of 1 (range 0..1).
[Mar 03, 14:57:38] #> Will write to ./indexes/index.base.vaswani/ivfpq.1024.faiss.
[Mar 03, 14:57:38] #>



18.71484136581421
0.0008025169372558594
[Mar 03, 14:57:57] Done training!

[Mar 03, 14:57:57] #> Indexing the vectors...
[Mar 03, 14:57:57] #> Loading ('./indexes/index.base.vaswani/0.pt', None, None) (from queue)...
[Mar 03, 14:57:57] #> Processing a sub_collection with shape (581496, 128)
[Mar 03, 14:57:57] Add data with shape (581496, 128) (offset = 0)..
  IndexIVFPQ size 0 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=1 reserveVecs=33554432
524288/581496 (4.390 s)   Flush indexes to CPU
add(.) time: 4.898 s 		--		 index.ntotal = 581496
[Mar 03, 14:58:02] Done indexing!
[Mar 03, 14:58:02] Writing index to ./indexes/index.base.vaswani/ivfpq.1024.faiss ...
[Mar 03, 14:58:02] 

Done! All complete (for slice #1 of 1)!
#> Faiss encoding complete
#> Indexing complete, Time elapsed 151.23 seconds


In [13]:
# Index the preprocessed corpus stream into ./indexes/index.clean.vaswani, using
# the same checkpoint and indexer settings as the base index so that the two
# indexes differ only in their input documents.
cleaned_indexer = pyterrier_colbert.indexing.ColBERTIndexer(
    checkpoint,
    "./indexes",
    "index.clean.vaswani",
    chunksize=3,
    num_partitions=1024,
)
cleaned_indexer.index(dataset_cleaned)

[Mar 03, 14:58:02] [0] 		 #> Local args.bsize = 128
[Mar 03, 14:58:02] [0] 		 #> args.index_root = ./indexes
[Mar 03, 14:58:02] [0] 		 #> self.possible_subset_sizes = [69905]


Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[Mar 03, 14:58:03] #> Loading model checkpoint.
[Mar 03, 14:58:03] #> Loading checkpoint http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip
[Mar 03, 14:58:11] #> checkpoint['epoch'] = 0
[Mar 03, 14:58:11] #> checkpoint['batch'] = 44500




[Mar 03, 14:58:11] #> Note: Output directory ./indexes already exists




[Mar 03, 14:58:11] #> Creating directory ./indexes/index.clean.vaswani 


[Mar 03, 15:00:14] [0] 		 #> Completed batch #0 (starting at passage #0) 		Passages/min: 5.6k (overall),  5.8k (this encoding),  10199.3M (this saving)
[Mar 03, 15:00:14] [0] 		 [NOTE] Done with local share.
[Mar 03, 15:00:14] [0] 		 #> Joining saver thread.
[Mar 03, 15:00:14] [0] 		 #> Saved batch #0 to ./indexes/index.clean.vaswani/0.pt 		 Saving Throughput = 3.3M passages per minute.

#> num_embeddings = 467958
[Mar 03, 15:00:14] #> Starting..
[Mar 03, 15:00:14] #> Processing slice #1 of 1 (range 0..1).
[Mar 03, 15:00:14] #> Will write to ./indexes/index.clean.vaswani/ivfpq.1024.faiss.
[Mar 03, 15:00:14]



15.235949039459229
0.0003101825714111328
[Mar 03, 15:00:29] Done training!

[Mar 03, 15:00:29] #> Indexing the vectors...
[Mar 03, 15:00:30] #> Loading ('./indexes/index.clean.vaswani/0.pt', None, None) (from queue)...
[Mar 03, 15:00:30] #> Processing a sub_collection with shape (467958, 128)
[Mar 03, 15:00:30] Add data with shape (467958, 128) (offset = 0)..
  IndexIVFPQ size 0 -> GpuIndexIVFPQ indicesOptions=0 usePrecomputed=0 useFloat16=1 reserveVecs=33554432
458752/467958 (3.815 s)   Flush indexes to CPU
add(.) time: 3.908 s 		--		 index.ntotal = 467958
[Mar 03, 15:00:34] Done indexing!
[Mar 03, 15:00:34] Writing index to ./indexes/index.clean.vaswani/ivfpq.1024.faiss ...
[Mar 03, 15:00:34] 

Done! All complete (for slice #1 of 1)!
#> Faiss encoding complete
#> Indexing complete, Time elapsed 151.53 seconds


In [14]:
# Ranking factory tied to the base indexer.
# NOTE(review): the following cell rebinds `pyterrier_colbert_factory` to the
# cleaned index's factory, so after that cell this name no longer refers to the
# base index; distinct names would avoid the hidden-state hazard.
pyterrier_colbert_factory = base_indexer.ranking_factory()

# End-to-end retrieval pipeline over the base index (this cell's log shows the
# base FAISS index and the reranking index being loaded).
colbert_e2e_base = pyterrier_colbert_factory.end_to_end()

[Mar 03, 15:00:34] #> Loading the FAISS index from ./indexes/index.base.vaswani/ivfpq.1024.faiss ..
[Mar 03, 15:00:34] #> Building the emb2pid mapping..
[Mar 03, 15:00:34] len(self.emb2pid) = 581496
Loading reranking index, memtype=mem


Loading index shards to memory: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.34shard/s]


In [15]:
# Ranking factory tied to the cleaned indexer.
# NOTE(review): this overwrites the `pyterrier_colbert_factory` bound in the
# previous cell (base index); from here on the name refers to the cleaned index.
pyterrier_colbert_factory = cleaned_indexer.ranking_factory()

# End-to-end retrieval pipeline over the cleaned index (this cell's log shows
# the clean FAISS index and the reranking index being loaded).
colbert_e2e_clean = pyterrier_colbert_factory.end_to_end()

[Mar 03, 15:00:35] #> Loading the FAISS index from ./indexes/index.clean.vaswani/ivfpq.1024.faiss ..
[Mar 03, 15:00:35] #> Building the emb2pid mapping..
[Mar 03, 15:00:35] len(self.emb2pid) = 467958
Loading reranking index, memtype=mem


Loading index shards to memory: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 11.44shard/s]


In [16]:
# Spot check on the base index: `% 10` cuts the ranking to the top 10 results
# (ranks 0-9 in the output) for a single ad-hoc query.
(colbert_e2e_base % 10).search("chemical reactions")


Unnamed: 0,qid,query,docid,query_toks,query_embs,score,docno,rank
2298,1,chemical reactions,4911,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",19.825491,4912,0
2950,1,chemical reactions,7048,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",19.057419,7049,1
2790,1,chemical reactions,6479,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",18.034231,6480,2
677,1,chemical reactions,9373,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",17.137503,9374,3
2732,1,chemical reactions,6278,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.797464,6279,4
1335,1,chemical reactions,2420,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.425671,2421,5
2104,1,chemical reactions,4292,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.188728,4293,6
1391,1,chemical reactions,10702,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.156439,10703,7
2445,1,chemical reactions,5303,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.006428,5304,8
1738,1,chemical reactions,3100,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",15.883968,3101,9


In [17]:
# Same spot check on the cleaned index, for side-by-side comparison with the
# base ranking above: top 10 results (ranks 0-9) for the same query.
(colbert_e2e_clean % 10).search("chemical reactions")


Unnamed: 0,qid,query,docid,query_toks,query_embs,score,docno,rank
2759,1,chemical reactions,7048,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",19.449532,7049,0
2150,1,chemical reactions,4911,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",18.698568,4912,1
3022,1,chemical reactions,8040,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",18.258787,8041,2
2620,1,chemical reactions,6479,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.970802,6480,3
2372,1,chemical reactions,5587,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.869686,5588,4
1261,1,chemical reactions,2420,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.81522,2421,5
618,1,chemical reactions,9373,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.772886,9374,6
1640,1,chemical reactions,3100,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.384573,3101,7
1315,1,chemical reactions,10702,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",16.16106,10703,8
2273,1,chemical reactions,5303,"[tensor(101), tensor(1), tensor(5072), tensor(...","[[tensor(0.0680), tensor(-0.0083), tensor(0.11...",15.903149,5304,9


In [18]:

# NOTE(review): `dataset` was already bound to the same 'vaswani' dataset in an
# earlier cell; this re-fetch looks redundant.
dataset = pt.get_dataset("vaswani")

# BM25 retriever over the index returned by dataset.get_index().
# NOTE(review): `bm25` is not used by the experiment cell visible below --
# confirm whether it was meant to be included as a lexical baseline.
bm25 = pt.BatchRetrieve(dataset.get_index(), wmodel="BM25")

In [23]:
# Compare the two end-to-end ColBERT pipelines on the Vaswani topics and qrels.
# baseline=0 makes the first system ("ColBERT base") the reference: the output
# table adds +/-/p-value columns for the other system against it.
pt.Experiment(
    [colbert_e2e_base, colbert_e2e_clean],
    dataset.get_topics(),
    dataset.get_qrels(),
    names=["ColBERT base", "ColBERT clean"],
    eval_metrics=["recip_rank", "ndcg_cut_10", "mrt"],  # "mrt" adds the mrt column
    baseline=0,
)

Unnamed: 0,name,recip_rank,ndcg_cut_10,mrt,recip_rank +,recip_rank -,recip_rank p-value,ndcg_cut_10 +,ndcg_cut_10 -,ndcg_cut_10 p-value
0,ColBERT base,0.703509,0.426514,254.49705,,,,,,
1,ColBERT clean,0.701786,0.430996,255.49435,18.0,20.0,0.935728,41.0,37.0,0.620189
