# Wikipedia Search - ColBERT

In [1]:
import os
import polars as pl
import polars.selectors as cs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from rank_bm25 import BM25Okapi
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer
# from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# import transformers
# import torch
# import huggingface_hub

In [2]:
# Let polars print up to 2000 characters of each string cell before truncating.
pl.Config.set_fmt_str_lengths(2000);

In [3]:
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y

In [4]:
# pip install git+https://github.com/stanford-futuredata/ColBERT.git

In [5]:
# !conda list | grep bitsandbytes

In [6]:
# !conda update -c conda-forge 'auto-gptq[triton]' -y

In [7]:
# huggingface_hub.login(os.environ['HUGGING_FACE_TOKEN'])

In [8]:
# Load the question set and drop the row identifier, leaving the prompt, the
# five answer options (A-E) and the answer key.
# NOTE: despite its name, df_test is read from data/train.csv.
# `drop` is called positionally: the `columns=` keyword form is believed to be
# rejected by newer polars releases, while the positional form works on both.
df_test = pl.read_csv('data/train.csv').drop("id")
print(f'{df_test.shape[0]:,}')
df_test.columns

200


['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']

In [9]:
# Load the Wikipedia section table that serves as the retrieval corpus.
wiki_path = './data/wiki_with_category.parquet'
wiki_sections = pl.read_parquet(wiki_path)

In [10]:
# Tokenize every section on single spaces; bm25_scores splits queries the same way.
tokenized_corpus = [
    section_text.split(" ")
    for section_text in wiki_sections.get_column('section_text')
]

In [11]:
# Build the BM25 index over the tokenized sections.
bm25 = BM25Okapi(corpus=tokenized_corpus)

In [12]:
# Number of questions loaded.
df_test.height

200

In [13]:
# Pick one question to experiment with.
query = df_test.get_column('prompt')[4]
# Hand-written keyword paraphrase of that question (not derived from `query`),
# split on single spaces to match the corpus tokenization.
keyword_query = "Diffracting object dimensions affect diffraction pattern features' angular spacing"
tokenized_query = keyword_query.split(" ")
query

'Which of the following statements accurately describes the relationship between the dimensions of a diffracting object and the angular spacing of features in the diffraction pattern?'

In [14]:
def bm25_scores(query):
    """Score every corpus entry against `query` with the module-level BM25 index.

    The query is tokenized by splitting on single spaces. The result is a
    pandas Series whose index is the corpus position and whose values are the
    BM25 scores, ordered from highest to lowest.
    """
    query_tokens = query.split(" ")
    raw_scores = bm25.get_scores(query_tokens)
    return pd.Series(raw_scores).sort_values(ascending=False)

In [15]:
# Rank all sections for the hand-written keyword query by reusing the
# bm25_scores helper instead of repeating its body. Joining the tokens with a
# single space round-trips exactly through the helper's split(" ").
scores = bm25_scores(" ".join(tokenized_query))
scores.head(10)

12381    40.986317
19553    32.964811
19549    28.569587
40178    28.036671
25130    24.907424
62697    24.664405
25131    24.365572
19545    23.856857
8519     22.810186
47559    22.565054
dtype: float64

In [16]:
# Print the best-scoring section: its corpus position, BM25 score, and text.
for section_idx, score in scores.head(1).items():
    print('*************')
    print(section_idx, score)
    hit = wiki_sections[int(section_idx)]
    print(hit[['title', 'section_title', 'section_text']].to_numpy())

*************
12381 40.986316880422116
[['Diffraction' 'Patterns'
  "File:Diffraction on elliptic aperture with fft.png\nSeveral qualitative observations can be made of diffraction in general:\n The angular spacing of the features in the diffraction pattern is inversely proportional to the dimensions of the object causing the diffraction. In other words: The smaller the diffracting object, the 'wider' the resulting diffraction pattern, and vice versa. (More precisely, this is true of the sines of the angles.)\n The diffraction angles are invariant under scaling; that is, they depend only on the ratio of the wavelength to the size of the diffracting object.\n When the diffracting object has a periodic structure, for example in a diffraction grating, the features generally become sharper. The third figure, for example, shows a comparison of a Double-slit experiment pattern with a pattern formed by five slits, both sets of slits having the same spacing, between the center of one slit and 

## Using ColBERT to Find Wiki Sections

In [17]:
# ColBERT reads "id<TAB>text" files with one record per line, so every tab,
# carriage return and newline inside the text is flattened to a space
# (previously only '\n' was replaced, so a stray tab would split a record).
# NOTE(review): write_csv may still wrap fields containing '"' in quotes;
# inspect the TSV if the exact passage text matters.
passages = (
    wiki_sections
    .with_row_count('section_id')
    .select(['section_id', 'section_text'])
    .with_columns(pl.col('section_text').str.replace_all(r'[\t\r\n]', ' '))
)
passages_file = './data/wiki_passages.tsv'
# NOTE(review): only the first 100 passages are exported, presumably as a
# smoke-test subset; confirm before indexing the full corpus.
passages[:100].write_csv(passages_file, separator='\t', has_header=False)

queries = (
    df_test
    .with_row_count('qid')
    .select(['qid', 'prompt'])
    .with_columns(pl.col('prompt').str.replace_all(r'[\t\r\n]', ' '))
)
queries_file = './data/wiki_queries.tsv'
queries.write_csv(queries_file, separator='\t', has_header=False)

In [18]:
# Wrap the exported TSV files in ColBERT's loader classes and report their sizes.
c_collection = Collection(passages_file)
c_queries = Queries(queries_file)
n_queries, n_passages = len(c_queries), len(c_collection)
f'Loaded {n_queries} queries and {n_passages:,} passages'

[Aug 26, 13:24:19] #> Loading collection...
0M 
[Aug 26, 13:24:19] #> Loading the queries from ./data/wiki_queries.tsv ...
[Aug 26, 13:24:19] #> Got 200 queries. All QIDs are unique.



'Loaded 200 queries and 100 passages'

In [20]:
# Index settings shared by the indexing cell below.
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 256   # truncate passages at 256 tokens

index_folder = './data/colbert'
checkpoint = './checkpoints/colbertv2.0'

# Ask ColBERT to log verbosely while it loads its torch extensions.
os.environ['COLBERT_LOAD_TORCH_EXTENSION_VERBOSE'] = 'True'

In [None]:
# Build the ColBERT index for the exported passages under the 'wiki-science'
# experiment, using a single rank. Any existing index with the same name is
# overwritten.
with Run().context(RunConfig(nranks=1, experiment='wiki-science')):
    config = ColBERTConfig(
        index_path=index_folder,
        doc_maxlen=doc_maxlen,
        # Use the shared setting (previously a literal 2) so the compression
        # level always matches the "{nbits}bits" suffix in the index name.
        nbits=nbits,
    )
    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(
        name=f"wiki_pages_index_{nbits}bits",
        collection=c_collection,
        overwrite=True,
    )



[Aug 26, 13:24:47] #> Creating directory ./data/colbert 


#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": ".\/data\/colbert",
    "nbits": 2,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 256,
    "mask_punctuation": true,
    "checkpoint": ".\/checkpoints\/colbertv2.0",
    "triples": "\/future\/u\/okhatt

Using /home/daniel/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Process Process-2:
Traceback (most recent call last):
  File "/home/daniel/anaconda3/envs/colbert/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/daniel/anaconda3/envs/colbert/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/daniel/anaconda3/envs/colbert/lib/python3.8/site-packages/colbert/infra/launcher.py", line 115, in setup_new_process
    return_val = callee(config, *args)
  File "/home/daniel/anaconda3/envs/colbert/lib/python3.8/site-packages/colbert/indexing/collection_indexer.py", line 33, in encode
    encoder.run(shared_lists)
  File "/home/daniel/anaconda3/envs/colbert/lib/python3.8/site-packages/colbert/indexing/collection_indexer.py", line 67, in run
    self.train(shared_lists) # Trains centroids from selected passages
  File "/home/daniel/anaconda3/envs/colbert/lib/python

In [72]:
# Interactive help lookup (IPython `?` syntax) showing the ColBERTConfig
# signature; a development leftover that can be removed from the final notebook.
ColBERTConfig?

[0;31mInit signature:[0m
[0mColBERTConfig[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mquery_token_id[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0mDefaultVal[0m[0;34m([0m[0mval[0m[0;34m=[0m[0;34m'[unused0]'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc_token_id[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0mDefaultVal[0m[0;34m([0m[0mval[0m[0;34m=[0m[0;34m'[unused1]'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mquery_token[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0mDefaultVal[0m[0;34m([0m[0mval[0m[0;34m=[0m[0;34m'[Q]'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdoc_token[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0mDefaultVal[0m[0;34m([0m[0mval[0m[0;34m=[0m[0;34m'[D]'[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mncells[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0mDefaultVal[0m[0;34m([0m[0mval[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcentroid_score_t