# Wikipedia Search - ColBERT

In [26]:
import os
import polars as pl
import polars.selectors as cs
import pandas as pd
import pyarrow as pa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from rank_bm25 import BM25Okapi
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher
# from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# import transformers
# import torch
# import huggingface_hub

In [2]:
pl.Config(fmt_str_lengths=2000);

In [3]:
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y

In [4]:
#!pip install rank_bm25

In [5]:
# !conda list | grep bitsandbytes

In [6]:
# !conda install -c conda-forge pyarrow -y

In [7]:
# !conda update -c conda-forge 'auto-gptq[triton]' -y

In [8]:
# huggingface_hub.login(os.environ['HUGGING_FACE_TOKEN'])

In [9]:
df_test = pl.read_csv('data/train.csv')
df_test = df_test.drop(columns="id")
print(f'{df_test.shape[0]:,}')
df_test.columns

200


['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']

In [10]:
wiki_sections = pl.read_parquet('./data/wiki_with_category.parquet')
wiki_sections.shape

(153750, 13)

In [11]:
tokenized_corpus = [doc.split(" ") for doc in wiki_sections['section_text']]

In [12]:
bm25 = BM25Okapi(tokenized_corpus)

In [13]:
len(df_test)

200

In [14]:
query = df_test['prompt'][4]
tokenized_query = "Diffracting object dimensions affect diffraction pattern features' angular spacing".split(" ")
query

'Which of the following statements accurately describes the relationship between the dimensions of a diffracting object and the angular spacing of features in the diffraction pattern?'

In [15]:
def bm25_scores(query):
    tokenized_query = query.split(" ")
    scores = pd.Series(bm25.get_scores(tokenized_query))
    scores = scores.sort_values(ascending=False)
    return scores

In [60]:
scores = []
for question in tqdm(df_test['prompt'][:10]):
    q_scores = pd.Series(bm25.get_scores(question))
    scores.append(q_scores.sort_values(ascending=False)[:10].index.to_list())

  0%|          | 0/10 [00:00<?, ?it/s]

In [93]:
bm25 = pl.Series("bm25", scores, dtype=pl.List(pl.UInt32))
info = df_test[:10][['prompt']].with_columns(bm25)
info = info.with_row_count('qid')
info = info.explode('bm25')
info.with_columns(pl.lit(1).alias("ones"))\
    .select([
        pl.all().exclude("ones"),
        pl.col("ones").cumsum().over("qid").flatten().alias("bm25_idx")
    ])

qid,prompt,bm25,bm25_idx
u32,str,u32,i32
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",4767,1
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",141812,2
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",138900,3
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",124632,4
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",141815,5
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",146258,6
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",129281,7
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",141811,8
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",52568,9
0,"""Which of the following statements accurately describes the impact of Modified Newtonian Dynamics (MOND) on the observed ""missing baryonic mass"" discrepancy in galaxy clusters?""",97959,10


In [17]:
for item in scores[:1].items():
    print('*************')
    print(item[0], item[1])
    print(wiki_sections[int(item[0])][['title', 'section_title', 'section_text']].to_numpy())

*************
12381 40.986316880422116
[['Diffraction' 'Patterns'
  "File:Diffraction on elliptic aperture with fft.png\nSeveral qualitative observations can be made of diffraction in general:\n The angular spacing of the features in the diffraction pattern is inversely proportional to the dimensions of the object causing the diffraction. In other words: The smaller the diffracting object, the 'wider' the resulting diffraction pattern, and vice versa. (More precisely, this is true of the sines of the angles.)\n The diffraction angles are invariant under scaling; that is, they depend only on the ratio of the wavelength to the size of the diffracting object.\n When the diffracting object has a periodic structure, for example in a diffraction grating, the features generally become sharper. The third figure, for example, shows a comparison of a Double-slit experiment pattern with a pattern formed by five slits, both sets of slits having the same spacing, between the center of one slit and 

## Using ColBERT to find Wiki Section

In [18]:
def split_text_into_chunks(text, max_words=350):
    chunks = []
    current_chunk = []
    current_length = 0
    last_period = -1
    section_words = text.split()

    for word in section_words:
        if word.endswith('.'):
            last_period = current_length

        if current_length < max_words:
            current_length += 1
        else:
            if last_period > -1:
                cut_point = last_period + 1
                chunks.append(" ".join(section_words[:cut_point]))
                section_words = section_words[cut_point:]
                last_period = -1
                current_length = 0
            else:
                # If no period exists, just split it at max_words
                chunks.append(" ".join(section_words[:current_length]))
                section_words = section_words[current_length:]
                current_length = 0           

    if section_words:
        chunks.append(" ".join(section_words))
    
    return chunks

passages = wiki_sections.with_row_count('section_id')[['section_id', 'section_text']]
passages = passages.with_columns(
    pl.col("section_text")
      .apply(split_text_into_chunks)\
      .cast(pl.List(pl.Utf8))\
)
passages = passages.explode('section_text')
passages = passages.with_row_count('chunk_id')
passages.shape, passages.columns

((208767, 3), ['chunk_id', 'section_id', 'section_text'])

In [19]:
passages = passages.with_columns(pl.col('section_text').str.replace_all('\n', ' '))                      
passages_file = './data/wiki_passages.tsv'
passages[['chunk_id', 'section_text']].write_csv(passages_file, separator='\t', has_header=False)
queries = df_test.with_row_count('qid')[['qid', 'prompt']]
queries = queries.with_columns(pl.col('prompt').str.replace_all('\n', ' ')) 
queries_file = './data/wiki_queries.tsv'
queries.write_csv(queries_file, separator='\t', has_header=False)

In [20]:
c_collection = Collection(passages_file)
c_queries = Queries(queries_file)
f'Loaded {len(c_queries)} queries and {len(c_collection):,} passages'

[Aug 26, 17:06:08] #> Loading collection...
0M 
[Aug 26, 17:06:09] #> Loading the queries from ./data/wiki_queries.tsv ...
[Aug 26, 17:06:09] #> Got 200 queries. All QIDs are unique.



'Loaded 200 queries and 208,767 passages'

In [35]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 512   # truncate passages at 300 tokens

index_folder = './data/colbert'
checkpoint = './checkpoints/colbertv2.0'
experiment = 'wiki-science'
indexer_name = f"wiki_pages_index_{nbits}bits"

os.environ['COLBERT_LOAD_TORCH_EXTENSION_VERBOSE'] = 'True'

config = ColBERTConfig(
    index_path=index_folder,
    doc_maxlen=doc_maxlen,
    nbits=2,
)

In [22]:
with Run().context(RunConfig(nranks=1, experiment=experiment)):
    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=indexer_name,
                  collection=c_collection,
                  overwrite=True)



[Aug 26, 17:06:09] #> Note: Output directory ./data/colbert already exists


[Aug 26, 17:06:09] #> Will delete 28 files already at ./data/colbert in 20 seconds...
#> Starting...
nranks = 1 	 num_gpus = 1 	 device=0
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "index_path": ".\/data\/colbert",
    "nbits": 2,
    "kmeans_niters": 20,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 400000,
    "save_every": null,
    "warmup": 20000,
    "warmup_bert": null,
    "relu": false,
    "nway": 64,
    "use_ib_negatives": true,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 512,
    "mask_

Using /home/daniel/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/daniel/.cache/torch_extensions/py38_cu117/decompress_residuals_cpp/build.ninja...
Building extension module decompress_residuals_cpp...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
[Aug 26, 17:16:29] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...


Loading extension module decompress_residuals_cpp...
Using /home/daniel/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/daniel/.cache/torch_extensions/py38_cu117/packbits_cpp/build.ninja...
Building extension module packbits_cpp...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
[0.033, 0.034, 0.033, 0.03, 0.032, 0.033, 0.033, 0.03, 0.031, 0.032, 0.031, 0.031, 0.032, 0.033, 0.031, 0.034, 0.029, 0.031, 0.032, 0.031, 0.032, 0.033, 0.032, 0.033, 0.031, 0.032, 0.033, 0.033, 0.032, 0.034, 0.032, 0.036, 0.033, 0.03, 0.032, 0.03, 0.035, 0.031, 0.031, 0.039, 0.034, 0.032, 0.032, 0.034, 0.031, 0.031, 0.032, 0.035, 0.033, 0.031, 0.032, 0.032, 0.033, 0.033, 0.032, 0.032, 0.038, 0.033, 0.039, 0.031, 0.032, 0.034, 0.033, 0.035, 0.033, 0.033, 0.036, 0.034, 0.032, 0.032, 0.035, 0.031, 0.032, 0.033, 0.032, 0.034, 0.035, 0.034, 0.034, 0.035, 0.034, 0.031, 0.034, 0.034, 0.031, 0.033, 0.031, 0.034, 0.031, 0.035, 0.033, 0.035, 0.032, 0.034, 0.032, 0.033, 0.035, 0.031, 0.032, 0.032, 0.032, 0.036, 0.034, 0.032, 0.035, 0.03, 0.031, 0.031, 0.033, 0.032, 0.034, 0.033, 0.034, 0.03, 0.036, 0.031, 0.035, 0.033, 0.031, 0.034, 0.031, 0.032, 0.034, 0.035, 0.03, 0.036, 0.032, 0.032]
[Aug 26, 17:16:30] #> Got bucket_cutoffs_quantiles = tensor([0.2500, 0.5000, 0.7500], de

Loading extension module packbits_cpp...
0it [00:00, ?it/s]

[Aug 26, 17:17:30] [0] 		 #> Saving chunk 0: 	 25,000 passages and 6,095,873 embeddings. From #0 onward.


1it [01:02, 62.18s/it]

[Aug 26, 17:17:32] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:18:32] [0] 		 #> Saving chunk 1: 	 25,000 passages and 6,158,019 embeddings. From #25,000 onward.


2it [02:04, 62.13s/it]

[Aug 26, 17:18:34] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:19:34] [0] 		 #> Saving chunk 2: 	 25,000 passages and 6,205,251 embeddings. From #50,000 onward.


3it [03:06, 62.16s/it]

[Aug 26, 17:19:36] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:20:36] [0] 		 #> Saving chunk 3: 	 25,000 passages and 6,522,781 embeddings. From #75,000 onward.


4it [04:08, 62.24s/it]

[Aug 26, 17:20:39] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:21:39] [0] 		 #> Saving chunk 4: 	 25,000 passages and 6,512,556 embeddings. From #100,000 onward.


5it [05:11, 62.27s/it]

[Aug 26, 17:21:41] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:22:41] [0] 		 #> Saving chunk 5: 	 25,000 passages and 5,919,801 embeddings. From #125,000 onward.


6it [06:13, 62.19s/it]

[Aug 26, 17:22:43] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:23:43] [0] 		 #> Saving chunk 6: 	 25,000 passages and 5,705,500 embeddings. From #150,000 onward.


7it [07:15, 62.12s/it]

[Aug 26, 17:23:45] [0] 		 #> Encoding 25000 passages..
[Aug 26, 17:24:45] [0] 		 #> Saving chunk 7: 	 25,000 passages and 5,825,870 embeddings. From #175,000 onward.


8it [08:17, 62.09s/it]

[Aug 26, 17:24:47] [0] 		 #> Encoding 8767 passages..
[Aug 26, 17:25:08] [0] 		 #> Saving chunk 8: 	 8,767 passages and 1,996,656 embeddings. From #200,000 onward.


9it [08:39, 57.67s/it]
100%|██████████| 9/9 [00:00<00:00, 184.93it/s]

[Aug 26, 17:25:09] [0] 		 #> Checking all files were saved...
[Aug 26, 17:25:09] [0] 		 Found all files!
[Aug 26, 17:25:09] [0] 		 #> Building IVF...
[Aug 26, 17:25:09] [0] 		 #> Loading codes...
[Aug 26, 17:25:09] [0] 		 Sorting codes...
[Aug 26, 17:25:11] [0] 		 Getting unique codes...
[Aug 26, 17:25:11] #> Optimizing IVF to store map from centroids to list of pids..
[Aug 26, 17:25:11] #> Building the emb2pid mapping..
[Aug 26, 17:25:12] len(emb2pid) = 50942307



100%|██████████| 65536/65536 [00:02<00:00, 30637.98it/s]

[Aug 26, 17:25:14] #> Saved optimized IVF to ./data/colbert/ivf.pid.pt
[Aug 26, 17:25:14] [0] 		 #> Saving the indexing metadata to ./data/colbert/metadata.json ..





#> Joined...


In [None]:
ColBERTConfig?

In [40]:
with Run().context(RunConfig(nranks=1, experiment=experiment)):
    config = ColBERTConfig(
            root=index_folder,
        )
    searcher = Searcher(index=indexer_name, config=config)
    ranking = searcher.search_all(c_queries, k=10)

[Aug 26, 18:05:43] #> Loading collection...
0M 
[Aug 26, 18:05:44] #> Loading codec...
[Aug 26, 18:05:44] #> Loading IVF...
[Aug 26, 18:05:44] #> Loading doclens...


100%|██████████| 9/9 [00:00<00:00, 2311.05it/s]

[Aug 26, 18:05:44] #> Loading codes and residuals...



100%|██████████| 9/9 [00:00<00:00, 15.40it/s]
100%|██████████| 200/200 [00:00<00:00, 289.20it/s]


In [45]:
info.with_columns(pl.lit(1).alias("ones"))\
    .select([
        pl.all().exclude("ones"),
        pl.col("ones").cumsum().over("qid").flatten().alias("bm25_idx")
    ])

[(16871, 1, 20.5),
 (17943, 2, 18.4375),
 (2943, 3, 18.03125),
 (26938, 4, 17.359375),
 (55803, 5, 17.21875),
 (16896, 6, 17.203125),
 (1827, 7, 17.140625),
 (16894, 8, 17.078125),
 (85348, 9, 16.953125),
 (26933, 10, 16.859375)]

In [111]:
colbert = [tup[1] for tup in ranking.flat_ranking]
colbert = pl.Series("colbert", colbert[:100], dtype=pl.UInt32)

In [112]:
info.with_colums(colbert)

AttributeError: 'DataFrame' object has no attribute 'with_colums'

In [None]:
bm25 = pl.Series("bm25", scores, dtype=pl.List(pl.UInt32))
info = df_test[:10][['prompt']].with_columns(bm25)
info = info.with_row_count('qid')
info = info.explode('bm25')
info.with_columns(pl.lit(1).alias("ones"))\
    .select([
        pl.all().exclude("ones"),
        pl.col("ones").cumsum().over("qid").flatten().alias("bm25_idx")
    ])