# Osmulski Colbert

In [None]:
import os
import polars as pl
import polars.selectors as cs
import pandas as pd
import pyarrow as pa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries
from colbert import Indexer, Searcher

In [6]:
#!conda install -c conda-forge faiss-gpu -y

In [7]:
deberta_v3_large = 'microsoft/deberta-v3-large'

In [8]:
df_test = pl.read_csv('data/train.csv')
df_test = df_test.drop(columns="id")
df_test.columns

['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']

In [20]:
osmulski = pl.read_csv('data/osmulski_15k.csv')
osmulski.shape, osmulski.columns

((15000, 7), ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

## Retrieve Wiki Context via ColBERT

In [21]:
queries = osmulski.with_row_count('qid')[['qid', 'prompt']]
queries = queries.with_columns(pl.col('prompt').str.replace_all('\n', ' ')) 
queries_file = './data/wiki_queries.tsv'
queries.write_csv(queries_file, separator='\t', has_header=False)

In [22]:
c_queries = Queries(queries_file)

[Aug 29, 15:01:32] #> Loading the queries from ./data/wiki_queries.tsv ...
[Aug 29, 15:01:32] #> Got 15000 queries. All QIDs are unique.



In [23]:
n_results_per_question = 1

nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 512   # truncate passages at 300 tokens
dim = 128 # default is 128

checkpoint = './checkpoints/open-nq-colbert-xlmr-large'
experiment = 'wiki-science'
indexer_name = f"wiki_pages_index_{nbits}bits"
#os.environ['COLBERT_LOAD_TORCH_EXTENSION_VERBOSE'] = 'True'

config = ColBERTConfig(
    doc_maxlen=doc_maxlen,
    nbits=nbits,
    dim=dim
)

In [24]:
with Run().context(RunConfig(nranks=1, experiment=experiment)):
    searcher = Searcher(index=indexer_name, config=config)
    ranking = searcher.search_all(c_queries, k=n_results_per_question)

[Aug 29, 15:01:33] #> Loading collection...
0M 
[Aug 29, 15:01:34] #> Loading codec...
[Aug 29, 15:01:34] #> Loading IVF...
[Aug 29, 15:01:34] #> Loading doclens...


100%|██████████| 9/9 [00:00<00:00, 2510.22it/s]

[Aug 29, 15:01:34] #> Loading codes and residuals...



100%|██████████| 9/9 [00:00<00:00, 17.03it/s]
100%|██████████| 15000/15000 [01:01<00:00, 242.27it/s]


In [None]:
wiki_passages = pl.read_parquet('./data/wiki_passages.parquet')
wiki_passages.columns

In [28]:
colbert_passage_ids = pl.Series([tup[1] for tup in ranking.flat_ranking])
with_passage = osmulski.with_columns(colbert_passage_id=colbert_passage_ids)
with_passage with_passage.join(wiki_passages['passage_id', '], how='left', on='passage_id')

prompt,A,B,C,D,E,answer,colbert_passage_id
str,str,str,str,str,str,str,i64
"""Who was respon…","""Territorial br…","""First line div…","""Training Reser…","""Second line di…","""British home a…","""C""",80380
"""What film earn…","""Rakshit Shetty…","""Nam Areal Ondi…","""Ulidavaru Kand…","""The informatio…","""Simple Agi Ond…","""C""",41971
"""What is the po…","""Maklavan has a…","""Maklavan has a…","""Maklavan has a…","""Maklavan has a…","""Maklavan has a…","""E""",41493
"""What was the s…","""$90,000""","""$120,000""","""$85,000""","""$100,000""","""$75,000""","""D""",129433
"""What books has…","""Books about Th…","""Books about Th…","""Books about Th…","""Books about Th…","""Books about Th…","""A""",131242
"""Who was awarde…","""Walter Eric Th…","""Arthur Campbel…","""None of the ab…","""Thomas James Y…","""John Henry Wen…","""E""",6726
"""Which disorder…","""Cryofibrinogen…","""Thrombocytopen…","""Von Willebrand…","""Hemophilia A""","""Congenital hyp…","""A""",108087
"""According to D…","""Liantinis sugg…","""Liantinis sugg…","""Liantinis sugg…","""Liantinis sugg…","""Liantinis sugg…","""B""",33229
"""What is a key …","""It has the fir…","""It was found t…","""It was designe…","""It arrived at …","""It has a mediu…","""A""",54974
"""Which language…","""German""","""English""","""Italian""","""Spanish""","""French""","""B""",130844


In [None]:
test_cutoff = 14000
df_train = osmulski[:test_cutoff]
df_test_1 = osmulski[test_cutoff:]
df_test_2 = pl.read_csv('data/osmulski_extra_train.csv')
print(df_train.shape, df_test_1.shape, df_test_2.shape)

In [None]:
ranking.flat_ranking