In [1]:
import pyterrier as pt
import pandas as pd
import os
import json
import gzip
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd
import re
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [15]:
DATASET_PATH = "wikipedia_200k_dataset.jsonl.gz"
TRAIN_DATA_PATH = "hw3_relevance.train.csv"
DEV_DATA_PATH = "hw3_relevance.dev.csv"
TEST_DATA_PATH = "hw3_relevance.test.csv"
NETWORK_STATS_PATH = 'network_stats.csv'
INDEX_PATH = "./main_index"
DOC_CATEGORY_INFO_PATH = 'doc_category_info.json'
RECOGNIZED_CATEGORY_PATH = 'recognized_categories.txt'

tokenizer = RegexpTokenizer(r'\w+')

#### Load Dataset and Indexing

In [4]:
def read_dataset(dataset_path: str, max_docs: int = -1):
    open_func = lambda x: gzip.open(x, 'rb') if x.endswith('.gz') else open(x, 'r')
    with open_func(dataset_path) as f:
        for i, line in tqdm(enumerate(f)):
            if max_docs != -1 and i >= max_docs:
                break
            x = json.loads(line)
            # x["docid"] = str(x["docid"])
            x["docno"] = str(x["docid"])
            yield x

In [5]:
# !rm -rf main_index

In [6]:
pt_index_path = INDEX_PATH

docids = set()
if not os.path.exists(pt_index_path + "/data.properties"):
    # create the index, using the IterDictIndexer indexer 
    indexer = pt.index.IterDictIndexer(pt_index_path, tokenizer=tokenizer, blocks=True, verbose=True)
    NUM_DOCS = -1
    doc_iter = read_dataset(DATASET_PATH, NUM_DOCS)
    doc_iter = read_dataset(DATASET_PATH, NUM_DOCS)
    index_ref = indexer.index(doc_iter, fields=['text'], meta=['docno', 'title'])
else:
    # if you already have the index, use it.
    index_ref = pt.IndexRef.of(pt_index_path + "/data.properties")

index = pt.IndexFactory.of(index_ref)

In [7]:
def load_data(dataset_path, qcol):
    df = pd.read_csv(dataset_path).dropna()
    queries = df[qcol].astype(str).unique()
    query_to_id = {q: str(i) for i, q in enumerate(queries)}
    df["qid"] = df[qcol].apply(lambda x: query_to_id[x])
    df["docno"] = df["docid"].apply(lambda x: str(x))
    df["label"] = df["rel"]
    topics = df[[qcol, "qid"]].copy().drop_duplicates()
    topics.columns = ["query", "qid"]
    topics = topics.astype({"qid": str})
    topics["query"] = topics["query"].apply(lambda x: re.sub(r"'", " ", x))
    qrels = df[["qid", "docno", "label"]].copy()
    return topics, qrels

In [8]:
train_topics, train_qrels = load_data(TRAIN_DATA_PATH, "query")
validation_topics, validation_qrels = load_data(DEV_DATA_PATH, "query")
test_topics, test_qrels = load_data(TEST_DATA_PATH, "query")

#### Basic Scorer

In [9]:
def fetch_my_feat(keyFreq, postings, entryStats, collStats):
    return postings.getFrequency()

In [58]:
# def pivoted_normalization_weighting(keyFreq, posting, entryStats, collStats):
#     b1 = 0.2
#     num_doc = collStats.numberOfDocuments
#     num_token = collStats.numberOfTokens
#     avdl = num_token/num_doc

#     doc_len = posting.getDocumentLength()
#     num_terms_in_doc = posting.getFrequency()
#     df_t = entryStats.getDocumentFrequency()

#     qtf = keyFreq
#     tf_no = 1 + math.log(1 + math.log(num_terms_in_doc))
#     tf = tf_no/(1-b1+b1*doc_len/avdl)
#     idf = math.log((num_doc+1)/df_t)

#     score = idf*tf*qtf
#     return score

In [11]:
query_toks = pt.rewrite.tokenise(tokeniser=lambda x: tokenizer.tokenize(x), matchop=True)
bm25 = query_toks >> pt.BatchRetrieve(index, wmodel="BM25")
get_title = pt.text.get_text(index, 'title')
tf_title = pt.text.scorer(body_attr="title", wmodel=fetch_my_feat)
tf_text = pt.BatchRetrieve(index, wmodel=fetch_my_feat)
tf_idf_title = query_toks >> pt.BatchRetrieve(index, wmodel="TF_IDF")
tf_idf_text = query_toks >> pt.BatchRetrieve(index, wmodel="TF_IDF")
pivoted_doc =  query_toks >> pt.BatchRetrieve(index, wmodel=pivoted_normalization_weighting)
pl2 = query_toks >> pt.BatchRetrieve(index, wmodel="PL2")
dlm = query_toks >> pt.BatchRetrieve(index, wmodel="DirichletLM")
sdm = pt.rewrite.SDM()
qe = pt.rewrite.Bo1QueryExpansion(index)
cm = query_toks >> pt.BatchRetrieve(index, wmodel="CoordinateMatch")

In [12]:
from pyterrier.measures import *
pt.Experiment(
    [
        bm25,
        tf_idf_title,
        tf_idf_text,
        pl2,
        dlm,
        cm
    ],
    train_topics,
    train_qrels,
    filter_by_qrels=True,
    eval_metrics=["map", "ndcg_cut_10"],
    verbose=True,
    names=["BM25", "TF_IDF_Title", "TF_IDF_Text", "PL2", "DLM", "CM"]
)

pt.Experiment:   0%|          | 0/6 [00:00<?, ?system/s]

pt.Experiment: 100%|██████████| 6/6 [00:46<00:00,  7.70s/system]


Unnamed: 0,name,map,ndcg_cut_10
0,BM25,0.071624,0.186296
1,TF_IDF_Title,0.079237,0.198254
2,TF_IDF_Text,0.079237,0.198254
3,PL2,0.052178,0.1767
4,DLM,0.079928,0.227606
5,CM,0.147529,0.144782


In [None]:
pt.Experiment(
    [
        bm25,
        tf_idf_title,
        tf_idf_text,
        pl2,
        dlm,
        cm
    ],
    test_topics,
    test_qrels,
    filter_by_qrels=True,
    eval_metrics=["map", "ndcg_cut_10"],
    verbose=True,
    names=["BM25", "TF_IDF_Title", "TF_IDF_Text", "PL2", "DLM", "CM"]
)

#### Learn-to-Rank

In [16]:
ENCODED_DOCUMENT_EMBEDDINGS_NPY_DATA = 'wiki-200k-vecs.msmarco-MiniLM-L12-cos-v5.npy'
DOCUMENT_ID_TEXT = 'document-ids.txt'

encoded_docs = None
with open(ENCODED_DOCUMENT_EMBEDDINGS_NPY_DATA, 'rb') as file:
    encoded_docs = np.load(file)

document_ids = None
with open(DOCUMENT_ID_TEXT, 'r') as f:
    document_ids = f.read().splitlines()
document_ids = [int(x) for x in document_ids]

recognized_categories = None
with open(RECOGNIZED_CATEGORY_PATH, 'r') as f:
    recognized_categories = f.read().splitlines()

doc_category_info = None
with open(DOC_CATEGORY_INFO_PATH, 'r') as f:
    doc_category_info = json.load(f)

doc_category_info = None
with open(DOC_CATEGORY_INFO_PATH, 'r') as f:
    doc_category_info = json.load(f)

network_features = {}
networks_stats = pd.read_csv(NETWORK_STATS_PATH, index_col=0)
for row in tqdm(networks_stats.iterrows()):
    network_features[row[1]['docid']] = row[1][1:].to_dict()

999841it [00:21, 46036.28it/s]


In [35]:
import torch

class FeatureExtractor:
    def __init__(self, model_name, index, document_ids, encoded_docs, 
            network_features, doc_category_info, tokenizer):
        self.tokenizer = tokenizer
        self.index = index
        self.doc_ids = document_ids
        self.encoded_docs = encoded_docs
        self.encoder = SentenceTransformer(model_name)
        self.query_emb = {}
        self.network_features = network_features
        self.doc_category_info = doc_category_info
        self.category_to_id = {k: v for v, k in enumerate(recognized_categories)}

    def get_document_categories(self, docid):
        doc_categories = [0 for _ in range(len(self.category_to_id))]
        for category in self.doc_category_info[str(docid)]:
            if category in self.category_to_id:
                doc_categories[self.category_to_id[category]] = 1
        return doc_categories
    
    @torch.no_grad()
    def add_features(self, row):
        docid = int(row["docid"])
        docno = int(row["docno"])
        content = row["query"]
        qid = row["qid"]
        f1 = len(tokenizer.tokenize(content))
        query_emb = None
        if qid not in self.query_emb:
            query_emb = self.encoder.encode(content, normalize_embeddings=True)
            self.query_emb[qid] = query_emb
        else:
            query_emb = self.query_emb[qid]
        doc_emb = self.encoded_docs[document_ids.index(docno)]
        f2 = util.dot_score(query_emb, doc_emb).item()
        content = row["title"]
        f3 = len(tokenizer.tokenize(content))
        f4 = index.getDocumentIndex().getDocumentLength(docid)
        return np.array([f1, f2, f3, f4, *self.network_features[docno].values(), *self.get_document_categories(docno)])

fe = FeatureExtractor('sentence-transformers/msmarco-MiniLM-L12-cos-v5', 
            index, document_ids, encoded_docs, network_features, doc_category_info, tokenizer)

In [36]:
pipeline = (bm25 % 100) >> get_title >>  pt.apply.doc_features(fe.add_features)
pipeline.search("information retrieval")

Unnamed: 0,qid,docid,docno,rank,score,query_0,query,title,features
0,1,54874,21022536,0,12.024897,information retrieval,information retrieval,Golden Retriever,"[2.0, 0.1951461136341095, 2.0, 1101.0, 2.00231..."
1,1,199272,375096,1,12.017616,information retrieval,information retrieval,Retriever,"[2.0, 0.24166104197502136, 1.0, 148.0, 1.66391..."
2,1,188673,537218,2,11.959106,information retrieval,information retrieval,Gun dog,"[2.0, 0.0508037805557251, 2.0, 506.0, 2.228500..."
3,1,29258,79280,3,11.778672,information retrieval,information retrieval,Labrador Retriever,"[2.0, 0.15273529291152954, 2.0, 760.0, 2.57849..."
4,1,91010,4743980,4,11.693882,information retrieval,information retrieval,Tip of the tongue,"[2.0, 0.3024318814277649, 4.0, 2116.0, 1.75525..."
...,...,...,...,...,...,...,...,...,...
95,1,128064,1717129,95,10.094906,information retrieval,information retrieval,Explicit memory,"[2.0, 0.13990920782089233, 2.0, 2916.0, 1.4674..."
96,1,29259,79289,96,10.051436,information retrieval,information retrieval,German Shepherd,"[2.0, -0.051451295614242554, 2.0, 2376.0, 2.60..."
97,1,53148,24799509,97,10.048728,information retrieval,information retrieval,Graph database,"[2.0, 0.24931509792804718, 2.0, 1759.0, 4.9765..."
98,1,152617,933503,98,10.031661,information retrieval,information retrieval,Nanobe,"[2.0, -0.007376858964562416, 1.0, 212.0, 3.438..."


In [45]:
RANK_CUTOFF = 100
pipeline = (bm25 % RANK_CUTOFF) >> get_title >> (
    # 
    pt.apply.doc_features(fe.add_features)
    **
    # BM25
    bm25
    **
    # TF_TITLE
    tf_title 
    ** 
    # TF_DOC
    tf_text
    ** 
    # TF_IDF_TITLE
    tf_idf_title
    **
    # TF_IDF_DOC
    tf_idf_text
)

In [47]:
import lightgbm as lgb
# this configures LightGBM as LambdaMART

default_params = {
    'objective': "lambdarank",
    'boosting_type': "gbdt",
    'n_estimators': 20,
    'importance_type': "gain",
    'metric': "ndcg",
    'eval_at': [10],
    'num_leaves': 20,
    'learning_rate': 0.005,
    'max_depth': -1,
    "verbosity": 1,
}


lmart_l = lgb.LGBMRanker(**default_params)
lmart_l_pipe = pipeline >> pt.ltr.apply_learned_model(lmart_l, form="ltr")
lmart_l_pipe.fit(train_topics, train_qrels, validation_topics, validation_qrels)

pt.Experiment(
    [bm25, lmart_l_pipe],
    test_topics,
    test_qrels,
    ["map", "ndcg_cut_10"],
    names=["BM25 Baseline", "LambdaMART (LightGBM)" ],
    verbose=True
)





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2552
[LightGBM] [Info] Number of data points in the train set: 12900, number of used features: 126


pt.Experiment:  50%|█████     | 1/2 [00:01<00:01,  1.98s/system]



pt.Experiment: 100%|██████████| 2/2 [00:11<00:00,  5.59s/system]


Unnamed: 0,name,map,ndcg_cut_10
0,BM25 Baseline,0.061286,0.148466
1,LambdaMART (LightGBM),0.051781,0.238501


In [68]:
lmart_l.booster_.save_model("lmart_l.txt")

<lightgbm.basic.Booster at 0x2c5461990>

In [None]:
def save(self, file_name: str = 'l2r.model.txt') -> None:
        self.model.booster_.save_model(file_name)
    
    def load(self, file_name: str = 'l2r.model.txt') -> None:
        self.model = lightgbm.Booster(model_file=file_name)

In [48]:
pt.Experiment(
    [bm25, lmart_l_pipe],
    validation_topics,
    validation_qrels,
    ["map", "ndcg_cut_10"],
    names=["BM25 Baseline", "LambdaMART (LightGBM)" ]
)





Unnamed: 0,name,map,ndcg_cut_10
0,BM25 Baseline,0.067452,0.196534
1,LambdaMART (LightGBM),0.050117,0.261852


In [60]:
lmart_l_pipe.search("information retrieval")





Unnamed: 0,qid,docid,docno,score,query_0,query,title,features,rank
86,1,39586,31217535,0.046718,information retrieval,information retrieval,Memory,"[2.0, 0.3415633738040924, 1.0, 4833.0, 1.36696...",0
43,1,21734,149354,0.029184,information retrieval,information retrieval,Information science,"[2.0, 0.35018330812454224, 2.0, 2083.0, 1.0993...",1
49,1,21721,149306,-0.001016,information retrieval,information retrieval,National Center for Biotechnology Information,"[2.0, 0.224349245429039, 5.0, 584.0, 0.0001142...",2
96,1,29259,79289,-0.013296,information retrieval,information retrieval,German Shepherd,"[2.0, -0.051451295614242554, 2.0, 2376.0, 2.60...",3
65,1,51121,21312318,-0.014123,information retrieval,information retrieval,Recognition memory,"[2.0, 0.3847505450248718, 2.0, 3203.0, 4.33027...",4
...,...,...,...,...,...,...,...,...,...
88,1,10609,38167907,-0.168926,information retrieval,information retrieval,Lego Marvel Super Heroes (video games),"[2.0, 0.10809655487537384, 6.0, 1044.0, 5.3187...",95
89,1,94077,4700242,-0.168926,information retrieval,information retrieval,Mars sample-return mission,"[2.0, 0.32669755816459656, 4.0, 1051.0, 1.9449...",96
91,1,69739,14559427,-0.168926,information retrieval,information retrieval,File URI scheme,"[2.0, 0.14401458203792572, 3.0, 478.0, 1.74983...",97
93,1,187965,898293,-0.168926,information retrieval,information retrieval,Force play,"[2.0, -0.10385392606258392, 2.0, 493.0, 2.5194...",98


#### Bi-Encoder

In [52]:
bimodel = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L12-cos-v5')

In [55]:
def _biencoder_apply(df : pd.DataFrame):
    query_embs = bimodel.encode(df['query'].values)
    # print(df['docid'].values)
    doc_embs = bimodel.encode(df['query'].values)
    # doc_embs = bimodel.encode(df['text'].values)
    scores = util.dot_score(query_embs, doc_embs)
    # print(query_embs.shape, doc_embs.shape, scores.shape)
    return scores[0]

bi_encT = pt.apply.doc_score(_biencoder_apply, batch_size=128)

In [None]:
pt.Experiment(
    [ bm25, query_toks >>  pt.BatchRetrieve(index) >> bi_encT],
    train_topics,
    train_qrels,
    ["map", "ndcg_cut_10"],
    names=["BM25", "BM25 >> BiEncoder"]
)

In [None]:
pipeline = query_toks >> pt.BatchRetrieve(index) >> bi_encT
pipeline.search("information retrieval")

pt.apply: 100%|██████████| 1/1 [00:00<00:00, 28.77row/s]


Unnamed: 0,qid,docid,docno,score,query_0,query,rank
0,1,8,54301172,1.0,information retrieval,information retrieval,0
1,1,3,52231341,1.0,information retrieval,information retrieval,1
