In [1]:
import pandas as pd
import jsonlines
import pyterrier as pt
# Initialise PyTerrier only when it has not been started yet in this kernel,
# so re-running the cell does not call pt.init() a second time.
if not pt.started():
    pt.init(mem=10000)
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
%env TERRIER_HEAP_MEM=10000


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


# Indexing

### PyTerrier

In [None]:
# Build the PyTerrier index in ./index_body2 from the JSON-lines corpus.
# Every corpus line is read for "id", "title", "text" and "keywords"; two
# derived fields are added next to the original keys:
#   kw   - the joined keywords followed by the title
#   body - title, text and joined keywords concatenated
with jsonlines.open("data/corpus.jsonl", "r") as reader:
    corpus_docs = [
        {
            "docno": line["id"],
            "kw": " ".join(line["keywords"]) + " " + line["title"],
            "body": line["title"] + " " + line["text"] + " " + " ".join(line["keywords"]),
        }
        | line  # on a key clash the original corpus value wins
        for line in reader
    ]

index_meta = ["docno", "text", "title", "keywords", "kw", "body"]
# One length per metadata key, in the same order. The list used to hold only
# five values for six keys, leaving "body" without a length.
index_meta_lengths = [20, 10000, 10000, 10000, 10000, 10000]
assert len(index_meta) == len(index_meta_lengths), "every meta key needs a length"

iter_indexer = pt.IterDictIndexer(
    "./index_body2",
    fields=["text", "title", "keywords", "kw", "body"],
    meta=index_meta,
    meta_lengths=index_meta_lengths,
    blocks=True,
)
iter_indexer.index(corpus_docs)


In [None]:
# Open the index that the PyTerrier indexing cell writes ("./index_body2").
# Later cells read the "kw" and "body" metadata, which only that index is
# built with in this notebook; "./index" is never created here.
indexref = pt.IndexRef.of("./index_body2")

### Pyserini

In [None]:
import json
import os

# open() does not create missing directories, so make sure the collection
# directory handed to the Pyserini indexer (--input c) exists first.
os.makedirs("c", exist_ok=True)

# One record per corpus line: the id plus title, text and joined keywords
# separated by newlines in a single "contents" string.
with jsonlines.open("data/corpus.jsonl", "r") as reader:
    out = [
        {
            "id": line["id"],
            "contents": line["title"] + "\n" + line["text"] + "\n" + " ".join(line["keywords"]),
        }
        for line in reader
    ]

# The corpus reader is already closed here; only the output file is open.
with open("c/corpus.json", "w") as f:
    json.dump(out, f)

# Index the JSON collection in ./c into indexes/sparse with a single thread,
# storing positions, document vectors and the raw documents.
!python -m pyserini.index.lucene --collection JsonCollection --input c --index indexes/sparse --generator DefaultLuceneDocumentGenerator --threads 1 --storePositions --storeDocvectors --storeRaw

# BM25

In [None]:
!python -m pyserini.search.lucene --index indexes/sparse/ --topics test_queries.tsv --output runs/run.train.bm25.trec --output-format trec --hits 1000 --bm25 --k1 0.82 --b 0.68
!python process_results.py run/run.train.bm25.trec > out

# LambdaMART

In [None]:
def _read_queries(path):
    """Read a query CSV and rename its columns to `qid` / `query`.

    Both columns are read as strings.
    """
    return pd.read_csv(path, dtype={"QueryId": str, "Query": str}).rename(
        columns={"QueryId": "qid", "Query": "query"}
    )


def _clean_query_text(queries):
    """Return a Series of query strings with possessives and punctuation removed.

    The first two replacements are literal; the last one is a regular
    expression. `regex` is passed explicitly because the pandas default for
    `Series.str.replace` differs between pandas versions, and a literal match
    of the pattern would leave punctuation in place.
    """
    return (
        queries.str.replace("'s", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.replace(r"[^\w\s]", "", regex=True)
    )


q = _read_queries("data/train_queries.csv")
q_test = _read_queries("data/test_queries.csv")
qrel = pd.read_csv(
    "data/train_qrels.csv",
    dtype={"QueryId": str, "EntityId": str, "Relevance": int},
).rename(columns={"QueryId": "qid", "EntityId": "docno", "Relevance": "label"})

# Train and test queries go through exactly the same normalisation.
q["query"] = _clean_query_text(q["query"])
q_test["query"] = _clean_query_text(q_test["query"])

In [None]:
# Retrievers over the same index, one per weighting model.
bm25_controls = {"c": 0.4, "bm25.k_1": 0.9, "bm25.k_3": 0.5}
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25", controls=bm25_controls)
tf, pl2, cm = (
    pt.BatchRetrieve(indexref, wmodel=model_name)
    for model_name in ("Tf", "PL2", "CoordinateMatch")
)

In [None]:
# 0.39578
# NOTE(review): presumably the score obtained with this configuration; the
# metric and run it refers to are not recorded here.

sdm = pt.rewrite.SDM()
# RM3 query-expansion pipeline; it is not part of the feature pipeline below.
rm3_pipe = bm25 >> pt.rewrite.RM3(indexref) >> bm25

# BM25 candidates, enriched with stored text, then a union of features
# combined with `**`:
#   the incoming score, SDM-rewritten BM25, CoordinateMatch, BM25 on the
#   title, BM25 on "kw", and CoordinateMatch on the title.
pipeline = (
    bm25
    >> pt.text.get_text(indexref, ["title", "text", "kw"])
    >> (
        pt.transformer.IdentityTransformer()
        ** (sdm >> bm25)
        ** cm
        ** (pt.text.scorer(body_attr="title", wmodel="BM25", background_index=indexref))
        ** (pt.text.scorer(body_attr="kw", wmodel="BM25", background_index=indexref))
        ** (pt.text.scorer(body_attr="title", wmodel="CoordinateMatch", background_index=indexref))
    )
)

# LambdaMART ranker (LightGBM, lambdarank objective, NDCG evaluated at 100).
lmart = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[100],
    ndcg_at=[100],
    eval_at=[100],
    learning_rate=0.1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5,
    n_jobs=16
)

# Fixed seed: without it the 80/20 query split changes on every run and the
# trained model cannot be reproduced.
train, val = train_test_split(q, test_size=0.2, random_state=42)
lmart_pipe = pipeline >> pt.ltr.apply_learned_model(lmart, form="ltr")
lmart_pipe.fit(train, qrel, val, qrel)


In [None]:
empty = pd.DataFrame(columns=['qid', 'docno', 'label'])
pt.Experiment([lmart_pipe % 100], q_test, empty, ["ndcg_cut_100"], names=["LTR"], save_dir="results/", filter_by_topics=False, filter_by_qrels=False, save_mode="overwrite")
!python process_results.py results/LTR.res.gz

# BERT

In [None]:
from pyterrier_bert.pyt_cedr import CEDRPipeline


# DPH retrieval that also returns the "body" metadata, followed by CEDR.
dph = pt.BatchRetrieve(indexref, controls={"wmodel" : "DPH"}, verbose=True, metadata=["docno", "body"])
cedrpipe = dph >> CEDRPipeline(max_valid_rank=20)

# Fixed seed so the 80/20 train/validation split is identical on every run.
train, val = train_test_split(q, test_size=0.2, random_state=42)

cedrpipe.fit(train, qrel, val, qrel)

In [None]:
empty = pd.DataFrame(columns=['qid', 'docno', 'label'])
pt.Experiment([cedrpipe % 100], q_test, empty, ["map", "ndcg_cut_100"], names=["BERT"], save_dir="results/", filter_by_topics=False, filter_by_qrels=False, save_mode="overwrite")

!python process_results.py results/BERT.res.gz > out


# Pre-trained rankers

All of the Pyserini reproduction guides were tested (with the provided, already-tuned models):
https://github.com/castorini/pyserini (They are not included here because each one is essentially a single, very similar command, usually differing only in the encoder.)

In [None]:
import jsonlines
import onir_pt

# Load the EPIC checkpoint and index the corpus' title and text fields into
# ./epic_cord19; every document gets a "docno" copied from its "id".
indexed_epic = onir_pt.indexed_epic.from_checkpoint(
    'https://macavaney.us/epic.msmarco.tar.gz',
    index_path='./epic_cord19',
)
with jsonlines.open("data/corpus.jsonl", "r") as reader:
    epic_docs = []
    for line in reader:
        epic_docs.append({"docno": line["id"]} | line)
    indexed_epic.index(epic_docs, fields=('title', 'text'))

# BERT + LTR

In [None]:
# Reload queries and relevance judgements with `qid` / `query` / `docno` / `label` column names.
q = pd.read_csv("data/train_queries.csv", dtype={"QueryId": str, "Query": str}).rename(columns={"QueryId": "qid", "Query": "query"})
q_test = pd.read_csv("data/test_queries.csv", dtype={"QueryId": str, "Query": str}).rename(columns={"QueryId": "qid", "Query": "query"})
qrel = pd.read_csv("data/train_qrels.csv", dtype={"QueryId": str, "EntityId": str, "Relevance": int}).rename(columns={"QueryId": "qid", "EntityId": "docno", "Relevance": "label"})

# Identical normalisation for train and test queries: drop possessives and
# apostrophes literally, then strip remaining punctuation with a regular
# expression. `regex` is explicit because the pandas default for
# Series.str.replace differs between versions; a literal match of the
# pattern would leave punctuation untouched.
for query_frame in (q, q_test):
    query_frame["query"] = (
        query_frame["query"]
        .str.replace("'s", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.replace(r"[^\w\s]", "", regex=True)
    )
empty = pd.DataFrame(columns=['qid', 'docno', 'label'])

qhr = qrel[qrel["label"] > 1] #  high relevance
qlr = qrel[qrel["label"] == 1] # low relevance


In [None]:
# BERT HIGH REL
# CEDR on top of DPH, trained and validated against the high-relevance judgements only (qhr).

DPH_br = pt.BatchRetrieve(indexref, controls={"wmodel" : "DPH"}, verbose=True, metadata=["docno", "body"])
cedrpipehr = DPH_br >> CEDRPipeline(max_valid_rank=20)
# Fixed seed so the split (also used by the low-relevance cell that follows) is reproducible.
train, val = train_test_split(q, test_size=0.2, random_state=42)

cedrpipehr.fit(train, qhr, val, qhr)

In [None]:
# BERT LOW REL
# Same setup as the high-relevance model, fitted on the low-relevance
# judgements (qlr); `train` / `val` are taken from an earlier cell.

low_rel_meta = ["docno", "body"]
DPH_br = pt.BatchRetrieve(indexref, controls={"wmodel": "DPH"}, verbose=True, metadata=low_rel_meta)
low_rel_cedr = CEDRPipeline(max_valid_rank=20)
cedrpipelr = DPH_br >> low_rel_cedr

cedrpipelr.fit(train, qlr, val, qlr)

In [None]:
# BERT KEYWORDS
# CEDR scoring the "kw" attribute returned by the DPH retriever.

DPH_br = pt.BatchRetrieve(indexref, controls={"wmodel" : "DPH"}, verbose=True, metadata=["docno", "body", "kw"])
cedrpipekw = DPH_br >>  CEDRPipeline(max_valid_rank=20, doc_attr="kw")
# Fixed seed so the split (reused by the title and text cells) is reproducible.
train, val = train_test_split(q, test_size=0.2, random_state=42)

# NOTE(review): this fits on the low-relevance judgements only (qlr), like the
# LOW REL cell - confirm that is intended rather than the full `qrel`.
cedrpipekw.fit(train, qlr, val, qlr)

In [None]:
# BERT TITLES
# CEDR scoring the "title" attribute; `train` / `val` are taken from an earlier cell.

title_meta = ["docno", "body", "title"]
DPH_br = pt.BatchRetrieve(indexref, controls={"wmodel": "DPH"}, verbose=True, metadata=title_meta)
title_cedr = CEDRPipeline(max_valid_rank=20, doc_attr="title")
cedrpipetitle = DPH_br >> title_cedr

# NOTE(review): fitted on the low-relevance judgements only (qlr) - confirm
# that is intended rather than the full `qrel`.
cedrpipetitle.fit(train, qlr, val, qlr)

In [None]:
# BERT TEXT
# CEDR scoring the "text" attribute; `train` / `val` are taken from an earlier cell.

text_meta = ["docno", "body", "text"]
DPH_br = pt.BatchRetrieve(indexref, controls={"wmodel": "DPH"}, verbose=True, metadata=text_meta)
text_cedr = CEDRPipeline(max_valid_rank=20, doc_attr="text")
cedrpipetext = DPH_br >> text_cedr

# NOTE(review): fitted on the low-relevance judgements only (qlr) - confirm
# that is intended rather than the full `qrel`.
cedrpipetext.fit(train, qlr, val, qlr)

In [None]:
# Feature pipeline: DPH candidates scored by the three field-specific CEDR
# pipelines (text, keywords, title), combined with `**`.
pipeline = (
    DPH_br
    >> (
        cedrpipetext
        ** cedrpipekw
        ** cedrpipetitle
    )
)

# LambdaMART ranker (LightGBM, lambdarank objective, NDCG evaluated at 100).
lmart = lgb.LGBMRanker(
    task="train",
    silent=False,
    min_data_in_leaf=1,
    min_sum_hessian_in_leaf=1,
    max_bin=255,
    num_leaves=31,
    objective="lambdarank",
    metric="ndcg",
    ndcg_eval_at=[100],
    ndcg_at=[100],
    eval_at=[100],
    learning_rate=0.1,
    importance_type="gain",
    num_iterations=100,
    early_stopping_rounds=5,
    n_jobs=16
)

# Fixed seed: without it the 80/20 query split changes on every run and the
# trained model cannot be reproduced.
train, val = train_test_split(q, test_size=0.2, random_state=42)
lmart_pipe = pipeline >> pt.ltr.apply_learned_model(lmart, form="ltr")
lmart_pipe.fit(train, qrel, val, qrel)


In [None]:
# Run the BERT + LambdaMART pipeline on the test queries (top 100) with an
# empty qrels frame and save the run under results/.
# NOTE(review): names=["LTR"] with save_mode="overwrite" is also used by the
# other LTR experiment cells in this notebook, so the saved run presumably
# replaces theirs - confirm this is intended.
empty = pd.DataFrame(columns=['qid', 'docno', 'label'])

bert_ltr_top100 = lmart_pipe % 100
pt.Experiment(
    [bert_ltr_top100],
    q_test,
    empty,
    ["ndcg_cut_100"],
    names=["LTR"],
    save_dir="results/",
    filter_by_topics=False,
    filter_by_qrels=False,
    save_mode="overwrite",
)


In [None]:
import fastrank

# BM25 candidates scored by the high- and low-relevance CEDR pipelines; the
# two features are combined by a fastrank coordinate-ascent model.
ca_bm25_controls = {"c": 0.4, "bm25.k_1": 0.9, "bm25.k_3": 0.5}
bm25 = pt.BatchRetrieve(indexref, wmodel="BM25", controls=ca_bm25_controls)
ltr = bm25 >> (cedrpipehr ** cedrpipelr)

train_request = fastrank.TrainRequest.coordinate_ascent()
params = train_request.params
# Switch on both training options of the request.
for option_name in ("init_random", "normalize"):
    setattr(params, option_name, True)

ca_learner = pt.ltr.apply_learned_model(train_request, form='fastrank')
ca_pipe = ltr >> ca_learner

# Trained on all training queries; no validation split is passed here.
ca_pipe.fit(q, qrel)

In [None]:
# Run the coordinate-ascent pipeline on the test queries (top 100) with an
# empty qrels frame and save the run under results/.
# NOTE(review): names=["LTR"] with save_mode="overwrite" is also used by the
# other LTR experiment cells in this notebook, so the saved run presumably
# replaces theirs - confirm this is intended.
empty = pd.DataFrame(columns=['qid', 'docno', 'label'])

ca_top100 = ca_pipe % 100
pt.Experiment(
    [ca_top100],
    q_test,
    empty,
    ["ndcg_cut_100"],
    names=["LTR"],
    save_dir="results/",
    filter_by_topics=False,
    filter_by_qrels=False,
    save_mode="overwrite",
)
