In [2]:
import tantivy
import json

In [2]:
def read_queries(path="data/qrels.train.with_queries.tsv"):
    """Yield ``(doc_id, query)`` pairs from a tab-separated qrels file.

    The first line is treated as a header and skipped. Every remaining
    non-blank line must hold exactly three tab-separated fields: the first is
    ignored, the second is converted to ``int`` and the third is the query
    text.

    Args:
        path: Location of the TSV file. Defaults to the path this notebook
            has always read, so existing ``read_queries()`` calls still work.

    Yields:
        tuple[int, str]: The document id and the query text of each row.

    Raises:
        ValueError: If a row does not have exactly three fields or its
            second field is not an integer.
    """
    with open(path, "r", encoding="utf-8") as file:
        # A bare next(file) raises StopIteration on an empty file, which
        # Python converts into RuntimeError inside a generator; supplying a
        # default makes an empty file simply yield nothing.
        next(file, None)
        for line in file:
            row = line.strip()
            if not row:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            _, doc_id, query = row.split("\t")
            yield int(doc_id), query

In [11]:
# Build the query lookup in a single pass over the JSONL file: every record is
# stored under its "_id" as a copy extended with an empty "doc_ids" list.
with open("data/quora/queries.jsonl", "r") as file:
    queries = {
        record["_id"]: {**record, "doc_ids": []}
        for record in map(json.loads, file)
    }

In [12]:
# Attach the judged document ids from the qrels file to each query.
# The ids are first collected into a fresh mapping and then assigned, instead
# of being appended in place, so re-running this cell cannot duplicate entries.
judged_doc_ids = {}
with open("data/quora/qrels/test.tsv", "r", encoding="utf-8") as file:
    next(file)  # skip the header row
    for line in file:
        # Columns: query id, document id, and a third field that is ignored.
        query_id, doc_id, _ = line.strip().split("\t")
        judged_doc_ids.setdefault(query_id, []).append(doc_id)

for query_id, doc_ids in judged_doc_ids.items():
    # A query id that is missing from `queries` still raises KeyError.
    queries[query_id]["doc_ids"] = doc_ids

In [3]:
# Declare the schema used for BM25 search: a stored text field "body" and a
# stored, indexed integer field "doc_id".
schema_builder = tantivy.SchemaBuilder()
schema_builder.add_text_field("body", stored=True)
schema_builder.add_integer_field("doc_id", stored=True, indexed=True)
schema = schema_builder.build()
# Bind the index to the directory given by `path`.
# NOTE(review): presumably that directory already holds an index built with
# this same schema — confirm before changing any field definition above.
index = tantivy.Index(schema, path="data/bm25.tantivy/")


In [4]:
# Module-level searcher; search_bm25 uses it to run queries and load documents.
searcher = index.searcher()

In [16]:
import re
def sanitize_query_for_tantivy(query):
    """Replace query-syntax characters in ``query`` with spaces.

    Every occurrence of one of ``( ) [ ] { } ^ " ~ * ? :`` becomes a single
    space; all other characters are returned unchanged.

    NOTE(review): any other characters the query parser may treat specially
    (for example a leading ``+`` or ``-``) pass through untouched — confirm
    that this is intended.
    """
    special_to_space = str.maketrans({symbol: ' ' for symbol in '()[]{}^"~*?:'})
    return query.translate(special_to_space)


def search_bm25(query, limit):
    """Search the ``body`` field for ``query`` and return the hit documents.

    The raw query text is sanitized, parsed with ``index.parse_query`` against
    the ``body`` field, and passed to ``searcher.search`` together with
    ``limit``. The document behind each hit is loaded through ``searcher.doc``
    and returned in hit order; the scores are discarded.
    """
    cleaned = sanitize_query_for_tantivy(query)
    parsed = index.parse_query(cleaned, ['body'])
    docs = []
    for _score, address in searcher.search(parsed, limit).hits:
        docs.append(searcher.doc(address))
    return docs

In [20]:
# Measure recall@limit: a query counts as a hit when its judged doc_id appears
# among the doc_ids of the documents returned by search_bm25.
n = 0     # number of queries actually evaluated
hits = 0  # number of evaluated queries whose judged document was returned
limit = 100
number_of_queries = 100

for doc_id, query in read_queries():
    # Check the cap before counting so that `n` never includes a query that
    # was read but not evaluated; counting first made the summary below
    # report (and divide by) one query too many.
    if n >= number_of_queries:
        break
    n += 1
    print(f"Processing query: {query}")
    result = search_bm25(query, limit)
    # Each hit's "doc_id" entry is indexed with [0] to get the id itself.
    found_ids = {hit["doc_id"][0] for hit in result}

    if doc_id in found_ids:
        hits += 1

# Guard against an empty query file, which would otherwise divide by zero.
recall = hits / n if n else 0.0
print(f"Recall @ {limit}: {hits} out of {n} = {recall}")


Processing query: )what was the immediate impact of the success of the manhattan project?
Processing query: _________ justice is designed to repair the harm to victim, the community and the offender caused by the offender criminal act. question 19 options:
Processing query: what color is amber urine
Processing query: is autoimmune hepatitis a bile acid synthesis disorder
Processing query: elegxo meaning
Processing query: how much does an average person make for tutoring
Processing query: can you use a calculator on the compass test
Processing query: what does physical medicine do
Processing query: what does pending mean on listing
Processing query: feeding rice cereal how many times per day
Processing query: most dependable affordable cars
Processing query: lithophile definition
Processing query: what is a flail chest
Processing query: what causes ear infection in adults
Processing query: put yourself on child support in texas
Processing query: mushrooms health benefits
Processing quer