In [1]:
import json
import time
import random
import ir_datasets

In [2]:
# Load the "beir/msmarco/test" dataset handle from ir_datasets; later cells
# iterate its qrels, docs and queries and fetch documents through its docs_store.
dataset = ir_datasets.load("beir/msmarco/test")

In [10]:
# Build a document-id subset: every judged document (one entry per qrel) plus
# an equally sized random draw of documents that have no relevance judgment.
print("Indexing query relevance entries.")
qrels = [qrel.doc_id for qrel in dataset.qrels_iter()]
print("Indexing documents.")
all_docs = [doc.doc_id for doc in dataset.docs_iter()]
print("Creating the subset.")
non_qrels = list(set(all_docs) - set(qrels))
# random.sample draws without replacement in one pass. The previous
# randint(0, len(non_qrels)) + pop(idx) loop could pick idx == len(non_qrels)
# (randint is inclusive on both ends) and raise IndexError, and each pop from
# the middle of the list was O(n).
# Cap the draw so a pool smaller than the judged set cannot raise ValueError.
n_negatives = min(len(qrels), len(non_qrels))
sampled_negatives = random.sample(non_qrels, n_negatives)
subset = qrels + sampled_negatives
# Keep the earlier post-condition: non_qrels no longer contains the sampled ids.
_sampled_set = set(sampled_negatives)
non_qrels = [doc_id for doc_id in non_qrels if doc_id not in _sampled_set]
docs_store = dataset.docs_store()

Indexing query relevance entries.
Indexing documents.


In [4]:
# Resolve every id in `subset` to its full document record via the docs store.
# Note: this rebinds `all_docs` from a list of ids to a list of document objects.
all_docs = [docs_store.get(subset_id) for subset_id in subset]

In [13]:
# Print the doc_id of every document judged relevant (relevance == 1) for
# query "19335".
# Iterate the dataset's qrels directly: the notebook-level `qrels` list holds
# bare doc_id strings, which have no `query_id` / `relevance` attributes, so
# looping over it only worked with stale kernel state.
for qrel in dataset.qrels_iter():
    if qrel.query_id == "19335" and qrel.relevance == 1:
        print(qrel.doc_id)

1720389
1720395
2046505
3045565
3045567
527690
527692
527697
6452949
7122355
7320614
819168
8412685


In [4]:
# Documents to embed: the first 200 records of `all_docs` and their raw text.
# These were commented out, but the embedding cell reads both `instance` and
# `instance_text`, so they must be defined for a fresh Restart & Run All.
instance = all_docs[:200]
instance_text = [doc.text for doc in instance]

# Materialise all queries and use the text of the first one as the search query.
queries = list(dataset.queries_iter())
query_text = queries[0].text

In [12]:
# Display the first query record; its `.text` is what `query_text` holds.
queries[0]

GenericQuery(query_id='19335', text='anthropological definition of environment')

In [6]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Rank the documents in `instance` against `query_text` by cosine similarity of
# their 'thenlper/gte-base' sentence embeddings, and report how long each stage
# takes.
model = SentenceTransformer('thenlper/gte-base')

# perf_counter() is the clock intended for measuring short intervals; time.time()
# follows the wall clock and can jump if the system time is adjusted.
tic = time.perf_counter()
embeddings = model.encode(instance_text)
query_embedding = model.encode(query_text)
toc = time.perf_counter()

result = cos_sim(query_embedding, embeddings)
toc_cos = time.perf_counter()

# Pair each document id with its score, then order by descending similarity.
# flatten() turns the similarity tensor into a flat list of floats
# (presumably one score per document — confirm against cos_sim's output shape).
scores = result.flatten().tolist()
result_dict = {doc.doc_id: score for doc, score in zip(instance, scores)}
results = dict(sorted(result_dict.items(), key=lambda item: item[1], reverse=True))
toc_sort = time.perf_counter()

print("Time elapsed:")
print(f" - Embedding: {toc-tic: .3f} s")
print(f" - Cosine: {toc_cos-toc: .3f} s")
print(f" - Sort: {toc_sort-toc_cos: .3f} s")
print("=" * 120)
print(json.dumps(results, indent=2))


  from tqdm.autonotebook import tqdm, trange
2024-06-03 21:25:22.269480: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Time elapsed:
 - Embedding:  1.595 s
 - Cosine:  0.001 s
 - Sort:  0.000 s
{
  "8412682": 0.911795437335968,
  "8412681": 0.9072930812835693,
  "8412683": 0.9023938179016113,
  "527692": 0.8955329656600952,
  "584350": 0.8939870595932007,
  "2304005": 0.8935648202896118,
  "1720395": 0.8934893012046814,
  "1720389": 0.8924204111099243,
  "1720388": 0.8910499811172485,
  "2874503": 0.8907690048217773,
  "527697": 0.8906776905059814,
  "8412684": 0.8888145685195923,
  "2304004": 0.8873568773269653,
  "7122355": 0.886551022529602,
  "8635981": 0.8859370350837708,
  "1720387": 0.8857359290122986,
  "3559596": 0.885237455368042,
  "3175484": 0.8845697045326233,
  "527695": 0.8839197158813477,
  "772234": 0.883503794670105,
  "527690": 0.8833393454551697,
  "5508122": 0.8832810521125793,
  "5652228": 0.8824508190155029,
  "6452949": 0.8819512724876404,
  "3045567": 0.8813877105712891,
  "3045565": 0.8807436227798462,
  "342431": 0.8801847100257874,
  "5078863": 0.87717604637146,
  "4480942":

In [7]:
# Display the query string that was embedded and compared against the documents.
query_text

'anthropological definition of environment'

In [8]:
# Show the query next to the text of document "8412682", the highest-scoring
# entry in the recorded similarity ranking.
print("QUERY:", query_text)
top_doc = docs_store.get("8412682")
print("DOC:", top_doc.text)

QUERY: anthropological definition of environment
DOC: Environmental anthropology is a sub-specialty within the field of anthropology that takes an active role in examining the relationships between humans and their environment across space and time.pplied anthropology utilizes these understandings to work with people on a local basis as well as trying to satisfy share holders working to gain a resolution for problems related to health, education, social welfare, development and environmental protection.


In [93]:
# Map each Whoosh document number to the dataset doc_id kept in the stored
# "doc_id" field (the text after the last "_" — presumably the stored value
# carries a prefix; confirm against the indexer).
# iter_docs() yields the real (docnum, stored_fields) pairs. Counting with
# enumerate() assumed the position in the iteration equals the docnum, which
# does not hold once documents have been deleted from the index.
# NOTE: relies on `s`, the Whoosh searcher created in the TF-IDF search cell.
doc_id_to_whoosh = {
    docnum: stored["doc_id"].split("_")[-1]
    for docnum, stored in s.reader().iter_docs()
}

In [2]:
from irise import INDEX_DIR
from irise.indexer import Indexer

In [3]:
# Instantiate the project Indexer on the "irise_index_advanced" directory under
# INDEX_DIR (presumably opens an existing index rather than building one — confirm).
indexer = Indexer(path=INDEX_DIR / "irise_index_advanced")

In [4]:
# Run an unbounded (limit=None), scored search for "field relation health".
# NOTE(review): the recorded output of this cell is
# "TypeError: Searcher.collector() got an unexpected keyword argument 'group'",
# raised from inside the call; the cause lies in irise.indexer, which is not
# visible from this notebook.
initial_results = indexer.search("field relation health", limit=None, scored=True)

TypeError: Searcher.collector() got an unexpected keyword argument 'group'

In [97]:
# Display the `docset` attribute of the search results (recorded value: {12792}).
initial_results.docset

{12792}

In [89]:
# Translate docset number 12792 into the dataset doc_id using the mapping built
# from the searcher's stored "doc_id" fields.
doc_id_to_whoosh[12792]

'8412682'

In [98]:
from whoosh.scoring import BM25F, TF_IDF, TF_IDFScorer
from whoosh.qparser import QueryParser, OrGroup

In [120]:
# TF-IDF keyword search over the "text" field for the same query string used in
# the embedding experiment.
query = "anthropological definition of environment"
# OrGroup joins the parsed terms with OR, as the recorded Or([...]) query shows.
qp = QueryParser("text", schema=indexer.schema, group=OrGroup)
q = qp.parse(query)
# `s` is also read by the cell that builds `doc_id_to_whoosh`.
s = indexer._get_searcher(weighting=TF_IDF())
# Keep the top 20 hits; `r` is consumed by the cell that prints the hit texts.
r = s.search(q, limit=20)
print(r)

<Top 20 Results for Or([Term('text', 'anthropolog'), Term('text', 'definit'), Term('text', 'environ')]) runtime=0.002961660000437405>


In [130]:
# Print the full text of the ten best hits, each followed by a 120-character rule.
separator = "=" * 120
for hit in r[:10]:
    # The dataset doc_id is the part of the stored "doc_id" after the last "_".
    stored_key = hit["doc_id"]
    doc_id = stored_key.split("_")[-1]
    text = docs_store.get(doc_id).text
    print(text)
    print(separator)

Applied anthropology, also known as âpracticing anthropology,â is defined as the practical application of anthropological method and theory to the needs of society.It is, quite simply, anthropology put to good use.This fifth field of anthropology has increased dramatically in the past three decades. Today, most anthropology graduates pursue applied rather than academic careers.t is, quite simply, anthropology put to good use. This fifth field of anthropology has increased dramatically in the past three decades. Today, most anthropology graduates pursue applied rather than academic careers.
TECHNOLOGICAL ENVIRONMENT, DEMOGRAPHIC ENVIRONMENT, ENVIRONMENT, EXTERNAL ENVIRONMENT, OCCUPATIONAL ENVIRONMENT, MARKET CONTROLLED PRICE ENVIRONMENT, OLIGOPOLISTIC ENVIRONMENT, DESIGN FOR ENVIRONMENT, NATURAL ENVIRONMENT, WORK ENVIRONMENT.
Definition 1. A point is that which has no part. Definition 2. A line is breadthless length. Definition 3. The ends of a line are points. Definition 4. A strai

In [44]:
# Look up dataset document "12792" and show its text.
# NOTE(review): 12792 is the number shown in `initial_results.docset`, and the
# `doc_id_to_whoosh` lookup maps it to dataset doc_id '8412682'; fetching
# "12792" straight from the docs store returns a different document.
docs_store.get("12792").text

"Qualcomm's wholly owned subsidiary, Qualcomm Technologies, Inc. (QTI), operates substantially all of Qualcomm's R&D activities, as well as its product and services businesses, including its semiconductor business, Qualcomm CDMA Technologies."