In [None]:
#Set up connection
from opensearchpy import OpenSearch

# Where the OpenSearch node lives and which index this notebook works on.
OPENSEARCH_HOST, OPENSEARCH_PORT = "opensearch-node", 9200
INDEX_NAME = "ir-dataset"

# ----------------------------
# OpenSearch client
# ----------------------------
# Plain-HTTP connection (no TLS, no certificate verification) with
# compressed request bodies.
client = OpenSearch(
    verify_certs=False,
    use_ssl=False,
    http_compress=True,
    hosts=[dict(host=OPENSEARCH_HOST, port=OPENSEARCH_PORT)],
)



In [None]:
##create an index


# Index definition: a single primary shard without replicas, an exact-match
# ``doc_id`` keyword field and a full-text ``text`` field using the standard
# analyzer.
index_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "doc_id": {"type": "keyword"},
            "text": {
                "type": "text",
                "analyzer": "standard"
            }
        }
    }
}

# Connectivity check: raises right away if the node cannot be reached.
client.info()

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run. Only create the index when it is missing.
if client.indices.exists(index=INDEX_NAME):
    print(f"Index '{INDEX_NAME}' already exists - skipping creation")
else:
    client.indices.create(index=INDEX_NAME, body=index_body)
    print(f"Index '{INDEX_NAME}' created")



In [None]:
## Index a single document to try 

import ir_datasets
# Load the corpus handle and take its first passage as a smoke-test document.
dataset = ir_datasets.load("msmarco-passage/dev/small")
doc = next(dataset.docs_iter())
print(doc)

# Index that one passage under its own doc_id. ``refresh=True`` is requested
# so the document can be searched immediately afterwards.
client.index(
    body={"doc_id": doc.doc_id, "text": doc.text},
    id=doc.doc_id,
    index=INDEX_NAME,
    refresh=True,
)



In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the index count has to be printed explicitly to show up here.
print("Docs in index:", client.count(index=INDEX_NAME)["count"])

# Size of each part of the dataset as reported by ir_datasets.
print("Qrels:", dataset.qrels_count())
print("Docs:", dataset.docs_count())
print("Queries:", dataset.queries_count())

In [None]:
import time

import ir_datasets
from opensearchpy import OpenSearch, helpers
from opensearchpy.helpers import bulk
from tqdm import tqdm

# Tunables for the bulk ingestion below: how many documents go into one bulk
# request, and the maximum number of documents to send overall.
BATCH_SIZE = 1000   # safe value for laptops
MAX_DOCS = 10_000_000

# Corpus handle used by the indexing helpers defined in this cell.
dataset = ir_datasets.load("msmarco-passage/dev/small")



def index_docs():
    """Index every document of ``dataset`` with one request per document.

    Each document is stored in ``INDEX_NAME`` under its own ``doc_id``. The
    index is refreshed once after the loop instead of on every request:
    a forced refresh per document is expensive, and a single refresh at the
    end still leaves all documents searchable when this function returns.
    """
    for doc in dataset.docs_iter():
        client.index(
            index=INDEX_NAME,
            id=doc.doc_id,
            body={
                "doc_id": doc.doc_id,
                "text": doc.text
            },
        )

    # Make everything written above visible to search in one go.
    client.indices.refresh(index=INDEX_NAME)

def index_docs_bulk():
    """Yield one bulk-index action per document of ``dataset``.

    Generator intended to be fed to the opensearch-py bulk helpers. At most
    ``MAX_DOCS`` actions are produced.

    Yields:
        dict: an action with ``_index`` set to ``INDEX_NAME``, ``_id`` set to
        the document's ``doc_id`` and a ``_source`` holding ``doc_id`` and
        ``text``.
    """
    for i, doc in enumerate(dataset.docs_iter()):
        # Stop once the configured cap has been reached.
        if i >= MAX_DOCS:
            break
        yield {
            "_index": INDEX_NAME,
            "_id": doc.doc_id,
            "_source": {
                "doc_id": doc.doc_id,
                "text": doc.text
            }
        }

# -----------------------------
# BULK INDEX WITH PROGRESS
# -----------------------------
# The corpus can hold fewer than MAX_DOCS documents; size the progress bar by
# what will actually be sent so it can reach 100%.
corpus_size = dataset.docs_count()
total_docs = min(MAX_DOCS, corpus_size) if corpus_size else MAX_DOCS

start_time = time.time()
doc_count = 0
error_count = 0

with tqdm(total=total_docs, desc="Indexing documents") as pbar:
    # raise_on_error=False: by default the helper raises on the first failed
    # document, so the error branch below would never run. With it disabled,
    # failures are yielded as (False, info) and can be counted.
    for success, info in helpers.streaming_bulk(
        client,
        index_docs_bulk(),
        chunk_size=BATCH_SIZE,
        request_timeout=120,
        raise_on_error=False,
    ):
        if success:
            doc_count += 1
        else:
            error_count += 1
        pbar.update(1)

end_time = time.time()

# -----------------------------
# FINALIZE
# -----------------------------
# Set the refresh interval to 1s and force a refresh so the count below
# reflects everything that was just written.
client.indices.put_settings(
    index=INDEX_NAME,
    body={"index": {"refresh_interval": "1s"}}
)
client.indices.refresh(index=INDEX_NAME)

# -----------------------------
# STATS
# -----------------------------
elapsed = end_time - start_time
# Guard against a zero-length interval (e.g. nothing to index).
rate = doc_count / elapsed if elapsed > 0 else 0.0

count_in_index = client.count(index=INDEX_NAME)["count"]

print("\n====== INGESTION COMPLETE ======")
print(f"Documents indexed: {doc_count}")
print(f"Errors: {error_count}")
print(f"Elapsed time: {elapsed:.2f} seconds")
print(f"Indexing rate: {rate:.2f} docs/sec")
print(f"Docs in index: {count_in_index}")

# NOTE: the client is intentionally left open here. Later cells still call
# `client` (bulk indexing, count), which would fail on a closed transport.

In [None]:
# Iterator over the relevance judgements of the dataset.
qrels = dataset.qrels_iter()

# qrels_count is a method: without the call parentheses the cell displays the
# bound method object instead of the number of judgements.
dataset.qrels_count()

#print(next(qrels))

In [None]:
## bulk index all the documents in the dataset 

import json
from opensearchpy import OpenSearch, helpers

# ----------------------------
# Bulk index
# ----------------------------
# Documents are sent with their doc_id as _id, so running this after an
# earlier ingestion overwrites the same documents instead of duplicating them.
print("Indexing documents...")
indexed_ok, _ = helpers.bulk(
    client,
    index_docs_bulk(),
    chunk_size=100,
    request_timeout=120
)

# The check-mark emoji had been stored as mis-decoded bytes; restored here.
print("✅ Indexing complete")
print(f"Documents indexed: {indexed_ok}")


In [None]:
# Ask OpenSearch how many documents INDEX_NAME holds; as the cell's last
# expression, the response is displayed as the cell output.
client.count(index=INDEX_NAME)


In [None]:
import ranx
from ranx import Qrels, Run, evaluate

# ranx's Qrels.from_ir_datasets expects the ir_datasets identifier string,
# not an already-loaded dataset object.
qrels = Qrels.from_ir_datasets("msmarco-passage/dev/small")

# Build the run from real search results so that its query and document ids
# line up with the qrels (a hand-written placeholder run shares no ids with
# them and cannot be evaluated meaningfully).
TOP_K = 10  # matches the @10 cut-off of the metrics below
run_dict = {}
for query in dataset.queries_iter():
    response = client.search(
        index=INDEX_NAME,
        body={
            "size": TOP_K,
            "_source": False,  # only ids and scores are needed
            "query": {"match": {"text": query.text}},
        },
    )
    hits = response["hits"]["hits"]
    # Queries without any hit are left out here; make_comparable below adds
    # them back as empty result lists.
    if hits:
        run_dict[query.query_id] = {hit["_id"]: hit["_score"] for hit in hits}

if not run_dict:
    raise RuntimeError(
        f"No search hits for any query - is index '{INDEX_NAME}' populated?"
    )

run = Run.from_dict(run_dict)

# make_comparable=True aligns the query ids of the run with those of the qrels.
metrics = evaluate(
    qrels, run, ["precision@10", "recall@10"], make_comparable=True
)
print(metrics)
