In [None]:
import json
import os

from opensearchpy import OpenSearch, helpers
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ----------------------------
# CONFIG
# ----------------------------
# Every setting may be overridden through an environment variable; the
# fallback is the value this notebook was originally written with.
OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST", "localhost")
# Environment values are strings, so the port is converted back to int.
OPENSEARCH_PORT = int(os.environ.get("OPENSEARCH_PORT", "9200"))
INDEX_NAME = os.environ.get("OPENSEARCH_INDEX", "movies_vector")
# File read line by line in generate_actions(); the default is a
# machine-specific absolute Windows path, hence the override.
MOVIES_FILE = os.environ.get(
    "MOVIES_FILE",
    "E:\\Projects_practice\\movies_dataset_json_search\\Movies-dataset.json",
)

# ----------------------------
# OpenSearch client
# ----------------------------
# Single-node connection built from OPENSEARCH_HOST / OPENSEARCH_PORT.
# SSL and certificate verification are both switched off here.
client = OpenSearch(
    hosts=[{"host": OPENSEARCH_HOST, "port": OPENSEARCH_PORT}],
    use_ssl=False,
    verify_certs=False,
    http_compress=True,
)

# ----------------------------
# Embedding model
# ----------------------------
# Announce the load first so the cell shows activity while the model
# object is being constructed.
print("Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# ----------------------------
# Generator for bulk indexing
# ----------------------------
def generate_actions(movies_file=None, encoder=None, index_name=None):
    """Yield one bulk-index action per document in a JSON-lines file.

    Every non-blank line of the file is parsed as a JSON object. The
    object's "plot" and "title" values are embedded and the resulting
    vectors are stored back on the document as "plot_vector" and
    "title_vector" before the action is yielded.

    Args:
        movies_file: Path of the file to read. Defaults to the module-level
            ``MOVIES_FILE``.
        encoder: Object exposing ``encode(text)`` whose result has a
            ``tolist()`` method. Defaults to the module-level ``model``.
        index_name: Value placed in each action's "_index". Defaults to the
            module-level ``INDEX_NAME``.

    Yields:
        dict: ``{"_index": ..., "_id": ..., "_source": ...}`` where "_id" is
        the document's "id" value (``None`` when the key is absent) and
        "_source" is the document including the two vector fields.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Resolve defaults at call time so the function can be defined before
    # (and tested without) the module-level globals.
    if movies_file is None:
        movies_file = MOVIES_FILE
    if encoder is None:
        encoder = model
    if index_name is None:
        index_name = INDEX_NAME

    with open(movies_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue

            doc = json.loads(line)

            # `or ""` covers both a missing key and an explicit JSON null;
            # dict.get's default alone only covers the missing key.
            plot_text = doc.get("plot") or ""
            title_text = doc.get("title") or ""

            doc["plot_vector"] = encoder.encode(plot_text).tolist()
            doc["title_vector"] = encoder.encode(title_text).tolist()

            yield {
                "_index": index_name,
                "_id": doc.get("id"),
                "_source": doc
            }

# ----------------------------
# Bulk index
# ----------------------------
print("Indexing documents...")
# tqdm wraps the action generator so progress is visible while documents
# are being consumed by the bulk helper.
# With its default settings helpers.bulk returns (success_count, errors);
# only the count is reported here.
indexed_count, _ = helpers.bulk(
    client,
    tqdm(generate_actions(), desc="Indexing", unit="doc"),
    chunk_size=100,
    request_timeout=120,
)

print(f"Indexed {indexed_count} documents")
# The original message was mis-encoded; this is the intended check mark.
print("✅ Indexing complete")


^C
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/site-packages/pip/__main__.py", line 22, in <module>
Note: you may need to restart the kernel to use updated packages.


In [None]:
# query_id -> query text, for every query the dataset yields.
queries = dict((q.query_id, q.text) for q in dataset.queries_iter())

# query_id -> {doc_id: relevance}, grouped per query.
qrels = {}
for qrel in dataset.qrels_iter():
    if qrel.query_id not in qrels:
        qrels[qrel.query_id] = {}
    qrels[qrel.query_id][qrel.doc_id] = qrel.relevance

# The second figure counts queries that have at least one judgment,
# not individual judgments.
print("queries:", len(queries))
print("qrels:", len(qrels))