#**Setting up hybrid (Full-text + Vector) search with Elasticsearch**

**Installing Libraries**

In [1]:
!pip install python-docx
!pip install PyMuPDF
!pip install elasticsearch


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3
Collecting elasticsearch
  Downloading elasticsearch-9.0.2-py3-none-any.whl.metadata (8.4 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Downloading elasticsearch-9.0.2-py3-no

**Creating Full-text Index named - 'search-724j' using Elasticsearch**

In [2]:
from elasticsearch import Elasticsearch
try:
  client = Elasticsearch(
  "https://my-elasticsearch-project-e6f5fb.es.us-east-1.aws.elastic.cloud:443",
  api_key="MF94dkVaZ0JjZkJIOGc2OENRTHc6X0dRVlVINTJCRWEzcWZCY0xmTGdCdw=="
)
  client.indices.create(
  index="search-724j",
  mappings={
        "properties": {
            "text": {"type": "text"}
        }
    }
)
except Exception as e:

  print(e)


BadRequestError(400, 'resource_already_exists_exception', 'index [search-724j/RabhzmyxS_WXWVdslEsWsA] already exists')


**Creating Hybrid index using Elasticsearch**

In [3]:
from elasticsearch import Elasticsearch
try:
  client = Elasticsearch(
  "https://my-elasticsearch-project-e6f5fb.es.us-east-1.aws.elastic.cloud:443",
  api_key="MF94dkVaZ0JjZkJIOGc2OENRTHc6X0dRVlVINTJCRWEzcWZCY0xmTGdCdw=="
)
  # Define the hybrid index mapping
  mappings = {
    "mappings": {
        "properties": {
            "text": {
                "type": "text",  # Full-text search field
                "analyzer": "standard"
            },
            "embedding": {
                "type": "dense_vector",  # Vector search field
                "dims": 384,             # Must match dimension of embedding model used
                "index": True,
                "similarity": "cosine"
            }
        }
    }
 }
  # Create the hybrid index
  index_name = "hybrid-index"

# Delete the index first if it already exists (for clean re-run)
  if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# Create the new index
  client.indices.create(index=index_name, body=mappings)

  print(f"Index '{index_name}' created with full-text and vector search mapping.")

except Exception as e:
  print(f"Error creating Hybrid index: {e}")


Index 'hybrid-index' created with full-text and vector search mapping.


**Content text extraction from Input documents**

In [4]:
from docx import Document
import fitz
from bs4 import BeautifulSoup
import os

def extract_text_and_images(file_path):
    images = []
    text = ""
    if file_path.endswith(".txt"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    elif file_path.endswith(".pdf"):
        doc = fitz.open(file_path)
        text = " ".join([page.get_text() for page in doc])
        for page_index in range(len(doc)):
            for img_index, img in enumerate(doc[page_index].get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_name = f"page{page_index+1}_img{img_index+1}.{image_ext}"
                # image_path = f"./Extracted_images/{image_name}"
                image_path = f"./{image_name}"
                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)
                images.append(image_name)
    elif file_path.endswith(".docx"):
        doc = Document(file_path)
        text = " ".join([p.text for p in doc.paragraphs])
    elif file_path.endswith(".html"):
        with open(file_path, "r", encoding="utf-8") as f:
            text = BeautifulSoup(f.read(), "html.parser").get_text()
    return text, images



In [5]:
# !pip install langdetect


**Indexing Full-text with Elasticsearch**

In [6]:
from sentence_transformers import SentenceTransformer
# import faiss
MODEL_NAME = 'all-MiniLM-L6-v2'
embed_model = SentenceTransformer(MODEL_NAME)
cwd = os.getcwd()


for file in os.listdir(cwd):
    if os.path.isfile(file):
      # print(f"📄 file: {file}")

      try:
            text, images = extract_text_and_images(file) # Document extraction

            text_vector_embeddings=embed_model.encode(text, convert_to_numpy=True).tolist() # converting text into vectors i.e muliti dimensional arrays of numbers


            #creating Hybrid indexing
            client.index(
                index="hybrid-index",
                document={
                    "text": text,
                    "embedding": text_vector_embeddings
                }
            )


            print(f"Indexed hybrid document: **{file}**.\n")



      except Exception as e:
            print(f"❌ Error Indexing hybrid for document {file}: {e}")

print(f"All documents (image + text) Index creation done with Elasticserach.\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexed hybrid document: **leave_policies_2025_companyXYZ.pdf**.

Indexed hybrid document: **HR_Onboarding_Guide_2025_CompanyXYZ.docx**.

All documents (image + text) Index creation done with Elasticserach.



**Buidling module for retrieving data on user's input search query**

In [7]:
from elasticsearch import Elasticsearch

MODEL_NAME = 'all-MiniLM-L6-v2'
embed_model = SentenceTransformer(MODEL_NAME)

def search_query(user_input_query):
  client = Elasticsearch(
    "https://my-elasticsearch-project-e6f5fb.es.us-east-1.aws.elastic.cloud:443",
    api_key="MF94dkVaZ0JjZkJIOGc2OENRTHc6X0dRVlVINTJCRWEzcWZCY0xmTGdCdw=="
)

  # converting user's search input query text into vector embeddings using same model used for vectorizing document's content texts
  query_vector = embed_model.encode(user_input_query).tolist()

  search_response = client.search(
        index="hybrid-index",     # make sure this matches your hybrid index
        knn={
            "field": "embedding",
            "query_vector": query_vector,
            "k": 5,
            "num_candidates": 100
        },
        query={
            "match": {
                "text": user_input_query
            }
        }
    )

  results = []
  for hit in search_response["hits"]["hits"]:
        results.append((hit["_source"]["text"], hit["_score"]))

  return results


**Demo - User's search query input & relevant documents contents retrieval using hybrid indexing**


In [17]:
import time


query = input("Type your query here... \n")

results = search_query(query)

print(f"please wait...searching 🔍")
time.sleep(1.2)


if len(results):
  score_set=set()
  print("\nDocument contents found related to your query:\n")
  # Find the highest score from results
  mx = max(score for _, score in results)
  for doc, score in results:
    if score == mx and score not in score_set:
      score_set.add(score)
      print(f"🌟 (Top Recommendation - Score: {score}) : {doc}\n")

        # else:
        #     print(f"(Recommendation Score: {score}) : {doc}\n")
else:
      print("\nNo relevant documents found!")








Type your query here... 
How to connect to HRMS from home?
please wait...searching 🔍

Document contents found related to your query:

🌟 (Top Recommendation - Score: 4.0434194) : HR Onboarding Guide – Company XYZ Welcome to Company XYZ!

We're excited to have you onboard. This guide is designed to help you settle into your new role and navigate our company policies and practices smoothly. 1. Organization Structure Company XYZ operates across multiple verticals including:
- Engineering & Product Development
- Sales & Marketing
- Customer Support
- Human Resources
- Finance & Compliance

Each department is led by a VP who reports to the Executive Leadership Team (ELT). You’ll find the org chart in the internal portal under HR > Org Structure. 2. Contact Points 3. IT Access Steps To ensure secure access, please follow the below steps:
1. Collect your official laptop and accessories from the IT desk.
2. Log in using your employee credentials shared via onboarding email.
3. Reset your tempor