# Embed sample data

## Imports

In [74]:
import os
import re
import time
import requests
import json
import fitz  # PyMuPDF
import pandas as pd
import pytesseract

from uuid import uuid4
from dotenv import load_dotenv
from requests import Response
from PIL import Image

In [2]:
# Load settings from the local .env file, then read them from the environment.
load_dotenv("./.env")

SEARCH_SERVICE_NAME = os.getenv("SEARCH_SERVICE_NAME")
SEARCH_SERVICE_INDEX_NAME = os.getenv("SEARCH_SERVICE_INDEX_NAME")
SEARCH_SERVICE_API_KEY = os.getenv("SEARCH_SERVICE_API_KEY")
SEARCH_SERVICE_API_VERSION = os.getenv("SEARCH_SERVICE_API_VERSION")

EMBEDDING_MODEL_ENDPOINT = os.getenv("EMBEDDING_MODEL_ENDPOINT")
EMBEDDING_MODEL_API_KEY = os.getenv("EMBEDDING_MODEL_API_KEY")

# Fail early with a readable message. Without this check a missing API key only
# surfaces further down as "TypeError: object of type 'NoneType' has no len()".
_missing_settings = [
    name
    for name, value in (
        ("SEARCH_SERVICE_NAME", SEARCH_SERVICE_NAME),
        ("SEARCH_SERVICE_INDEX_NAME", SEARCH_SERVICE_INDEX_NAME),
        ("SEARCH_SERVICE_API_KEY", SEARCH_SERVICE_API_KEY),
        ("SEARCH_SERVICE_API_VERSION", SEARCH_SERVICE_API_VERSION),
        ("EMBEDDING_MODEL_ENDPOINT", EMBEDDING_MODEL_ENDPOINT),
        ("EMBEDDING_MODEL_API_KEY", EMBEDDING_MODEL_API_KEY),
    )
    if value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Only the length of each API key is printed so the secrets never end up in the
# notebook output.
print("SEARCH_SERVICE_NAME:\t\t", SEARCH_SERVICE_NAME)
print("SEARCH_SERVICE_INDEX_NAME:\t", SEARCH_SERVICE_INDEX_NAME)
print("SEARCH_SERVICE_API_KEY (len):\t", len(SEARCH_SERVICE_API_KEY))
print("SEARCH_SERVICE_API_VERSION:\t", SEARCH_SERVICE_API_VERSION)

print()

print("EMBEDDING_MODEL_ENDPOINT:\t", EMBEDDING_MODEL_ENDPOINT)
print("EMBEDDING_MODEL_API_KEY (len):\t", len(EMBEDDING_MODEL_API_KEY))


SEARCH_SERVICE_NAME:		 dta-dguupj-prod-eus-as
SEARCH_SERVICE_INDEX_NAME:	 device-type-agent-index-001
SEARCH_SERVICE_API_KEY (len):	 52
SEARCH_SERVICE_API_VERSION:	 2024-07-01

EMBEDDING_MODEL_ENDPOINT:	 https://enric-m6roqvko-eastus2.openai.azure.com/openai/deployments/text-embedding-3-large/embeddings?api-version=2023-05-15
EMBEDDING_MODEL_API_KEY (len):	 84


## Utils

In [72]:
class AzureAISearchIndex:
    """Minimal REST client for the documents of one Azure AI Search index.

    Wraps the two endpoints used in this notebook: ``docs/index`` (upload)
    and ``docs/search`` (query). Both methods return the raw ``requests``
    response; interpreting the status code is left to the caller.
    """

    def __init__(
            self,
            service_name: str,
            index_name: str,
            api_key: str,
            api_version: str,
            timeout: float = 60.0
    ) -> None:
        """Build the endpoint URLs and remember the credentials.

        Args:
            service_name: Name of the search service (the sub-domain of
                ``search.windows.net``).
            index_name: Name of the index inside that service.
            api_key: Key sent in the ``api-key`` request header.
            api_version: Value of the ``api-version`` query parameter.
            timeout: Seconds to wait for the service before ``requests``
                gives up. Without a timeout a stalled connection would block
                the notebook indefinitely.
        """
        self._put_endpoint = f"https://{service_name}.search.windows.net/indexes/{index_name}/docs/index?api-version={api_version}"  # noqa
        self._query_endpoint = f"https://{service_name}.search.windows.net/indexes/{index_name}/docs/search?api-version={api_version}"  # noqa
        self._api_key = api_key
        self._timeout = timeout

    def search(self, query: dict) -> Response:
        """POST ``query`` as the JSON body of a search request.

        Returns:
            The unmodified HTTP response.

        Raises:
            requests.RequestException: On connection problems or timeout.
        """
        return requests.post(
            self._query_endpoint,
            headers=self._get_json_headers(),
            json=query,
            timeout=self._timeout,
        )

    def put_documents(self, documents: list) -> Response:
        """Upload ``documents`` to the index in a single request.

        Args:
            documents: List of JSON-serialisable document dicts; it is sent
                as the ``value`` array of the request body.

        Returns:
            The unmodified HTTP response.

        Raises:
            TypeError: If ``documents`` is not a list.
            requests.RequestException: On connection problems or timeout.
        """
        if not isinstance(documents, list):
            raise TypeError("put_documents expected a list for 'documents'")

        payload = {
            "value": documents
        }
        return requests.post(
            self._put_endpoint,
            headers=self._get_json_headers(),
            json=payload,
            timeout=self._timeout,
        )

    def _get_json_headers(self) -> dict:
        """Return the headers shared by every request (JSON body + API key)."""
        return {
            "Content-Type": "application/json",
            "api-key": self._api_key
        }


def sanitize_string_for_index_key(name: str) -> str:
    """Return ``name`` with every character other than ASCII letters, digits,
    ``-`` and ``_`` replaced by an underscore.

    Distinct inputs can collapse to the same result (``"a.b"`` and ``"a b"``
    both become ``"a_b"``).
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\-_]')
    return disallowed.sub('_', name)

# Shared client for the index named in the environment settings; the cells
# below use it for both uploads and queries.
search_index = AzureAISearchIndex(
    service_name=SEARCH_SERVICE_NAME,
    index_name=SEARCH_SERVICE_INDEX_NAME,
    api_key=SEARCH_SERVICE_API_KEY,
    api_version=SEARCH_SERVICE_API_VERSION
)

# Bare expression so the notebook echoes the object as the cell result.
search_index

<__main__.AzureAISearchIndex at 0x1502aad90>

In [47]:
# Smoke test for the upload path: one hand-written document with a random key
# and a constant 3072-element placeholder vector.
dummy_document = {
    "id": uuid4().hex,
    "deviceID": "device_id_123",
    "deviceTypeID": "device_type_001",
    "documentID": "doc_id_123",
    "documentName": "filename",
    "documentPageNumber": 1,
    "documentPageContent": "content bla bla fix",
    "documentPageContentEmbedding": [0.1] * 3072,
    "metadata_json": json.dumps({"dummy": "data"}),
}
resp = search_index.put_documents([dummy_document])

# Echo status code and parsed body as the cell result.
resp.status_code, resp.json()

(200,
 {'@odata.context': "https://dta-dguupj-prod-eus-as.search.windows.net/indexes('device-type-agent-index-001')/$metadata#Collection(Microsoft.Azure.Search.V2024_07_01.IndexResult)",
  'value': [{'key': '82aaff43e93a446aa4c46e0757879138',
    'status': True,
    'errorMessage': None,
    'statusCode': 201}]})

In [15]:
class AzureDocumentEmbeddingModel:
    def __init__(
            self,
            endpoint: str,
            api_key: str
    ) -> None:
        self._endpoint = endpoint
        self._api_key = api_key

    def embed(
            self,
            embed_content: str | list[str],
            max_retries: int = 1,
            current_retries: int = 0
    ) -> Response:
        headers = self._get_json_headers()
        data = {
            "input": embed_content
        }

        response = requests.post(self._endpoint, headers=headers, json=data)

        if response.status_code == 200 or response.status_code != 429:
            return response

        try:
            message = response.json()["error"]["message"]
            retry_time_str = message.split("retry after ")[1].split(" seconds")[0]
            retry_time = int(retry_time_str) + 5

            print(f"Sleep because of retry for: {retry_time}s")
            time.sleep(retry_time)

            new_current_retries = current_retries + 1
            if new_current_retries > max_retries:
                return response

            return self.embed(
                embed_content=embed_content,
                max_retries=max_retries,
                current_retries=new_current_retries
            )
        except Exception:
            return response
            
    def _get_json_headers(self) -> dict:
        headers = {
            "Content-Type": "application/json",
            "api-key": self._api_key
        }

        return headers


# Shared embedding client configured from the environment settings; used for
# both document ingestion and query embedding in the cells below.
embedding_model = AzureDocumentEmbeddingModel(
    endpoint=EMBEDDING_MODEL_ENDPOINT,
    api_key=EMBEDDING_MODEL_API_KEY
)

# Bare expression so the notebook echoes the object as the cell result.
embedding_model

<__main__.AzureDocumentEmbeddingModel at 0x151351ed0>

## Testing utils

In [17]:
# Sanity check of the query path: wildcard search, total count requested,
# at most five hits returned.
wildcard_query = {"search": "*", "count": True, "top": 5}
resp = search_index.search(wildcard_query)

if resp.status_code != 200:
    print("Error:", resp.text)
else:
    print(json.dumps(resp.json(), indent=4))


{
    "@odata.context": "https://dta-dguupj-prod-eus-as.search.windows.net/indexes('device-type-agent-index-001')/$metadata#docs(*)",
    "@odata.count": 0,
    "value": []
}


In [41]:
# Sanity check of the embedding path: embed two short strings, then report how
# many vectors came back and the length of the first one.
embed_string = ["Hello world", "second string"]
resp = embedding_model.embed(embed_content=embed_string)

if resp.status_code != 200:
    print("Error:", resp.text)
else:
    resp_body = resp.json()  # parse once, reuse for the summary and the dump
    resp_data = resp_body["data"]
    resp_count = len(resp_data)
    embed_dim0 = resp_data[0]["embedding"]
    print("RESPONSE LENGTH:", resp_count)
    print("EMBEDDING DIMENTION (0):", len(embed_dim0))
    print()
    print(json.dumps(resp_body, indent=4))
    

RESPONSE LENGTH: 2
EMBEDDING DIMENTION (0): 3072

{
    "object": "list",
    "data": [
        {
            "object": "embedding",
            "index": 0,
            "embedding": [
                -0.00879018,
                -0.010218531,
                0.0061967103,
                0.03174304,
                0.008186044,
                -0.0064383647,
                -0.0041663814,
                0.076708026,
                0.02742778,
                0.028998533,
                0.0020206196,
                -0.011642566,
                -0.020160886,
                -0.019211529,
                -0.0049970686,
                0.036869563,
                -0.012617814,
                -0.002716455,
                -0.0072927857,
                -0.018141344,
                0.02226673,
                0.003780166,
                -0.017502686,
                0.055062693,
                0.0029494788,
                0.024027357,
                -0.014671877,
                

## Save dummy data to search index (local filesystem)

In [77]:
# pip install openai requests python-dotenv pymupdf pandas pytesseract pillow

def extract_text_from_pdf(pdf_path):
    """Return one text block per page of the PDF at ``pdf_path``.

    Each block wraps the page's plain text in marker lines and states the
    page number, counted from 1.
    """

    def render_page(page_number, text):
        # Layout of a single page block; the marker lines are part of the
        # text that is later embedded and stored.
        return f"""### PAGE METADATA START
document page: {page_number}\n
### PAGE METADATA END
### PAGE CONTENT
{text}
### PAGE CONTENT END
"""

    with fitz.open(pdf_path) as pdf_document:
        return [
            render_page(number, pdf_page.get_text("text"))
            for number, pdf_page in enumerate(pdf_document, start=1)
        ]


def extract_text_from_image(image_path):
    """Run OCR on the image at ``image_path`` and return the stripped text.

    This is a best-effort helper: any failure (unreadable file, OCR error)
    is printed and an empty string is returned instead of raising.
    """
    try:
        # The context manager closes the image file once OCR is done; a bare
        # Image.open() would keep the file handle open.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:  # deliberate catch-all, see docstring
        print(f"Error extracting text from {image_path}: {e}")
        return ""


def extract_text(file_path) -> list[str]:
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    # elif ext in [".png", ".jpeg", ".jpg"]:
    #     return [extract_text_from_image(file_path)]
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return [f.read()]
    elif ext == ".csv":
        df = pd.read_csv(file_path)
        return [df.to_string()]
    else:
        return None  # Unsupported formats return empty text

In [84]:
# Walk DATA_FOLDER, embed every supported file page by page and upload the
# pages to the search index. Failures are printed and the file is skipped so
# that one bad file does not stop the run.
DATA_FOLDER = "../../../../resources/data/dummy"

# filename -> list of extracted text chunks, kept for inspection afterwards.
file_data = {}

for root, _, files in os.walk(DATA_FOLDER):
    for filename in files:
        # if not filename.startswith("nik"):
        #     continue
        file_path = os.path.join(root, filename)
        text_data = extract_text(file_path)
        if text_data is None:
            print("FILE\t", filename, "\tNot supported\n")
            continue

        print("FILE\t", filename, "\tProcess started")

        file_data[filename] = text_data
        # All pages of the file are embedded in one request.
        resp = embedding_model.embed(text_data)

        if resp.status_code != 200:
            print(f"Error while embed file '{filename}'. Response: {resp.text}\n")
            continue

        embeddings = resp.json()["data"]
        document_id = "document_id_follows"

        documents = []
        for position, embedding in enumerate(embeddings):
            # Pair each vector with its text via the "index" the service
            # reports for it, falling back to the list position when the
            # field is absent, rather than trusting response order alone.
            page_index = embedding.get("index", position)
            page_number = page_index + 1
            documents.append({
                "id": sanitize_string_for_index_key(f"{document_id}_{filename}_page_{page_number}"),
                "deviceID": "device_id_123",  # TODO: add imt device id
                "deviceTypeID": "device_type_001",  # TODO: add imt device type id
                "documentID": "doc_id_123",  # TODO: add blob document id
                "documentName": filename,
                "documentPageNumber": page_number,
                "documentPageContent": text_data[page_index],
                "documentPageContentEmbedding": embedding["embedding"],
                "metadata_json": json.dumps({
                    "dummy": "data"
                })
            })

        resp = search_index.put_documents(documents)
        if resp.status_code == 200:
            print("FILE\t", filename, "\tSuccessfully processed\n")
            continue

        print(f"Error while saving embeddings of {filename}. Error: {resp.text}\n")

print("FINISHED PROCESS")


FILE	 receipt_example.png 	Not supported

FILE	 nike2019.pdf 	Process started
FILE	 nike2019.pdf 	Successfully processed

FILE	 deskpicture.jpeg 	Not supported

FILE	 1009.txt 	Process started
FILE	 1009.txt 	Successfully processed

FILE	 1008.txt 	Process started
FILE	 1008.txt 	Successfully processed

FILE	 1005.txt 	Process started
FILE	 1005.txt 	Successfully processed

FILE	 1004.txt 	Process started
FILE	 1004.txt 	Successfully processed

FILE	 1010.txt 	Process started
FILE	 1010.txt 	Successfully processed

FILE	 1006.txt 	Process started
FILE	 1006.txt 	Successfully processed

FILE	 1007.txt 	Process started
FILE	 1007.txt 	Successfully processed

FILE	 1003.txt 	Process started
FILE	 1003.txt 	Successfully processed

FILE	 1002.txt 	Process started
FILE	 1002.txt 	Successfully processed

FILE	 HR-Guide_-Policy-and-Procedure-Template.pdf 	Process started
FILE	 HR-Guide_-Policy-and-Procedure-Template.pdf 	Successfully processed

FILE	 XDroneManual.pdf 	Process started
FILE	 XDr

## Use Embedding-Vector search

In [85]:
# Embed the search term, then run a pure vector query against the index.

search_term = "What can i do, when my drone is not working?"
resp = embedding_model.embed(search_term)
# Explicit check instead of `assert`: asserts are stripped under `python -O`
# and would not show what the service actually answered.
if resp.status_code != 200:
    raise RuntimeError(
        f"Embedding the search term failed: {resp.status_code}, {resp.text}"
    )

search_term_vector = resp.json()["data"][0]["embedding"]

search_query = {
    # "search": search_term,  # Text search query
    "vectorQueries": [
        {
            "vector": search_term_vector,
            "k": 5,  # Number of nearest neighbors to return
            "fields": "documentPageContentEmbedding",
            "kind": "vector"
        }
    ],
    # "searchFields": "documentPageContent",  # Keyword search field
    # The embedding field is left out of "select" to keep the response small.
    "select": "id, documentName, documentPageNumber, documentPageContent",
    "top": 100,
    "count": True,  # TODO: only for debugging
}

search_response = search_index.search(search_query)

if search_response.status_code == 200:
    search_results = search_response.json()
    print(json.dumps(search_results, indent=2))  # Pretty print the JSON response
else:
    print(f"Error: {search_response.status_code}, {search_response.text}")

{
  "@odata.context": "https://dta-dguupj-prod-eus-as.search.windows.net/indexes('device-type-agent-index-001')/$metadata#docs(*)",
  "@odata.count": 5,
  "value": [
    {
      "@search.score": 0.6687783,
      "id": "document_id_follows_XDroneManual_pdf_page_3",
      "documentName": "XDroneManual.pdf",
      "documentPageNumber": 3,
      "documentPageContent": "### PAGE METADATA START\ndocument page: 3\n\n### PAGE METADATA END\n### PAGE CONTENT\n\u2022 \nStore in a cool, dry place away from direct sunlight. \n6. Troubleshooting \n6.1 Common Issues \n\u2022 \nIf the drone is not responding, refer to Section 6.1.1. \n\u2022 \nTroubleshooting steps for connectivity issues are detailed in Section 6.1.2. \n7. Technical Specifications \n\u2022 \nWeight: 1.5 kg \n\u2022 \nMaximum Flight Time: 30 minutes \n\u2022 \nCamera Resolution: 12 MP \n\u2022 \nConnectivity: Wi-Fi, Bluetooth \n8. Contact Information \nFor further assistance or inquiries, please contact our customer support at \nsuppo