# Elasticsearch

In [None]:
import logging
import os
import sys

from dotenv import load_dotenv

# Load settings from the .env file two directories up. override=True lets values
# from that file replace variables already present in the process environment.
load_dotenv("../../.env", override=True)

# Azure OpenAI settings. Indexing os.environ directly raises KeyError as soon as
# a required variable is missing, so misconfiguration surfaces in this cell.
azure_openai_endpoint = os.environ["AZURE_OPENAI_SERVICE_BASEURL"]
azure_openai_key = os.environ["AZURE_OPENAI_SERVICE_TOKEN"]
azure_openai_version = os.environ["AZURE_OPENAI_SERVICE_API_VERSION"]
azure_openai_embedding_deployment_name = os.environ["AZURE_OPENAI_SERVICE_EMBEDDING_DEPLOYMENT"]
azure_openai_embedding_model_name = os.environ["AZURE_OPENAI_SERVICE_EMBEDDING_MODEL_NAME"]
azure_openai_chat_deployment_name = os.environ["AZURE_OPENAI_SERVICE_CHAT_DEPLOYMENT"]
azure_openai_chat_model_name = os.environ["AZURE_OPENAI_SERVICE_CHAT_MODEL_NAME"]

database_connection_string = os.environ["DATABASE_CONNECTION_STRING"]

# Route INFO-level logging to stdout through exactly one root handler.
# force=True replaces any handlers already attached to the root logger, so
# re-running this cell does not stack handlers and print each record repeatedly.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

## Configure Azure OpenAI models

In [None]:
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import Settings

# Connection arguments that the chat client and the embedding client share.
azure_openai_connection = {
    "api_key": azure_openai_key,
    "azure_endpoint": azure_openai_endpoint,
    "api_version": azure_openai_version,
}

# Chat (LLM) client.
azure_openai_chat_model = AzureOpenAI(
    model=azure_openai_chat_model_name,
    deployment_name=azure_openai_chat_deployment_name,
    **azure_openai_connection,
)

# Embedding client; also used directly later in the notebook to embed query text.
azure_openai_embedding_model = AzureOpenAIEmbedding(
    model=azure_openai_embedding_model_name,
    deployment_name=azure_openai_embedding_deployment_name,
    **azure_openai_connection,
)

# Assign both models to the global llama_index Settings object.
Settings.llm = azure_openai_chat_model
Settings.embed_model = azure_openai_embedding_model

## Create vector store

In [None]:
from llama_index.vector_stores.elasticsearch import ElasticsearchStore

# Vector store pointing at the "index" index on the Elasticsearch instance.
vector_store = ElasticsearchStore(
    es_url="http://elasticsearch:9200",
    index_name="index",
)

## Create pipeline

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Split documents into chunks of 2000 with an overlap of 500 before embedding.
sentence_splitter = SentenceSplitter(chunk_size=2000, chunk_overlap=500)

# Transformations run in list order: split first, then embed; the pipeline is
# configured with the Elasticsearch vector store created above.
pipeline = IngestionPipeline(
    transformations=[sentence_splitter, azure_openai_embedding_model],
    vector_store=vector_store,
)

# Read everything under ../../data/raw/ and push it through the pipeline.
raw_documents = SimpleDirectoryReader("../../data/raw/").load_data()

# Kept as the cell's last expression, so the notebook echoes the return value.
pipeline.run(documents=raw_documents)

## Create search engines

In [None]:
from llama_index.core import VectorStoreIndex

# Build the index on top of the existing vector store (no documents passed here).
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Retriever with default options.
retriever_engine = index.as_retriever()

# Retriever configured for hybrid query mode with a sparse top-k of 3.
hybrid_retriever_options = {
    "vector_store_query_mode": "hybrid",
    "sparse_top_k": 3,
}
retriever_hybrid_engine = index.as_retriever(**hybrid_retriever_options)

## Retrieve using vector search

In [None]:
# Question sent to the default retriever.
speed_limit_question = "Hva er fartsgrensen i boliggater?"
result = retriever_engine.retrieve(speed_limit_question)

In [None]:
# Show the text of the first retrieved item.
first_hit = result[0]
print(first_hit.text)

## Query using hybrid search
This query will fail because we do not have an Elasticsearch premium license, so the cell below is left commented out.

In [None]:
# Left disabled on purpose: this hybrid retrieval fails without an
# Elasticsearch premium license.
# response = retriever_hybrid_engine.retrieve("Hva er fartsgrensen i boliggater?")

## Query using HNSW EF Search

In [None]:
# TODO: Investigate how to query using HNSW EF Search.

## Query using ES client

In [None]:
from elasticsearch import Elasticsearch

# Low-level Elasticsearch client for the same URL the vector store uses.
es = Elasticsearch(hosts="http://elasticsearch:9200")

### Keyword search

In [None]:
# NOTE(review): this searches "kibok-index", while the vector store cell in this
# notebook is configured with index_name="index" — confirm which index is intended.

# Full-text match on the "content" field with a boost of 0.9.
keyword_query = {
    "match": {
        "content": {"query": "fasaderekker", "boost": 0.9},
    },
}

# Return at most 5 hits; request only the "content" field and omit _source.
response_keyword = es.search(
    index="kibok-index",
    query=keyword_query,
    size=5,
    fields=["content"],
    _source=False,
)


In [None]:
# Print the raw keyword-search response.
print(response_keyword)

### Vector search

In [None]:
# Embed the query text with the Azure OpenAI embedding model.
vector_query_text = "Fartgrense i tettbebygd strøk"
query_embedding = azure_openai_embedding_model.get_text_embedding(vector_query_text)

In [None]:
# kNN clause expressed as a regular query, using the embedding computed above.
knn_query = {
    "knn": {
        # Must match the name of the dense_vector field in the index.
        "field": "embedding",
        "query_vector": query_embedding,
        # Optional; tunes the performance/accuracy trade-off.
        "num_candidates": 50,
        "boost": 0.1,
    },
}

# Request only the "content" field and omit _source.
response_vector = es.search(
    index="kibok-index",
    query=knn_query,
    fields=["content"],
    _source=False,
)

In [None]:
# Print the raw vector-search response.
print(response_vector)

### Hybrid search

In [None]:
# Keyword for the full-text part of the hybrid search.
query_hybrid_keyword = "sambruksområder"

# Question that is embedded for the kNN part of the hybrid search.
hybrid_question = "Hva er ÅDT for sambruksområder"
query_hybrid_embedding = azure_openai_embedding_model.get_text_embedding(hybrid_question)

In [None]:
# Full-text part: match on "content" with a boost of 0.9.
hybrid_match_query = {
    "match": {"content": {"query": query_hybrid_keyword, "boost": 0.9}},
}

# Vector part: kNN on "embedding" with a boost of 0.1.
hybrid_knn = {
    "field": "embedding",
    "query_vector": query_hybrid_embedding,
    "k": 5,
    "num_candidates": 50,
    "boost": 0.1,
}

# Send both clauses in one request; request only "content" and omit _source.
response_hybrid = es.search(
    index="kibok-index",
    query=hybrid_match_query,
    knn=hybrid_knn,
    fields=["content"],
    _source=False,
)

In [None]:
# Bare last expression: the notebook displays the hybrid-search response.
response_hybrid