# 0. Setting up Elastic DB and Kibana

Kibana is optional

-----

Follow quick start guide using Docker for Elastic DB: https://www.elastic.co/guide/en/elasticsearch/reference/7.17/getting-started.html

Also start Kibana in a separate terminal (instructions below); then, from the Kibana GUI, load the sample global flight dataset.

To check that your database is working, run the following in another terminal (bash):
`curl -X GET http://localhost:9200/`

Install and run Elasticsearch

Install and start Docker Desktop.
Run:

```bash
docker network create elastic
docker pull docker.elastic.co/elasticsearch/elasticsearch:7.17.25
docker run --name es01-test --net elastic -p 127.0.0.1:9200:9200 -p 127.0.0.1:9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.17.25
```
Install and run Kibana

To analyze, visualize, and manage Elasticsearch data using an intuitive UI, install Kibana.

In a new terminal session, run:

```bash
docker pull docker.elastic.co/kibana/kibana:7.17.27
docker run --name kib01-test --net elastic -p 127.0.0.1:5601:5601 -e "ELASTICSEARCH_HOSTS=http://es01-test:9200" docker.elastic.co/kibana/kibana:7.17.27
```

To access Kibana, go to http://localhost:5601

Hooking up your API key (adding 1536-dimension vector embeddings to 13,000 entries costs about 50¢)

Using Ollama:
1. https://ollama.com/download

2. `ollama run llama3.2` in terminal

3. `ollama pull nomic-embed-text` in terminal (the embedding model that the code below requests)

# Code

In [11]:
import os
import json
from tqdm import tqdm  # For progress tracking

from elasticsearch import Elasticsearch, helpers
from elasticsearch.helpers import reindex
import time
import tiktoken # for truncating long inputs, not necessary if you can tailor inputs ahead of embedding

# Connect to the local single-node Elasticsearch started via Docker.
# basic_auth must be a (username, password) tuple. A parenthesised single
# string such as ("elastic") is just a str, not a tuple, and would be handed
# to the client as if it were an already-encoded credential.
# "changeme" matches the password used by the later setup cell; update it if
# your cluster uses a different one.
es = Elasticsearch("http://localhost:9200", basic_auth=("elastic", "changeme"))
print(es.info())  # Sanity check: prints cluster name/version when reachable.

# Index holding the Kibana sample flight data, and the index intended to
# receive the documents enriched with embeddings.
source_index = "kibana_sample_data_flights"
target_index = "flights_with_embeddings"


{'name': '4d52facd65b5', 'cluster_name': 'docker-cluster', 'cluster_uuid': '0IO0Xm81SVe13I2_Y-cAZA', 'version': {'number': '7.17.25', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'f9b6b57d1d0f76e2d14291c04fb50abeb642cfbf', 'build_date': '2024-10-16T22:06:36.904732810Z', 'build_snapshot': False, 'lucene_version': '8.11.3', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'}


  print(es.info())


In [18]:
# Delete the target index so the indexing run starts from a clean slate.
# A missing index (HTTP 404) is expected on a first run and is tolerated.
# Per-request options go through `.options()`; passing `ignore=` directly to
# the API method is deprecated in the 8.x Python client.
es_client.options(ignore_status=404).indices.delete(index=target_index)

  es_client.indices.delete(index=target_index, ignore=[404])
  es_client.indices.delete(index=target_index, ignore=[404])


ObjectApiResponse({'error': {'root_cause': [{'type': 'index_not_found_exception', 'reason': 'no such index [flights_with_embeddings_ollama_lc]', 'resource.type': 'index_or_alias', 'resource.id': 'flights_with_embeddings_ollama_lc', 'index_uuid': '_na_', 'index': 'flights_with_embeddings_ollama_lc'}], 'type': 'index_not_found_exception', 'reason': 'no such index [flights_with_embeddings_ollama_lc]', 'resource.type': 'index_or_alias', 'resource.id': 'flights_with_embeddings_ollama_lc', 'index_uuid': '_na_', 'index': 'flights_with_embeddings_ollama_lc'}, 'status': 404})

In [23]:
import os
import json
import time
from tqdm import tqdm

import requests
from langchain.embeddings.base import Embeddings
from langchain.vectorstores import ElasticsearchStore
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Elasticsearch + helpers
from elasticsearch import Elasticsearch, helpers

# ------------------------------------------------------------------------
# 1. OLLAMA EMBEDDINGS CLASS
# ------------------------------------------------------------------------
# Defaults for the locally running Ollama server and the embedding model.
OLLAMA_EMBEDDINGS_URL = "http://localhost:11434/api/embeddings"
OLLAMA_MODEL_NAME = "nomic-embed-text"


class OllamaEmbeddings(Embeddings):
    """
    A simple LangChain Embeddings wrapper that calls Ollama's REST API.

    Every text is embedded with its own POST request; no batching is done.

    Args:
        model_name: Name of the Ollama model to request embeddings from.
        url: Full URL of the Ollama embeddings endpoint.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            `requests` waits indefinitely, so an unresponsive server would
            stall indexing forever. Pass ``None`` to disable the limit.
    """

    def __init__(
        self,
        model_name: str = OLLAMA_MODEL_NAME,
        url: str = OLLAMA_EMBEDDINGS_URL,
        timeout: float | None = 120.0,
    ):
        self.model_name = model_name
        self.url = url
        self.timeout = timeout

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Compute embeddings for a list of texts, one request per text."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Compute embedding for a single query text."""
        return self._embed(text)

    def _embed(self, text: str) -> list[float]:
        """
        POST one text to the embeddings endpoint and return its vector.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            KeyError: if the JSON response has no "embedding" entry.
        """
        payload = {"model": self.model_name, "prompt": text}
        response = requests.post(self.url, json=payload, timeout=self.timeout)
        response.raise_for_status()
        return response.json()["embedding"]  # expect list[float]

# ------------------------------------------------------------------------
# 2. ELASTICSEARCH SETUP
# ------------------------------------------------------------------------
# Connect to Elasticsearch
# Credentials for the local cluster. The password can be overridden through
# the ELASTIC_PASSWORD environment variable so it does not have to be edited
# into the source; the fallback keeps the previous hard-coded value.
es_client = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", os.environ.get("ELASTIC_PASSWORD", "changeme"))
)

# Indices
# source_index: the Kibana sample flight data that is read.
# target_index: the index the vector store writes embedded documents into.
source_index = "kibana_sample_data_flights"
target_index = "flights_with_embeddings_ollama_lc"

# ------------------------------------------------------------------------
# 3. CREATE A LANGCHAIN VECTOR STORE
# ------------------------------------------------------------------------
# Embedding client shared by the vector store (for documents) and by the
# query helper further down (for query text).
ollama_embeddings = OllamaEmbeddings()
# Vector store bound to `target_index`, reusing the existing `es_client`
# connection and the embedding client above.
# NOTE(review): the search/extraction helpers read the 'vector' and 'text'
# fields — presumably the store's default field names; confirm.
es_store = ElasticsearchStore(
    embedding=ollama_embeddings,
    index_name=target_index,
    es_connection=es_client
)

# ------------------------------------------------------------------------
# 4. HELPER FUNCTIONS TO FETCH & CONVERT DOCS
# ------------------------------------------------------------------------
def fetch_documents_in_batches(index, es_client, batch_size=100):
    """
    Lazily walk every document of `index` and hand them out in lists.

    A match-all scan (scroll window '1m') is run through `es_client`; hits
    are collected until `batch_size` of them are pending, at which point the
    list is yielded and a fresh one is started. Whatever is left over when
    the scan ends is yielded as a final, shorter batch.
    """
    match_all = {"query": {"match_all": {}}}
    pending = []
    for hit in helpers.scan(es_client, index=index, query=match_all, scroll='1m'):
        pending.append(hit)
        if len(pending) < batch_size:
            continue
        yield pending
        pending = []
    # Flush the trailing partial batch, if any.
    if pending:
        yield pending

def convert_es_docs_to_lc_docs(es_docs):
    """
    Build one LangChain Document per Elasticsearch hit.

    The page content is every `_source` field rendered as a "key: value"
    line (joined with newlines); the hit's `_id` and `_index` are carried
    along as metadata. Adjust the content expression to change which fields
    get embedded.
    """
    return [
        Document(
            page_content="\n".join(
                f"{field}: {value}" for field, value in hit['_source'].items()
            ),
            metadata={
                "_id": hit["_id"],
                "_index": hit["_index"],
            }
        )
        for hit in es_docs
    ]

# Optional splitter if docs are large.
# Chunks are capped at 1000 characters with a 100-character overlap; the
# indexing function runs every converted document through it.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# ------------------------------------------------------------------------
# 5. INDEX DOCUMENTS INTO THE NEW VECTOR INDEX
# ------------------------------------------------------------------------
def index_documents_from_source(
    source_idx: str,
    target_store,
    batch_size: int = 50,
    max_docs: int | None = None
):
    """
    Copy documents from a source Elasticsearch index into a vector store.

    1. Scan the source Elasticsearch index in batches.
    2. Convert to LangChain Documents.
    3. (Optional) split them.
    4. Add them to the VectorStore (which embeds + indexes).

    Args:
        source_idx: Name of the Elasticsearch index to read from.
        target_store: Vector store exposing `add_documents`.
        batch_size: Number of source documents fetched and embedded at a time.
        max_docs: Upper bound on the number of source documents processed;
            `None` processes the whole index.

    Progress and the final count are expressed in source documents, not in
    the chunks produced by the splitter.
    """
    total_docs = es_client.count(index=source_idx)['count']
    if max_docs is not None:
        # Clamp so a negative limit cannot produce a negative bar length.
        total = min(total_docs, max(max_docs, 0))
    else:
        total = total_docs

    docs_embedded = 0
    batches = fetch_documents_in_batches(source_idx, es_client, batch_size)
    try:
        # The context manager closes the bar even if embedding/indexing raises.
        with tqdm(total=total, desc="Indexing") as pbar:
            # Test the limit *before* requesting another batch, so no batch
            # is pulled from the scan only to be thrown away.
            while max_docs is None or docs_embedded < max_docs:
                batch = next(batches, None)
                if batch is None:
                    break  # Source index exhausted.
                if max_docs is not None:
                    # Trim the last batch so the limit is never exceeded.
                    batch = batch[:max_docs - docs_embedded]

                lc_docs = convert_es_docs_to_lc_docs(batch)
                split_docs = splitter.split_documents(lc_docs)
                target_store.add_documents(split_docs)

                docs_embedded += len(batch)
                pbar.update(len(batch))
    finally:
        # Finalize the scan generator deterministically instead of leaving
        # its cleanup to garbage collection.
        batches.close()

    print(
        f"Completed indexing from '{source_idx}' to '{target_index}' "
        f"with {docs_embedded} docs processed."
    )

# ------------------------------------------------------------------------
# 6. SCRIPT_SCORE SEARCH (Cosine Similarity) -- EXCLUDING THE VECTOR FIELD
# ------------------------------------------------------------------------
def script_score_search(index_name: str, query_vector: list[float], top_k: int = 5):
    """
    Perform a script_score cosine similarity query on the field 'vector'.

    Args:
        index_name: Index to search.
        query_vector: Embedding of the query text.
        top_k: Maximum number of hits to return.

    Returns:
        The raw Elasticsearch response, excluding the 'vector' field.
    """
    script_score_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # Cosine similarity lies in [-1, 1]; adding 1.0 shifts it to
                # [0, 2] so the resulting score is never negative.
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }
    # `size` and `query` are passed as keyword arguments: wrapping them in a
    # single `body=` dict is deprecated by the Python client.
    return es_client.search(
        index=index_name,
        size=top_k,
        query=script_score_query,
        _source_excludes=["vector"]  # This ensures the 'vector' field is omitted
    )

# ------------------------------------------------------------------------
# 7. EXTRACT FLIGHT INFO FROM HITS
# ------------------------------------------------------------------------
def _to_float(raw):
    """Parse *raw* as a float, falling back to 0.0 when it is not numeric."""
    try:
        return float(raw)
    except ValueError:
        return 0.0


def _parse_key_value_lines(text_str):
    """Turn "key: value" lines into a dict; lines without ": " are skipped."""
    fields = {}
    for line in text_str.split("\n"):
        # Split on the first ": " only, so values may themselves contain it.
        key, sep, val = line.partition(": ")
        if sep:
            fields[key] = val.strip()
    return fields


def _format_flight(fields):
    """Render one parsed flight record as a human-readable text block."""
    distance_km = _to_float(fields.get("DistanceKilometers", "0"))
    distance_mi = _to_float(fields.get("DistanceMiles", "0"))
    flight_time_hr = _to_float(fields.get("FlightTimeHour", "0"))
    avg_ticket_price = _to_float(fields.get("AvgTicketPrice", "0"))

    # Booleans arrive as text; anything other than "true" (any case) is False.
    flight_delay = fields.get("FlightDelay", "False").lower() == "true"
    cancelled = fields.get("Cancelled", "False").lower() == "true"

    return (
        f"Flight Number: {fields.get('FlightNum', 'N/A')}\n"
        f"Carrier: {fields.get('Carrier', 'N/A')}\n"
        f"From: {fields.get('OriginCityName', 'N/A')} ({fields.get('OriginCountry', 'N/A')})\n"
        f"To: {fields.get('DestCityName', 'N/A')} ({fields.get('DestCountry', 'N/A')})\n"
        f"Distance: {distance_km:,.2f} km ({distance_mi:,.2f} miles)\n"
        f"Flight Time: {flight_time_hr:.2f} hours\n"
        f"Weather at Origin: {fields.get('OriginWeather', 'N/A')}\n"
        f"Weather at Destination: {fields.get('DestWeather', 'N/A')}\n"
        f"Average Ticket Price: ${avg_ticket_price:,.2f}\n"
        f"Delay: {'Yes' if flight_delay else 'No'}\n"
        f"Cancellation: {'Yes' if cancelled else 'No'}\n"
        f"Date: {fields.get('timestamp', 'N/A')}\n"
        "----------------------------------------\n"
    )


def extract_context_from_hits(hits):
    """
    Format flight-related fields from each document, parsing them from the
    '_source.text' string. Returns a multi-line string with flight info.

    Each hit's text holds lines like "FlightNum: NF156JH\\nDestCountry: IN".
    Missing text fields are shown as "N/A", missing or non-numeric numbers
    as 0, and missing booleans as "No". An empty `hits` yields "".

    Raises:
        KeyError: if a hit has no '_source' or no 'text' entry.
    """
    # Join once instead of growing a string with += inside the loop.
    return "".join(
        _format_flight(_parse_key_value_lines(hit["_source"]["text"]))
        for hit in hits
    )

# ------------------------------------------------------------------------
# 8. COMBINED QUERY FUNCTION
# ------------------------------------------------------------------------
def query_vector_store(query_text: str, top_k=5):
    """
    Run a natural-language query against the embedded flight index.

    The query is echoed to stdout, embedded through `ollama_embeddings`, and
    used for a script_score search on `target_index`. The list of hits from
    the response is returned for further processing (for example, extracting
    flight info).
    """
    print(f"\nQuery: {query_text}")
    embedded_query = ollama_embeddings.embed_query(query_text)
    search_result = script_score_search(target_index, embedded_query, top_k)
    return search_result["hits"]["hits"]


-----

In [20]:
# Embed and index at most 200 documents from the source index, 50 per batch.
index_documents_from_source(source_index, es_store, batch_size=50, max_docs=200)


  total_docs = es_client.count(index=source_idx)['count']
  for doc in scan:
  break
Indexing: 100%|██████████| 200/200 [01:21<00:00,  2.45it/s]

Completed indexing from 'kibana_sample_data_flights' to 'flights_with_embeddings_ollama_lc' with 200 docs processed.





In [21]:
# 2) Example query: embed the question and fetch the 5 best-scoring hits.
query_text = "show me a cheap flight to mexico from canada"
hits = query_vector_store(query_text, top_k=5)


Query: show me a cheap flight to mexico from canada


  return es_client.search(


In [24]:
# 3) Extract context from hits: turn the raw hits into one readable text
#    block per flight and print the result.
context_str = extract_context_from_hits(hits)
print("EXTRACTED CONTEXT:\n", context_str)

EXTRACTED CONTEXT:
 Flight Number: FZ1FWP0
Carrier: Kibana Airlines
From: Mexico City (MX)
To: Ottawa (CA)
Distance: 3,589.92 km (2,230.67 miles)
Flight Time: 3.52 hours
Weather at Origin: Rain
Weather at Destination: Clear
Average Ticket Price: $937.73
Delay: No
Cancellation: No
Date: 2025-01-13T20:42:03
----------------------------------------
Flight Number: NF156JH
Carrier: Kibana Airlines
From: Mexico City (MX)
To: Hyderabad (IN)
Distance: 15,939.01 km (9,904.04 miles)
Flight Time: 18.98 hours
Weather at Origin: Rain
Weather at Destination: Hail
Average Ticket Price: $655.36
Delay: No
Cancellation: No
Date: 2025-01-13T05:32:47
----------------------------------------
Flight Number: U9MODFN
Carrier: ES-Air
From: Mexico City (MX)
To: San Francisco (US)
Distance: 3,027.21 km (1,881.02 miles)
Flight Time: 2.66 hours
Weather at Origin: Hail
Weather at Destination: Heavy Fog
Average Ticket Price: $928.53
Delay: No
Cancellation: No
Date: 2025-01-13T08:31:31
-------------------------------