## Initial setup

In [3]:
#!pip install tqdm --q # in case it is not installed in VS code

In [4]:
#!pip install ipywidgets --q 

In [5]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [6]:
# Elasticsearch must be running before the indexing and retrieval cells are executed.
# Pull and run a Docker container with Elasticsearch 8.9.0 or higher (needed for
# reranking based on the RRF algorithm). Run the command below in a terminal
# (e.g. the VS Code terminal).
# NOTE: the triple-quoted string is a no-op expression kept only as a copy-paste
# reference; the trailing `;` suppresses its echo in the cell output.
'''
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.17.6
''';

In [7]:
# build our documents list from the pre-parsed FAQ JSON
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/05-best-practices/documents-with-ids.json
# the link must point to the raw file; the regular GitHub page URL downloads HTML only

# Read explicitly as UTF-8: the FAQ text contains non-ASCII characters (curly
# quotes and apostrophes), and relying on the platform default encoding
# (e.g. cp1252 on Windows) would garble them or raise UnicodeDecodeError.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

In [8]:
# Peek at the first record to check which fields each document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [9]:
# Load the sentence-transformers model used to embed the FAQ questions and answers.

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)
# Display the model summary. The Pooling layer's 'word_embedding_dimension'
# is the length of every vector produced by model.encode, so the
# dense_vector fields of the index must be declared with the same `dims`.
model #  'architecture': 'BertModel', 'word_embedding_dimension': 384 - this is our vector size

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

## Indexing stage

In [10]:
# Prepare embeddings for the question, the answer text, and question + answer,
# and attach them to every document.
#
# Each field is encoded with ONE batched model.encode call instead of one call
# per document: the model then processes many sentences per forward pass, which
# is considerably faster than ~3 * len(documents) single-sentence calls.
# model.encode returns one row per input sentence, in input order.

questions = [doc['question'] for doc in documents]
texts = [doc['text'] for doc in documents]
question_texts = [question + ' ' + text for question, text in zip(questions, texts)]

question_vectors = model.encode(questions, show_progress_bar=True)
text_vectors = model.encode(texts, show_progress_bar=True)
question_text_vectors = model.encode(question_texts, show_progress_bar=True)

# Write the vectors back onto the documents (same keys as the index mapping).
for doc, question_vector, text_vector, question_text_vector in zip(
    documents, question_vectors, text_vectors, question_text_vectors
):
    doc['question_vector'] = question_vector
    doc['text_vector'] = text_vector
    doc['question_text_vector'] = question_text_vector

  0%|          | 0/948 [00:00<?, ?it/s]

In [12]:
# documents[0] # to see vectors added

In [13]:
# Define the ES index (settings + field mappings) and (re)create it.
# ES in docker may take some time to start...

es_client = Elasticsearch('http://localhost:9200')

# Length of the embedding vectors; must equal the model's
# 'word_embedding_dimension' (384 for multi-qa-MiniLM-L6-cos-v1).
EMBEDDING_DIMS = 384

# All three vector fields share the same mapping; each field gets its own copy.
dense_vector_mapping = {
    "type": "dense_vector",
    "dims": EMBEDDING_DIMS,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword (not text) so the fields can be matched exactly in term filters
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_vector": dict(dense_vector_mapping),
            "text_vector": dict(dense_vector_mapping),
            "question_text_vector": dict(dense_vector_mapping),
        }
    }
}

index_name = "course-questions"

# Drop any previous copy of the index so re-running this cell starts clean;
# ignore_unavailable avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Pass settings/mappings as keyword arguments: the catch-all `body=` parameter
# is deprecated in the 8.x Python client.
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [14]:
# Populate the index: send every document (including the vector fields
# attached to it earlier) to Elasticsearch, one request per document.

for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

  0%|          | 0/948 [00:00<?, ?it/s]

## Retrieval stage

In [18]:
#!pip install langchain --q

In [20]:
#!pip install langchain_elasticsearch --q

In [23]:
#!pip install langchain-community --q

In [24]:
# Retrieval stage

from langchain.embeddings import SentenceTransformerEmbeddings
from typing import Dict
from langchain_elasticsearch import ElasticsearchRetriever

In [25]:
# URL of our Elasticsearch instance (the same address es_client was created with);
# passed to the LangChain retriever.
es_url = 'http://localhost:9200'

In [26]:
# Sample user question and the course whose FAQ entries the search is restricted to.
# NOTE: `course` is read as a global by hybrid_query each time it is called.
query = 'I just discovered the course. Can I still join it?'
course = "data-engineering-zoomcamp"

In [27]:
# Create the LangChain embeddings wrapper that will embed our queries (no vector is
# computed yet). It names the same model, multi-qa-MiniLM-L6-cos-v1, that was used to
# embed the documents, so query and document vectors live in the same space.
# NOTE(review): the output below looks like a deprecation warning for this class -
# confirm and consider switching to the import path the warning recommends.
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/multi-qa-MiniLM-L6-cos-v1")


In [29]:
# The retriever needs a function that turns a plain query string into a full
# Elasticsearch search request body; hybrid_query is that function.

def hybrid_query(search_query: str) -> Dict:
    """Build the Elasticsearch request body for a hybrid search.

    The body combines two searches over the same course:
      * a keyword ``multi_match`` over the question, text and section fields;
      * a kNN search over ``question_text_vector`` using the embedding of
        ``search_query``.

    Both parts carry a boost of 0.5 and are filtered by the module-level
    ``course`` value; the query embedding comes from the module-level
    ``embeddings`` object.

    Args:
        search_query: The user's question as plain text.

    Returns:
        A dict with the keys ``query``, ``knn`` and ``size``.
    """
    def course_filter() -> Dict:
        # Fresh dict per use so the two filters never share one object.
        return {"term": {"course": course}}

    # same embeddings as for indexing
    query_vector = embeddings.embed_query(search_query)

    keyword_clause = {
        "multi_match": {
            "query": search_query,
            "fields": ["question", "text", "section"],
            "type": "best_fields",
            "boost": 0.5,
        }
    }

    request_body = {
        "query": {
            "bool": {
                "must": keyword_clause,
                "filter": course_filter(),
            }
        },
        "knn": {
            "field": "question_text_vector",
            "query_vector": query_vector,
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
            "filter": course_filter(),
        },
        "size": 5,
        # "rank": {"rrf": {}},
    }
    return request_body


# Wrap the index in a LangChain retriever: hybrid_query builds the request
# body for each query string, and content_field names the source field
# ('text') used as the content of the returned documents.
hybrid_retriever = ElasticsearchRetriever.from_es_params(
    url=es_url,
    index_name=index_name,
    content_field='text',
    body_func=hybrid_query,
)


In [30]:
# Now let's search our index: run the hybrid retriever for `query` and keep the returned hits.
hybrid_results = hybrid_retriever.invoke(query)

In [32]:
# Print question, course and relevance score for every returned hit.
# The query was: 'I just discovered the course. Can I still join it?'
# and the course filter was: "data-engineering-zoomcamp"

for result in hybrid_results:
    hit_source = result.metadata['_source']
    print(hit_source['question'], hit_source['course'], result.metadata['_score'])

# The best match looks good and its score is clearly the highest:
# Course - Can I still join the course after the start date? data-engineering-zoomcamp 12.559774

Course - Can I still join the course after the start date? data-engineering-zoomcamp 12.559774
Course - Can I follow the course after it finishes? data-engineering-zoomcamp 9.399914
Course - What can I do before the course starts? data-engineering-zoomcamp 7.306914
Course - Can I get support if I take the course in the self-paced mode? data-engineering-zoomcamp 7.1085525
Course - When will the course start? data-engineering-zoomcamp 6.75216


## Hybrid search and measuring against ground truth