In [40]:
import json

# Load the raw FAQ export. The encoding is given explicitly because the text
# contains non-ASCII characters (e.g. curly quotes) and the default encoding
# of open() depends on the platform.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)


# Flatten the per-course structure into a single list, tagging every document
# with the course it belongs to. Documents are annotated in place, so the
# dicts inside docs_raw gain the 'course' key as well.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [41]:
from elasticsearch import Elasticsearch
# Client for the Elasticsearch node listening on localhost:9200.
es_client = Elasticsearch(hosts='http://localhost:9200')


In [42]:
# Index definition: one shard, no replicas, and a mapping with the FAQ text
# fields plus a dense vector field used for kNN search.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # keyword (not analyzed) so it can be used in exact-match filters
            "course": {"type": "keyword"},
            # "dims" must equal the length of the vectors stored in this field
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

index_name = "course-questions"

# Recreate the index from scratch so this cell can be re-run safely:
# ignore_unavailable=True makes the delete a no-op when the index is missing.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Settings and mappings are passed as explicit keyword arguments, matching the
# keyword-style client API used by the other calls in this notebook.
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [43]:
from tqdm.auto import tqdm

In [44]:
# initialize pre-trained model

# This is a new library compared to the previous modules.
# Please run "pip install sentence-transformers==2.7.0"

from sentence_transformers import SentenceTransformer
# Load the pre-trained embedding model by name.
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2")

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work



In [45]:
# Create a dense vector for every document using the pre-trained model.
# The embedding is computed from the document's "text" field and stored on the
# document itself under "text_vector", converted to a plain list of floats.
# Documents are modified in place, so `operations` holds the same dict objects
# as `documents`.
operations = []
for doc in tqdm(documents, desc="Encoding documents"):
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

In [46]:
# Index the documents one at a time. A failure is reported together with the
# position of the offending document but does not abort the loop, so a single
# bad document cannot block the rest. A summary is printed if anything failed.
failed = 0
for position, doc in enumerate(tqdm(operations, desc="Indexing documents")):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        failed += 1
        print(f"Failed to index document {position}: {e}")

if failed:
    print(f"{failed} of {len(operations)} documents could not be indexed.")

In [47]:
search_term = "windows or mac?"
# Embed the query with the same model that produced the document vectors.
vector_search_term = model.encode(search_term)

# kNN query against the "text_vector" field. The query vector is converted to
# a plain list, the same representation used when the documents were stored,
# so the request does not depend on the client serializing numpy arrays.
query = {
    "field": "text_vector",
    "query_vector": vector_search_term.tolist(),
    "k": 5,
    "num_candidates": 10000,
}

# `source` limits the returned fields, leaving the large vector out of the hits.
res = es_client.search(
    index=index_name,
    knn=query,
    source=["text", "section", "question", "course"],
)
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': '8r09XpABu_nDrJkJuUvq',
  '_score': 0.7147919,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'Bb0-XpABu_nDrJkJcU9g',
  '_score': 0.6134734,
  '_source': {'question': 'WSL instructions',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'If you wish to use WSL on your windows machine, here are the setup instructions:\nCommand: Sudo apt install wget\nGet Anaconda download address here. wget <download address>\nTurn on Docker Desktop WFree Download | AnacondaSL2\nCommand: git clone <github repository address>\nVSCODE on WSL\nJupyter: pip3 install jupyter\nAdded by Gregory Morris (gwm1980@gmail.com)\nAll in all softwares at

In [48]:
# Full-text search restricted to a single course: the multi_match clause
# scores the documents, the term filter on "course" only narrows the set.
# The field list previously included "title", which the index mapping does
# not define, so it has been dropped.
response = es_client.search(
    index=index_name,
    query={
        "bool": {
            "must": {
                "multi_match": {
                    "query": "windows or python?",
                    "fields": ["text", "question", "course"],
                    "type": "best_fields",
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            },
        }
    },
)

In [49]:
def pretty_response(response):
    """Print every hit of a search response in a readable multi-line format.

    Args:
        response: A search response whose ``response["hits"]["hits"]`` is a
            list of hits. Each hit must provide ``_id``, ``_score`` and a
            ``_source`` containing ``text``, ``section``, ``question`` and
            ``course``.

    Prints a short notice instead when the response contains no hits.
    """
    hits = response["hits"]["hits"]
    if len(hits) == 0:
        print("Your search returned no results.")
        return

    for hit in hits:
        source = hit["_source"]
        # Named doc_id to avoid shadowing the builtin id().
        doc_id = hit["_id"]
        score = hit["_score"]
        text = source["text"]
        section = source["section"]
        question = source["question"]
        course = source["course"]
        # The text label is written as "\nText:"; the earlier "\text:" was
        # interpreted as a tab escape followed by "ext:".
        pretty_output = (
            f"\nID: {doc_id}"
            f"\nText: {text}"
            f"\nSection: {section}"
            f"\nQuestion: {question}"
            f"\nCourse: {course}"
            f"\nScore: {score}"
        )
        print(pretty_output)


In [50]:
# Print the hits held in `response` in a readable form.
pretty_response(response)


ID: OL09XpABu_nDrJkJykwM	ext: Problem: If you have already installed pgcli but bash doesn't recognize pgcli
On Git bash: bash: pgcli: command not found
On Windows Terminal: pgcli: The term 'pgcli' is not recognized…
Solution: Try adding a Python path C:\Users\...\AppData\Roaming\Python\Python39\Scripts to Windows PATH
For details:
Get the location: pip list -v
Copy C:\Users\...\AppData\Roaming\Python\Python39\site-packages
3. Replace site-packages with Scripts: C:\Users\...\AppData\Roaming\Python\Python39\Scripts
It can also be that you have Python installed elsewhere.
For me it was under c:\python310\lib\site-packages
So I had to add c:\python310\lib\Scripts to PATH, as shown below.
Put the above path in "Path" (or "PATH") in System Variables
Reference: https://stackoverflow.com/a/68233660
Section: Module 1: Docker and Terraform
Question: PGCLI - pgcli: command not found
Course: data-engineering-zoomcamp
Score: 7.728908

ID: _709XpABu_nDrJkJvEvN	ext: “wget is not recognized as an int