In [1]:
import requests 

# Download the FAQ dump and flatten it into a single list of records.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Every top-level entry carries a course name and a list of documents.
# Copy each document and tag the copy with its course; copying (rather than
# writing into the nested dicts) leaves `documents_raw` exactly as downloaded.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [2]:
# Spot-check one flattened record; it should now carry a 'course' key.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import minsearch

# Build the in-memory minsearch index over the FAQ records.
# The `course` keyword field is what `search` later filters on via
# `filter_dict`; the text fields are the ones `search` applies boosts to.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Index every document; the cell displays whatever fit() returns.
index.fit(documents)

<minsearch.Index at 0x70ac9f25a150>

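### Docker: run Ollama and pull the `phi3` model

The `docker run` command stays attached to the terminal, so run the `docker exec` and `ollama pull` commands from a second terminal.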
```
docker run -it \
    -v ollama:/root/.ollama \
    -p 11434:11434 \
    --name ollama \
    ollama/ollama

docker exec -it ollama bash
ollama pull phi3
```

In [14]:
from openai import OpenAI

In [15]:
# Point the OpenAI SDK at the local server published on port 11434 by the
# Ollama container, instead of the hosted OpenAI API.
# NOTE(review): 'ollama' is a placeholder rather than a real secret; presumably
# the local server does not validate it — confirm against the Ollama docs.
openai_client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [16]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Run a keyword search over the FAQ index.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must match. Defaults to
            the course that used to be hard-coded here.
        num_results: Maximum number of documents requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments. Callers in this
        notebook iterate over it and read ``section``, ``question`` and
        ``text`` from each item.
    """
    # Per-field weights handed to the index: question matches get 3.0,
    # section matches get 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

In [17]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of mappings, each providing ``section``,
            ``question`` and ``text``.

    Returns:
        The instruction header, the question and one block per search result,
        with surrounding whitespace stripped.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is
    # visible and cannot be lost to an editor trimming whitespace.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    # Drops the blank lines left behind by the last entry (or by an empty context).
    return filled.strip()

In [18]:
def llm(prompt):
    """Send ``prompt`` as a single user message to the ``phi3`` model.

    Returns the content of the first choice in the completion response.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = openai_client.chat.completions.create(
        model='phi3',
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [19]:
def rag(query):
    """Answer ``query`` with keyword retrieval.

    Pipeline: ``search`` fetches FAQ entries, ``build_prompt`` turns them
    into a prompt, and ``llm`` produces the returned answer text.
    """
    return llm(build_prompt(query, search(query)))

In [20]:
# Try the keyword-search pipeline on a Kafka question.
rag('how do I run kafka?')

'To run the Kafka consumer in Python, follow these steps: Firstly, make sure you have Docker and Dockers Java up (for Windows use Gitbash). In Unix/Linux systems navigate to your project directory. Inside a new window or terminal session within this directory, create a virtual environment by entering `python3 -m venv my_env`. Activate the newly created environment with `. env/bin/activate` command on macOS and Linux (or `%Scripts/activate` for Windows). Now you need to install Kafka dependencies. Run `pip install "kafka-python>=2"` as well as other necessary libraries:\n \n```cmdline\npip3 install py4j dill kafka pika keras tensorflow requests faker gin psutil moto numpy click opencv-python pillow colorama yaml jmespath configargv joblib jsonschema PyYAML networkx mozila-browser termcolor ctypes ujson\n``` \nCreate the producer.py Python file in your project folder by inserting code from Module M06_partK (it should look similar to Producer class with method `run()`). To start, create i

In [21]:
# Same pipeline, with a question worded differently from the FAQ entry shown earlier.
rag('the course has already started, can I still enroll?')

"Based on the provided context from FAQs related to a course in DataTalks Club, if the course has already started by your current time frame, unfortunately there seems to be no further enrollment option available afterward as per that specific information given herein. The advice was primarily for those attempting to join prior or before class began and did not suggest any possibility of late-stage registration post commencement in this context.\nRegardless the course materials will remain accessible, you’re still encouraged though it may be beneficial as a preparatory step if there aren't deadlines specified for such later involvement within your given FAQ source document that I currently have access to: please confirm or clarify from official sources regarding post start enrollment policies.\nFor questions about the course and its related activities, like submissions of homeworks and final projects, it is essential not to procrastinate as deadlines approach according to the informati

# RAG with Vector Search

### Docker: run Qdrant

Start Qdrant locally; the client below connects to port 6333.

```
docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [22]:
from qdrant_client import QdrantClient, models

In [23]:
# Connect to the Qdrant instance started with Docker (published on port 6333).
qd_client = QdrantClient("http://localhost:6333")

In [24]:
# Vector size used when the collection is created; it has to agree with the
# size of the embeddings produced by `model_handle`.
# NOTE(review): 512 is taken to be this model's output dimension — confirm
# against the model card.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model identifier passed to models.Document for indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [25]:
# Name of the Qdrant collection used by every Qdrant call in this notebook.
collection_name = "zoomcamp-faq"

In [26]:
# Drop any collection left over from a previous run so creation starts from scratch.
# NOTE(review): the False displayed by this cell presumably means there was
# nothing to delete — confirm.
qd_client.delete_collection(collection_name=collection_name)

False

In [27]:
# Create the collection: vectors have EMBEDDING_DIMENSIONALITY components
# and are compared with cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [28]:
# Index the `course` payload field as a keyword; vector_search filters on it.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [29]:
# One Qdrant point per FAQ record: the list position is the point id, the
# text to embed is "<question> <answer text>", and the whole record is kept
# as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=' '.join((faq['question'], faq['text'])),
            model=model_handle,
        ),
        payload=faq,
    )
    for position, faq in enumerate(documents)
]

In [30]:
# Send all points to Qdrant in a single call.
# NOTE(review): the embedding model files are fetched during this call (see
# the progress output), so the Document texts are presumably embedded here —
# confirm.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [31]:
# Sample question.
# NOTE(review): no cell visible in this section reads this variable
# (the `question` inside vector_search is that function's own parameter).
question = 'I just discovered the course. Can I still join it?'

In [32]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve FAQ payloads from Qdrant by semantic similarity.

    Args:
        question: Text to embed with ``model_handle`` and use as the query.
        course: Value the ``course`` payload field must match. Defaults to
            the course that used to be hard-coded here.
        limit: Maximum number of points requested from Qdrant.

    Returns:
        A list with the payload of each point returned by the query, in the
        order Qdrant returned them.
    """
    # Kept on purpose: makes it visible in the cell output that this retriever
    # (and not the keyword one) served the request.
    print('vector_search is used')

    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        # Only consider points whose `course` payload equals the requested course.
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit,
        with_payload=True
    )

    return [point.payload for point in response.points]

In [33]:
def rag(query, search_fn=None):
    """Answer ``query`` using retrieved FAQ entries as context.

    This definition replaces the earlier keyword-based ``rag``. Retrieval
    goes through ``vector_search`` unless another retriever is supplied, so
    the keyword pipeline stays reachable with ``rag(query, search_fn=search)``.

    Args:
        query: The user's question.
        search_fn: Optional callable taking the query and returning the
            search results to pass to ``build_prompt``. ``None`` (the
            default) selects ``vector_search``.

    Returns:
        The answer text produced by ``llm``.
    """
    # Resolve the default at call time, not at definition time, so re-running
    # the vector_search cell is picked up without re-running this one.
    if search_fn is None:
        search_fn = vector_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [34]:
# Re-ask the Kafka question; rag now retrieves through vector_search.
rag('how do I run kafka?')

vector_search is used


'To run Kafka, first ensure that your broker Docker container is operating properly as suggested for a "NoBrokersAvailable" error. Use the docker ps command to verify its status and start all instances by running `docker-compose up -d` in the appropriate directory with your configurations set correctly such as updating secrets in Secrets.java, where KAFKA_CLUSTER_KEY is defined as a CONFLUX_SECRET key.\n\nFor issues related to missing kafka module or dependencies on certain operating systems like Windows, you may need to establish a Python virtual environment by running `python -m venv env`, then activate it with `.env/bin/activate` and install necessary packages (like psycopg2) via another terminal using the updated instructions I provided earlier.\n\nLastly, remember that for scripts like seed_kafka.py which are specific to certain frameworks in your project setup or cloud providers such as RisingWave with PostgreSQL connector issues mentioned by Alexey and myself above also required