# Cosine Similarity

In [12]:
# Load documents with id
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [14]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [16]:
# Load ground truth data set

import pandas as pd
df_ground_truth = pd.read_csv('ground-truth-data.csv')
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [17]:
ground_truth[0]

{'question': 'What is the intention behind the FAQ document for the Machine Learning Zoomcamp?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [18]:
# Index id to be able to retrieve answer using id
doc_idx = {d['id']: d for d in documents}
doc_idx['c02e79ef']['text']

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [19]:
from sentence_transformers import SentenceTransformer

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

In [20]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [21]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    doc['question_text_vector'] = model.encode(question + ' ' + text)

    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:03<00:00, 15.03it/s]


In [22]:
def elastic_search_knn(field, vector, course):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 5,
        "num_candidates": 10000,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

def question_text_vector_knn(q):
    question = q['question']
    course = q['course']

    v_q = model.encode(question)

    return elastic_search_knn('question_text_vector', v_q, course)

In [23]:
question_text_vector_knn(dict(
    question='Are sessions recorded if I miss one?',
    course='machine-learning-zoomcamp'
))

[{'text': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
  'section': 'General course-related questions',
  'question': 'What if I miss a session?',
  'course': 'machine-learning-zoomcamp',
  'id': '5170565b'},
 {'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
  'section': 'General course-related questions',
  'question': 'Is it going to be live? When?',
  'course': 'machine-learning-zoomcamp',
  'id': '39fda9f0'},
 {'text': '(Hrithik Kumar Advani)',
  'section': '2. Machine Learning for Regression',
  'question': 'Useful Resource for

In [24]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [25]:
from openai import OpenAI

client = OpenAI()

def llm(prompt, model='gpt-4o'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [26]:
# previously: rag(query: str) -> str
def rag(query: dict, model='gpt-4o') -> str:
    search_results = question_text_vector_knn(query)
    prompt = build_prompt(query['question'], search_results)
    answer = llm(prompt, model=model)
    return answer

In [27]:
rag(ground_truth[0])

'The intention behind the FAQ document for the Machine Learning Zoomcamp is to capture frequently asked technical questions. This approach was inspired by its successful implementation in their data engineering course.'

In [28]:
doc_idx['c02e79ef']['text']

"The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."

In [29]:
answer_original = "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."
answer_llm = 'The intention behind the FAQ document for the Machine Learning Zoomcamp is to capture frequently asked technical questions. This approach was previously used for their data engineering course and was found to be effective. The document serves as a resource for participants to find structured answers and guidance on technical queries related to the course.'

v_llm = model.encode(answer_llm)
v_original = model.encode(answer_original)

v_llm.dot(v_original)

np.float32(0.16684134)

In [30]:
# Using GPT 4o-mini

rag(ground_truth[0], model='gpt-4o-mini')

'The intention behind the FAQ document for the Machine Learning Zoomcamp is to capture frequently asked technical questions, similar to what was done for the data engineering course. The aim is to provide a structured resource for course participants to reference.'

In [31]:
# Because of the time it takes, lets use multi threading or parallel processing to generate answers for every questions

from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(max_workers=6) # Using 6 concurrent threads

def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for el in seq:
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results

In [32]:
def process_record_4o_mini(rec):
    model = 'gpt-4o-mini'
    answer_llm = rag(rec, model=model)
    
    doc_id = rec['document']
    original_doc = doc_idx[doc_id]
    answer_original = original_doc['text']

    return {
        'answer_llm': answer_llm,
        'answer_original': answer_original,
        'document': doc_id,
        'question': rec['question'],
        'course': rec['course'],
    }

In [33]:
# results_gpt35 = map_progress(pool, ground_truth, process_record)

In [34]:
process_record_4o_mini(ground_truth[0])

{'answer_llm': 'The intention behind the FAQ document for the Machine Learning Zoomcamp is to capture frequently asked technical questions. This approach was successfully implemented in their data engineering course, serving as a model for structuring questions and answers.',
 'answer_original': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'What is the intention behind the FAQ document for the Machine Learning Zoomcamp?',
 'course': 'machine-learning-zoomcamp'}

In [35]:
results_gpt4o_mini = []

In [36]:
for record in tqdm(ground_truth):
    result = process_record_4o_mini(record)
    results_gpt4o_mini.append(result)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1838/1838 [46:59<00:00,  1.53s/it]


In [37]:
df_gpt4o_mini = pd.DataFrame(results_gpt4o_mini)
df_gpt4o_mini.to_csv('results_gpt4o_mini.csv', index=False)

In [38]:
results_gpt4o_mini = df_gpt4o_mini.to_dict(orient='records')

In [39]:
record = results_gpt4o_mini[0]

In [41]:
def compute_similarity(record):
    answer_original = record['answer_original']
    answer_llm = record['answer_llm']
    
    v_llm = model.encode(answer_llm)
    v_original = model.encode(answer_original)
    
    return v_llm.dot(v_original)

In [42]:
similarity = []

for record in tqdm(results_gpt4o_mini):
    sim = compute_similarity(record)
    similarity.append(sim)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 1838/1838 [02:39<00:00, 11.51it/s]


In [43]:
df_gpt4o_mini['cosine'] = similarity
df_gpt4o_mini['cosine'].describe()

count    1838.000000
mean        0.687138
std         0.203796
min        -0.142122
25%         0.597458
50%         0.734756
75%         0.831409
max         0.984759
Name: cosine, dtype: float64

# LLM As A Judge

In [5]:
# Load ground truth data set

import pandas as pd
df_gpt4o_mini = pd.read_csv('results_gpt4o_mini.csv')

In [6]:
df_gpt4o_mini

Unnamed: 0,answer_llm,answer_original,document,question,course
0,The intention behind the FAQ document for the ...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,What is the intention behind the FAQ document ...,machine-learning-zoomcamp
1,You can find a model for structuring questions...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I find a model for structuring quest...,machine-learning-zoomcamp
2,"To find the sign-up link for the course, you c...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where should I look to find the sign-up link f...,machine-learning-zoomcamp
3,The resource used to create frequently asked q...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,What resource was used to create frequently as...,machine-learning-zoomcamp
4,You can access the sign-up link for the Machin...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I access the sign-up link for the Mach...,machine-learning-zoomcamp
...,...,...,...,...,...
1833,You can list the Machine Learning Zoomcamp exp...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles I can use to li...,machine-learning-zoomcamp
1834,It is not appropriate to list the Machine Lear...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Is it appropriate to list the Machine Learning...,machine-learning-zoomcamp
1835,You can include your Machine Learning Zoomcamp...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Which LinkedIn sections can I use to include m...,machine-learning-zoomcamp
1836,To showcase your Machine Learning Zoomcamp pro...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,How can I showcase my Machine Learning Zoomcam...,machine-learning-zoomcamp


In [7]:
# We are randomly sampling just 150 records for this exercise as we know this takes a lot of time

df_sample = df_gpt4o_mini.sample(n=150, random_state=1)

In [8]:
samples = df_sample.to_dict(orient='records')

In [9]:
record = samples[0]
record

{'answer_llm': "A key consideration when using DictVectorizer in Homework 3, Question 6, is to avoid fitting the DictVectorizer on the validation dataset. Doing so would give the model access to labels that it should not know, compromising the validity of the performance estimate on unseen data. Instead, you should fit the DictVectorizer on the training data and then only transform the validation and test sets. This ensures that the validation results reflect the model's performance on truly unseen data.",
 'answer_original': "You need to use all features. and price for target. Don't include the average variable we created before.\nIf you use DictVectorizer then make sure to use sparce=True to avoid convergence errors\nI also used StandardScalar for numerical variable you can try running with or without this\n(Peter Pan)",
 'document': '4a55c510',
 'question': 'What is a key consideration when using DictVectorizer in Homework 3, Question 6?',
 'course': 'machine-learning-zoomcamp'}

In [10]:
# Two prompts because in case 1 we have all 3 requirements - Question, Original Answer and LLM Answer, but we will have another case where we dont always have the access to the original answer for which we generated questions which we may not have in production.

prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_original}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [11]:
prompt = prompt1_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: You need to use all features. and price for target. Don't include the average variable we created before.
If you use DictVectorizer then make sure to use sparce=True to avoid convergence errors
I also used StandardScalar for numerical variable you can try running with or without this
(Peter Pan)
Generated Question: What is a key consideration when using DictVectorizer in Homework 3, Question 6?
Generated Answer: A key consideration when using DictVectorizer in Homework 3, Question 6, is to avoid fitting the DictVectorizer on the validation dataset. Doing so would give the model access to labels tha

In [13]:
from openai import OpenAI

client = OpenAI()

def llm(prompt, model='gpt-4o'):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [16]:
answer = llm(prompt, model='gpt-4o-mini')

In [17]:
import json

In [19]:
from tqdm.auto import tqdm

evaluations = []

for record in tqdm(samples):
    prompt = prompt1_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations.append(evaluation)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [03:51<00:00,  1.54s/it]


In [21]:
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    json_eval = json.loads(str_eval)
    json_evaluations.append(json_eval)

In [22]:
df_evaluations = pd.DataFrame(json_evaluations)

In [23]:
df_evaluations.Relevance.value_counts()

Relevance
RELEVANT           133
PARTLY_RELEVANT     11
NON_RELEVANT         6
Name: count, dtype: int64

In [24]:
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT'] #.to_dict(orient='records')

Unnamed: 0,Relevance,Explanation
13,NON_RELEVANT,The generated answer discusses how to calculat...
26,NON_RELEVANT,The generated answer does not address the spec...
43,NON_RELEVANT,The generated answer addresses a completely di...
54,NON_RELEVANT,The generated answer addresses general queries...
83,NON_RELEVANT,The generated answer addresses a different iss...
113,NON_RELEVANT,The generated answer does not address the cont...


In [25]:
prompt = prompt2_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: How do I configure TensorFlow for GPU on an Ubuntu system?
Generated Answer: To configure TensorFlow for GPU on an Ubuntu system, you need to install CUDA and cuDNN. Here are the steps you can follow:

1. Install Nvidia drivers. You can find them at [Nvidia's official site](https://www.nvidia.com/download/index.aspx).
2. Install the CUDA toolkit (version 11.x.x) from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive).
3. Download and install cuDNN from the [cuDNN download page](https://developer.nvidia.com/rdp/cudnn-download).
4. Ensure that TensorFlow is installed, preferably using the `tensorflow-gpu` package.

For a more detailed g

In [26]:
evaluation = llm(prompt, model='gpt-4o-mini')
print(evaluation)

{
  "Relevance": "RELEVANT",
  "Explanation": "The generated answer directly addresses the question by providing specific steps to configure TensorFlow for GPU on an Ubuntu system. It includes essential components such as installing Nvidia drivers, CUDA, and cuDNN, which are crucial for GPU configuration. Additionally, it suggests using the `tensorflow-gpu` package and provides links for further resources, making it highly relevant to the user's query."
}


In [27]:
evaluations_2 = []

for record in tqdm(samples):
    prompt = prompt2_template.format(**record)
    evaluation = llm(prompt, model='gpt-4o-mini')
    evaluations_2.append(evaluation)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [03:41<00:00,  1.48s/it]


In [31]:
json_evaluations_2 = []

for i, str_eval in enumerate(evaluations_2):
    json_eval = json.loads(str_eval)
    json_evaluations_2.append(json_eval)

In [32]:
df_evaluations_2 = pd.DataFrame(json_evaluations_2)

In [33]:
df_evaluations_2.Relevance.value_counts()

Relevance
RELEVANT           134
PARTLY_RELEVANT     16
Name: count, dtype: int64

In [34]:
df_evaluations_2[df_evaluations_2.Relevance == 'NON_RELEVANT']

Unnamed: 0,Relevance,Explanation


In [35]:
df_evaluations.to_csv('evaluations-aqa.csv', index=False)
df_evaluations_2.to_csv('evaluations-qa.csv', index=False)