# Module 3: Vector-Search Homework

# 

# 

## Getting the data

In [2]:
import pandas as pd

In [3]:
# Location of the gpt-4o-mini results CSV inside the course repository.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

# The link above points at a "blob" page; the raw=1 query is appended so that
# the request presumably resolves to the CSV itself -- verify if the load fails.
url = github_url + '?raw=1'
df = pd.read_csv(url)

In [4]:
# Work with the first 300 rows only.
df = df.head(300)

## Q1. Getting the embeddings model

In [5]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model used for every embedding in this notebook.
model_name = "multi-qa-mpnet-base-dot-v1"

# Instantiate the model once; later cells reuse `embedding_model`.
embedding_model = SentenceTransformer(model_name)


  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [6]:
# LLM-generated answer of the first row (positional, not label-based).
answer_llm = df['answer_llm'].iloc[0]


In [8]:
# Embed the first LLM answer; the result is indexed as a vector in the next cell.
embedding = embedding_model.encode(answer_llm)


In [9]:
# Q1 answer: first component of the embedding, shown with two decimals.
print(f"The first value of the resulting vector is {embedding[0]:.2f}")

The first value of the resulting vector is -0.42


# 

# 

## Q2. Computing the dot product


In [13]:
import numpy as np

In [18]:
# Embed each answer column with one batched `encode` call instead of one model
# call per row: `encode` takes a list of texts and returns one embedding per
# text, in the same order as the input.
llm_answer_embeddings = embedding_model.encode(df['answer_llm'].tolist())
original_answer_embeddings = embedding_model.encode(df['answer_orig'].tolist())

# Row-wise dot product between each original answer and its LLM answer,
# assigned as a whole column rather than cell by cell. Stored as float64.
df['dot_product_score'] = (
    (original_answer_embeddings * llm_answer_embeddings).sum(axis=1).astype(float)
)

df

Unnamed: 0,answer_llm,answer_orig,document,question,course,dot_product_score
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,17.515993
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,13.418406
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,25.313251
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,12.147413
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,18.747730
...,...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp,34.001778
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp,33.690857
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp,34.491524
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp,27.538349


In [22]:
# Q2 answer: 75th percentile of the dot-product scores over all rows.
dot_product_scores = df['dot_product_score'].to_numpy()
percentile_75 = np.percentile(dot_product_scores, q=75)

print(f"The 75% percentile of the score is {percentile_75:.2f}")


The 75% percentile of the score is 31.67


# 

# 

## Q3. Computing the cosine


In [23]:
def normalize_vector(v):
    """Scale ``v`` to unit Euclidean length.

    Parameters
    ----------
    v : numpy.ndarray
        Vector to normalise.

    Returns
    -------
    numpy.ndarray
        ``v`` divided by its L2 norm. A zero vector has no direction, so it is
        returned as a float copy of itself instead of being divided by zero
        (which would produce NaNs that poison later statistics).
    """
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by 1.0 yields a fresh float array of zeros and avoids 0/0.
        return v / 1.0
    return v / norm

In [24]:
# Embed each answer column with one batched `encode` call instead of one model
# call per row; embeddings come back in the same order as the input texts.
llm_answer_embeddings = embedding_model.encode(df['answer_llm'].tolist())
original_answer_embeddings = embedding_model.encode(df['answer_orig'].tolist())

# Scale every embedding to unit length with the notebook's helper; the dot
# product of two unit vectors is their cosine similarity.
normalized_llm_answer_embeddings = np.stack(
    [normalize_vector(vector) for vector in llm_answer_embeddings]
)
normalized_original_answer_embeddings = np.stack(
    [normalize_vector(vector) for vector in original_answer_embeddings]
)

# Row-wise dot product, assigned as a whole column. Stored as float64.
df['normalized_dot_product_score'] = (
    (normalized_original_answer_embeddings * normalized_llm_answer_embeddings)
    .sum(axis=1)
    .astype(float)
)




In [25]:
# Q3 answer: 75th percentile of the cosine similarities over all rows.
cosine_scores = df['normalized_dot_product_score'].to_numpy()
cosine_percentile_75 = np.percentile(cosine_scores, q=75)

print(f"The 75% percentile of the cosine similarity score is {cosine_percentile_75:.2f}")


The 75% percentile of the cosine similarity score is 0.84


# 

# 

## Q4. Rouge

In [37]:
from rouge import Rouge
rouge_scorer = Rouge()

# Score only the pair at position 10 instead of scoring every row and throwing
# away all but one result. Besides the wasted work, scoring all rows would
# fail on any unrelated row with an empty answer.
row_10 = df.iloc[10]
# A single (hypothesis, reference) pair comes back as a one-element list.
scores = rouge_scorer.get_scores(row_10['answer_llm'], row_10['answer_orig'])[0]

scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [38]:
# Q4 answer: pull the F-measure out of the ROUGE-1 entry in two explicit steps.
rouge1_scores = scores.get('rouge-1')
idx_10_rouge1_f_score = rouge1_scores.get('f')

print(f"The F1 score for Rouge-1 of the document at index 10 is {idx_10_rouge1_f_score:.2f}")


The F1 score for Rouge-1 of the document at index 10 is 0.45


# 

# 

## Q5. Average rouge score

In [45]:
# Add up the F-measure of every ROUGE variant present in `scores`, counting
# the variants as we go so the next cell can average them.
f1_scores_acum = 0
qty = 0

for qty, variant_scores in enumerate(scores.values(), start=1):
    f1_scores_acum += variant_scores.get('f')



In [46]:
# Q5 answer: mean F-measure across the ROUGE variants accumulated in the previous cell.
avg_f1_scores = f1_scores_acum / qty

# Report the average with two decimals.
print(f"The average F1 score for Rouge scores of the document at index 10 is {avg_f1_scores:.2f}")

The average F1 score for Rouge scores of the document at index 10 is 0.35


# 

# 

## Q6. Average rouge score for all the data points

In [51]:
# ROUGE-2 F-measure for every (LLM answer, original answer) pair. The two
# columns are iterated together and the result is assigned as one column,
# rather than looking each cell up by label and writing it back one at a time.
# Each single-pair call returns a one-element list, hence the [0].
df['rouge2_f1_score'] = [
    rouge_scorer.get_scores(llm_answer, original_answer)[0]['rouge-2']['f']
    for llm_answer, original_answer in zip(df['answer_llm'], df['answer_orig'])
]

df

Unnamed: 0,answer_llm,answer_orig,document,question,course,dot_product_score,normalized_dot_product_score,rouge2_f1_score
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,17.515993,0.506754,0.028169
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,13.418406,0.388549,0.055556
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,25.313251,0.718599,0.177778
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,12.147413,0.337266,0.047059
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,18.747730,0.521792,0.033898
...,...,...,...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp,34.001778,0.914175,0.540984
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp,33.690857,0.902190,0.460432
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp,34.491524,0.904733,0.564516
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp,27.538349,0.726781,0.132231


In [52]:
# Q6 answer: mean ROUGE-2 F-measure over all rows.
avg_rouge2_f1_scores = df['rouge2_f1_score'].mean()

print(f"The average F1 score for Rouge-2 scores of all the documents is {avg_rouge2_f1_scores:.2f}")

The average F1 score for Rouge-2 scores of all the documents is 0.21
