### Getting the data

In [36]:
# Import all necessary libraries for this homework
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

In [6]:
# Fetch the evaluation results CSV from GitHub. The `?raw=1` query suffix is
# appended to the blob URL (presumably so GitHub serves the raw file - confirm).
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'

# Read the CSV and keep only the first 300 rows
df = pd.read_csv(url).iloc[:300]

In [8]:
# Check the resulting documents: a bare `df` as the last expression of the
# cell is rendered by the notebook as the cell output
df

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
295,An alternative way to load the data using the ...,Above users showed how to load the dataset dir...,8d209d6d,What is an alternative way to load the data us...,machine-learning-zoomcamp
296,You can directly download the dataset from Git...,Above users showed how to load the dataset dir...,8d209d6d,How can I directly download the dataset from G...,machine-learning-zoomcamp
297,You can fetch data for homework using the `req...,Above users showed how to load the dataset dir...,8d209d6d,Could you share a method to fetch data for hom...,machine-learning-zoomcamp
298,If the status code is 200 when downloading dat...,Above users showed how to load the dataset dir...,8d209d6d,What should I do if the status code is 200 whe...,machine-learning-zoomcamp


### Q1 Getting the embeddings model

In [15]:
# Load the 'multi-qa-mpnet-base-dot-v1' SentenceTransformer model; it is the
# encoder used by the cells that follow
embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# LLM-generated answer taken from the first row of the frame
answer_llm = df['answer_llm'].iloc[0]

In [35]:
# Embed the first LLM answer and report the first component of the vector,
# rounded to two decimals
vector = embedding_model.encode(answer_llm)
first_component = vector[0]
print("Answer 1: ", round(first_component, 2))

Answer 1:  -0.42


### Q2 Computing the dot product

In [24]:
# Dot-product similarity between the two answers of one record
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    Args:
        record: Mapping with the keys 'answer_orig' and 'answer_llm'. Both
            values are passed to `embedding_model.encode`.

    Returns:
        The dot product of the 'answer_llm' embedding with the 'answer_orig'
        embedding. The vectors are used as returned by the encoder; no
        normalisation is applied here.
    """
    text_orig, text_llm = record['answer_orig'], record['answer_llm']

    emb_llm = embedding_model.encode(text_llm)
    emb_orig = embedding_model.encode(text_orig)

    score = emb_llm.dot(emb_orig)
    return score

In [26]:
# Convert the frame into a list of per-row dictionaries
df_dict = df.to_dict(orient='records')

# Dot product for every record, in row order (tqdm renders the progress bar)
evaluations = [compute_similarity(record) for record in tqdm(df_dict)]

  0%|          | 0/300 [00:00<?, ?it/s]

In [34]:
# NOTE: despite its name, the 'cosine' column holds the raw dot products
# collected in `evaluations`, not cosine similarities.
df['cosine'] = evaluations

# Summarise once and reuse the summary for both the table and the quartile
dot_summary = df['cosine'].describe()
print(dot_summary)
print('\nAnswer 2: ', round(dot_summary.loc['75%'], 2))

count    300.000000
mean      27.495996
std        6.384743
min        4.547921
25%       24.307843
50%       28.336860
75%       31.674306
max       39.476017
Name: cosine, dtype: float64

Answer 2:  31.67


### Q3 Computing the cosine

In [57]:
# Cosine similarity: dot product of the two embeddings after scaling each
# one to unit length
def compute_normilised(record):
    """Return the cosine similarity of the embeddings of a record's answers.

    Each embedding is divided by its Euclidean length (the square root of the
    sum of its squared components) and the dot product of the two resulting
    unit vectors is returned.

    Args:
        record: Mapping with the keys 'answer_orig' and 'answer_llm'. Both
            values are passed to `embedding_model.encode`.

    Returns:
        The dot product of the normalised 'answer_llm' embedding with the
        normalised 'answer_orig' embedding.
    """
    text_orig, text_llm = record['answer_orig'], record['answer_llm']

    emb_llm = embedding_model.encode(text_llm)
    unit_llm = emb_llm / np.sqrt(np.sum(emb_llm * emb_llm))

    emb_orig = embedding_model.encode(text_orig)
    unit_orig = emb_orig / np.sqrt(np.sum(emb_orig * emb_orig))

    return unit_llm.dot(unit_orig)

# Cosine similarity for every record, collected in row order
evaluations_norm = []

for record in tqdm(df_dict):
    evaluations_norm.append(compute_normilised(record))

  0%|          | 0/300 [00:00<?, ?it/s]

In [59]:
# Store the per-record cosine similarities on the frame
df['cosine_norm'] = evaluations_norm

# Print the summary statistics, then the 75th percentile on its own
cosine_summary = df['cosine_norm'].describe()
print(cosine_summary)
print('Answer 3:', cosine_summary.loc['75%'])

count    300.000000
mean       0.728392
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_norm, dtype: float64
Answer 3: 0.8362347334623337


### Q4 Rouge