## Getting the data

In [1]:
import pandas as pd
import numpy as np

github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

In [2]:
url = f'{github_url}?raw=1'
df = pd.read_csv(url)

# Select only a part of the downloaded data
df = df.iloc[:300]

# Confirm the data
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


## Q1. Getting the embeddings model

In [3]:
from sentence_transformers import SentenceTransformer

model_name = "multi-qa-mpnet-base-dot-v1"

embedding_model = SentenceTransformer(model_name)

  from .autonotebook import tqdm as notebook_tqdm
You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [4]:
# Get the first llm answer
answer_llm = df.iloc[0].answer_llm

# Encode the answer
sentence_embedding = embedding_model.encode(answer_llm)

# Extract the first number
first_digit = sentence_embedding[0]

print(f"The first digit is: {first_digit}")

The first digit is: -0.42244666814804077


## Q2. Computing the dot product

In [5]:
def calculate_cosine_similarity(row):

    answer_llm = row["answer_llm"]
    answer_orig = row["answer_orig"]

    # Get the encoding for the rows
    answer_llm_embed = embedding_model.encode(answer_llm)
    answer_orig_embed = embedding_model.encode(answer_orig)

    return answer_llm_embed.dot(answer_orig_embed)

In [6]:
df["cosine_similarity"] =  df[["answer_llm", "answer_orig"]].apply(calculate_cosine_similarity, axis=1)

In [7]:
df["cosine_similarity"].describe()

count    300.000000
mean      27.495996
std        6.384743
min        4.547924
25%       24.307845
50%       28.336865
75%       31.674305
max       39.476017
Name: cosine_similarity, dtype: float64

## Q3. Computing the cosine

In [15]:
def normalize(orig_vect: pd.Series) -> pd.Series:

    norm = np.sqrt((orig_vect * orig_vect).sum())
    v_norm = orig_vect / norm

    return v_norm

In [16]:
df["cosine_sim_norm"] = normalize(df["cosine_similarity"])

In [17]:
df["cosine_sim_norm"].describe()

count    300.000000
mean       0.056244
std        0.013060
min        0.009303
25%        0.049722
50%        0.057964
75%        0.064790
max        0.080749
Name: cosine_sim_norm, dtype: float64

## Q4. Rouge


In [18]:
from rouge import Rouge

rouge_scorer = Rouge()

In [23]:
r = df.iloc[10]

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [24]:
scores["rouge-1"]["f"]

0.45454544954545456

In [25]:
average_f_score = (scores["rouge-1"]["f"] + scores["rouge-2"]["f"] + scores["rouge-l"]["f"]) / 3
average_f_score

0.35490034990035496

In [46]:
def compute_rouge(row):

    scores = rouge_scorer.get_scores(row["answer_llm"], row["answer_orig"])[0]

    return scores["rouge-2"]["f"]

In [47]:
df["rouge-2_f"] = df.apply(compute_rouge, axis=1)

In [49]:
df["rouge-2_f"].mean()

0.20696501983423318