In [1]:
import numpy as np
import pandas as pd
from rouge import Rouge
from typing import Literal
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# CSV of evaluation results hosted in the DataTalksClub/llm-zoomcamp GitHub repository.
github_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/04-monitoring/data/results-gpt4o-mini.csv"
url = f'{github_url}?raw=1'

# Read the whole file, then keep only its first 300 rows.
df = pd.read_csv(url).iloc[:300]
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [3]:
# Sanity check: the truncated frame must contain exactly 300 rows.
assert len(df) == 300

## Q1. Getting the embeddings model

In [4]:
# Identifier of the sentence-embedding model handed to SentenceTransformer.
model_name = "multi-qa-mpnet-base-dot-v1"
# Shared model instance; later cells call its `encode` method.
embedding_model = SentenceTransformer(model_name)

In [5]:
# LLM-generated answer stored in the first row of the frame.
answer_llm = df["answer_llm"].iloc[0]
answer_llm

'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).'

In [6]:
# Embed the single answer selected above.
embedding_vector = embedding_model.encode(answer_llm)
# Show the first component together with the vector's shape.
embedding_vector[0], embedding_vector.shape

(-0.42244658, (768,))

## Q2. Computing the dot product

In [7]:
def compute_dot_product(
        model: SentenceTransformer,
        row: pd.Series) -> np.float32:
    """Embed both answers of one record and return their dot product.

    Args:
        model: Encoder whose ``encode`` method turns a text into an
            embedding vector.
        row: Record exposing ``answer_llm`` and ``answer_orig``.

    Returns:
        The dot product of the two embedding vectors.
    """
    llm_vector = model.encode(row.answer_llm)
    orig_vector = model.encode(row.answer_orig)
    return llm_vector @ orig_vector

# One dot-product score per row (the row-wise apply hands each record to the helper).
evaluations = df.apply(
    func=lambda record: compute_dot_product(embedding_model, record),
    axis="columns",
)

In [8]:
# 75th percentile of the per-row scores.
np.percentile(evaluations, q=75.0)

31.674307823181152

## Q3. Computing the cosine

In [9]:
def compute_cos_similarity(
        model: SentenceTransformer,
        row: pd.Series) -> np.float32:
    """Return the cosine similarity between the embeddings of a record's answers.

    Both answers are embedded with ``model.encode``; each vector is scaled to
    unit L2 norm and the two unit vectors are multiplied with a dot product.

    Args:
        model: Encoder whose ``encode`` method turns a text into an
            embedding vector.
        row: Record exposing ``answer_llm`` and ``answer_orig``.

    Returns:
        The cosine similarity, or ``0.0`` when either embedding has zero
        norm (its direction is undefined, so no similarity can be measured).
    """
    lhs = model.encode(row.answer_llm)
    rhs = model.encode(row.answer_orig)
    lhs_norm = np.linalg.norm(lhs, 2)
    rhs_norm = np.linalg.norm(rhs, 2)
    # Dividing by a zero norm would produce NaN (plus a RuntimeWarning), and a
    # single NaN turns aggregates such as np.percentile into NaN as well.
    if lhs_norm == 0 or rhs_norm == 0:
        return np.float32(0.0)
    return (lhs / lhs_norm) @ (rhs / rhs_norm)
    


# One cosine-similarity score per row. Note that `evaluations` is rebound
# here, replacing the dot-product scores computed earlier.
evaluations = df.apply(
    func=lambda record: compute_cos_similarity(embedding_model, record),
    axis="columns",
)

# 75th percentile of the per-row cosine similarities.
np.percentile(evaluations, q=75.0)

0.8362347930669785

## Q4. Rouge

In [10]:
# Record at positional index 10, used for the single-row ROUGE cells below.
r = df.iloc[10]

In [11]:
# Guard against the dataset order changing: the selected record must be this document.
assert r["document"] == "5170565b"

In [12]:

rouge_scorer = Rouge()
# Score the LLM answer against the original answer; the `[0]` keeps the
# first element of what get_scores returns for this single pair.
scores = rouge_scorer.get_scores(r["answer_llm"], r["answer_orig"])[0]

In [13]:
# "p" entry of the rouge-1 scores for the selected record.
# NOTE(review): this cell reads "p" while get_score() further down reads "f" —
# confirm which of the two metrics is intended.
scores["rouge-1"]["p"]

0.45454545454545453

## Q5. Average rouge score

In [14]:
# Mean of the "p" entries across the three ROUGE variants for the selected record.
np.mean([scores[variant]["p"] for variant in ("rouge-1", "rouge-2", "rouge-l")])

0.35490035490035493

## Q6. Average rouge score for all the data points

In [15]:
rouge_scorer = Rouge()
# Score every LLM answer against its original answer. `scores` is rebound here:
# it held a single record's scores before and now holds the result for all rows.
scores = rouge_scorer.get_scores(df.answer_llm, df.answer_orig)

In [16]:
def get_score(scores: list[dict], 
              rtype: Literal["rouge-1", "rouge-2", "rouge-l"]) -> list:
    """Collect the "f" value of one ROUGE variant from every score entry.

    Args:
        scores: Sequence of mappings, each keyed by ROUGE variant and holding
            a nested mapping with an ``"f"`` key.
        rtype: The ROUGE variant to extract.

    Returns:
        The ``"f"`` values in the same order as ``scores``.
    """
    f_values = []
    for entry in scores:
        f_values.append(entry[rtype]["f"])
    return f_values

In [18]:
# ROUGE variants that become the columns of the per-row score table.
rtypes = ["rouge-1", "rouge-2", "rouge-l"]

In [19]:
# One column per ROUGE variant, one row per scored answer pair.
rouge_columns = {rtype: get_score(scores, rtype) for rtype in rtypes}
rouge_df = pd.DataFrame(rouge_columns)

In [20]:
# Mean of the rouge-2 column across all scored rows.
rouge_df["rouge-2"].mean()

0.20696501983423318