In [95]:
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [56]:
tqdm.pandas()

In [7]:
DATA_PATH = "../data/results-gpt4o-mini.csv"
model_name = "multi-qa-mpnet-base-dot-v1"

## Getting the data

In [3]:
df = pd.read_csv(DATA_PATH)

We will use only the first 300 documents:

In [5]:
df = df.iloc[:300]

## Q1. Getting the embeddings model

In [8]:
embedding_model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Create the embeddings for the first LLM answer:

In [11]:
answer_llm = df.iloc[0].answer_llm
embedded_answer_llm = embedding_model.encode(answer_llm)

What's the first value of the resulting vector?

In [13]:
embedded_answer_llm[0]

-0.42244673

## Q2. Computing the dot product

In [46]:
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [158]:
df["dot_product"] = df.loc[:, ["answer_llm", "answer_orig"]].progress_apply(
    lambda x: embedding_model.encode(x[0]) @ embedding_model.encode(x[1]),
    axis=1
)

  0%|          | 0/300 [00:00<?, ?it/s]

  lambda x: embedding_model.encode(x[0]) @ embedding_model.encode(x[1]),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["dot_product"] = df.loc[:, ["answer_llm", "answer_orig"]].progress_apply(


What's the 75% percentile of the score?

In [59]:
df["dot_product"].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547923
25%       24.307844
50%       28.336872
75%       31.674311
max       39.476013
Name: dot_product, dtype: float64

## Q3. Computing the cosine

In [157]:
df["cosine"] = df.loc[:, ["answer_llm", "answer_orig"]].progress_apply(
    lambda x: (
        lambda v1, v2: v1 / np.linalg.norm(v1) @ v2 / np.linalg.norm(v2)
    )(embedding_model.encode(x[0]), embedding_model.encode(x[1])),
    axis=1
)

  0%|          | 0/300 [00:00<?, ?it/s]

  )(embedding_model.encode(x[0]), embedding_model.encode(x[1])),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cosine'] = df.loc[:, ['answer_llm', 'answer_orig']].progress_apply(


What's the 75% cosine in the scores?

In [94]:
df["cosine"].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine, dtype: float64

## Q4. Rouge

In [97]:
df.iloc[10]

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
dot_product                                            32.344711
cosine                                                  0.777956
Name: 10, dtype: object

In [98]:
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(df.iloc[10]["answer_llm"], df.iloc[10]["answer_orig"])[0]

What's the F score for rouge-1?

In [100]:
scores["rouge-1"]["f"]

0.45454544954545456

## Q5. Average rouge score

In [105]:
np.mean([score["f"] for score in scores.values()])

0.35490034990035496

## Q6. Average rouge score for all the data points

In [156]:
df["rouge_1_r"], df["rouge_1_p"], df["rouge_1_f"], df["rouge_2_r"], df["rouge_2_p"], df["rouge_2_f"], df["rouge_l_r"], df["rouge_l_p"], df["rouge_l_f"] = zip(*df.loc[:, ["answer_llm", "answer_orig"]].progress_apply(
    lambda x: (
        lambda r: (
            r["rouge-1"]["r"],
            r["rouge-1"]["p"],
            r["rouge-1"]["f"],
            r["rouge-2"]["r"],
            r["rouge-2"]["p"],
            r["rouge-2"]["f"],
            r["rouge-l"]["r"],
            r["rouge-l"]["p"],
            r["rouge-l"]["f"]
        )
    )(rouge_scorer.get_scores(x[0], x[1])[0]),
    axis=1
))

  0%|          | 0/300 [00:00<?, ?it/s]

  )(rouge_scorer.get_scores(x[0], x[1])[0]),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rouge_1_r'], df['rouge_1_p'], df['rouge_1_f'], df['rouge_2_r'], df['rouge_2_p'], df['rouge_2_f'], df['rouge_l_r'], df['rouge_l_p'], df['rouge_l_f'] = zip(*df.loc[:, ['answer_llm', 'answer_orig']].progress_apply(


What's the average rouge_2 across all the records?

In [148]:
df["rouge_2_f"].describe()

count    300.000000
mean       0.206965
std        0.153550
min        0.000000
25%        0.097809
50%        0.178671
75%        0.286181
max        0.739130
Name: rouge_2_f, dtype: float64