In [1]:
import pandas as pd

In [2]:
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'

url = f'{github_url}?raw=1'
df = pd.read_csv(url)

In [3]:
df = df.iloc[:300]

## Q1. Getting the embeddings model

What's the first value of the resulting vector?

In [4]:
model_name = 'multi-qa-mpnet-base-dot-v1'

In [5]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name)

You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [6]:
answer_llm = df.iloc[0].answer_llm

In [7]:
embedding_model.encode(answer_llm)[0]

-0.4224466

## Q2. Computing the dot product


Now for each answer pair, let's create embeddings and compute dot product between them

We will put the results (scores) into the `evaluations` list

What's the 75% percentile of the score?

* 21.67
* 31.67
* 41.67
* 51.67

In [8]:
def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = embedding_model.encode(answer_llm)
    v_orig = embedding_model.encode(answer_orig)
    
    return v_llm.dot(v_orig)

In [9]:
df.head

<bound method NDFrame.head of                                             answer_llm  \
0    You can sign up for the course by visiting the...   
1    You can sign up using the link provided in the...   
2    Yes, there is an FAQ for the Machine Learning ...   
3    The context does not provide any specific info...   
4    To structure your questions and answers for th...   
..                                                 ...   
295  An alternative way to load the data using the ...   
296  You can directly download the dataset from Git...   
297  You can fetch data for homework using the `req...   
298  If the status code is 200 when downloading dat...   
299  If the file download fails when using the requ...   

                                           answer_orig  document  \
0    Machine Learning Zoomcamp FAQ\nThe purpose of ...  0227b872   
1    Machine Learning Zoomcamp FAQ\nThe purpose of ...  0227b872   
2    Machine Learning Zoomcamp FAQ\nThe purpose of ...  0227b872   
3

In [11]:
from tqdm.auto import tqdm

In [15]:
evaluations = []

for index, record in tqdm(df.iterrows()):
    sim = compute_similarity(record)
    evaluations.append(sim)

0it [00:00, ?it/s]

In [16]:
df['cosine'] = evaluations
df['cosine'].describe()

count    300.000000
mean      27.495996
std        6.384742
min        4.547926
25%       24.307844
50%       28.336872
75%       31.674310
max       39.476013
Name: cosine, dtype: float64

## Q3. Computing the cosine
What's the 75% cosine in the scores?

In [20]:
import numpy as np

In [17]:
def normalize_vector(v):
    norm = np.sqrt((v * v).sum())
    v_norm = v / norm
    return v_norm

In [18]:
def compute_similarity_norm(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = normalize_vector(embedding_model.encode(answer_llm))
    v_orig = normalize_vector(embedding_model.encode(answer_orig))
    
    return v_llm.dot(v_orig)

In [21]:
evaluations_norm = []

for index, record in tqdm(df.iterrows()):
    sim = compute_similarity_norm(record)
    evaluations_norm.append(sim)

0it [00:00, ?it/s]

In [22]:
df['cosine_norm'] = evaluations_norm
df['cosine_norm'].describe()

count    300.000000
mean       0.728393
std        0.157755
min        0.125357
25%        0.651273
50%        0.763761
75%        0.836235
max        0.958796
Name: cosine_norm, dtype: float64

## Q4. Rouge
There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

* `rouge-1` - the overlap of unigrams,
* `rouge-2` - bigrams,
* `rouge-l` - the longest common subsequence

What's the F score for `rouge-1`?

In [23]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [26]:
r = df.iloc[10]

In [27]:
from rouge import Rouge
rouge_scorer = Rouge()

scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]

In [28]:
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

## Q5. Average rouge score
Let's compute the average F-score between `rouge-1`, `rouge-2` and `rouge-l` for the same record from Q4

In [32]:
f_scores = []
for k, v in scores.items():
    f_scores.append(v['f'])

print(sum(f_scores)/len(f_scores))

0.35490034990035496


## Q6. Average rouge score for all the data points
Now let's compute the F-score for all the records and create a dataframe from them.

What's the average F-score in `rouge_2` across all the records?

In [33]:
def count_rouge_scores(r):
    scores = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]
    return scores

In [37]:
rouge_2_all = []
for r in tqdm(df.to_dict(orient="records")):
    scores = count_rouge_scores(r)
    rouge_2_all.append(scores['rouge-2']['f'])

  0%|          | 0/300 [00:00<?, ?it/s]

In [39]:
print(sum(rouge_2_all)/len(rouge_2_all))

0.20696501983423318
