In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import numpy as np
from rouge import Rouge

In [31]:
# GitHub "blob" page URL of the results CSV (results-gpt4o-mini.csv) that this notebook loads.
github_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv"

In [32]:
# Append the `?raw=1` query to the blob URL; this is expected to make GitHub
# serve the CSV file itself rather than the HTML page (NOTE(review): relies on
# GitHub's redirect behavior — confirm if the download ever breaks).
url = f'{github_url}?raw=1'
# Read the CSV at that URL into a DataFrame.
df = pd.read_csv(url)

In [33]:
# Restrict the frame to its first 300 rows; the cells below operate on this subset.
df = df.head(300)

# Q1

In [None]:
# Name of the sentence-transformers model to load.
model_name = "multi-qa-mpnet-base-dot-v1"
# Shared embedding model; the similarity functions below call its encode() method.
embedding_model = SentenceTransformer(model_name)

In [35]:
# Embed the LLM answer from the first row and show the first component of its vector.
answer_llm = df['answer_llm'].iloc[0]
v_llm = embedding_model.encode(answer_llm)
v_llm[0]

-0.42244673

# Q2

In [36]:
def compute_similarity(record):
    """Return the dot product of the embeddings of a record's two answers.

    Args:
        record: Mapping-like object with 'answer_orig' and 'answer_llm' entries
            (for example a DataFrame row).

    Returns:
        The dot product of the 'answer_llm' embedding with the 'answer_orig'
        embedding, both produced by the module-level ``embedding_model``.
    """
    text_orig, text_llm = record['answer_orig'], record['answer_llm']

    # Encode the LLM answer first, then the original answer.
    vec_llm, vec_orig = [embedding_model.encode(text) for text in (text_llm, text_orig)]

    return vec_llm.dot(vec_orig)

In [37]:
# Dot-product similarity between the LLM and original answer embeddings, one score per row.
# df.iterrows() is a generator without a length, so tqdm is given an explicit
# total; otherwise the progress bar cannot show completion or an ETA.
evaluations = [
    compute_similarity(record)
    for _, record in tqdm(df.iterrows(), total=len(df))
]

# Summary statistics of the scores.
eval_df = pd.DataFrame(evaluations)
eval_df.describe()

0it [00:00, ?it/s]

Unnamed: 0,0
count,300.0
mean,27.495996
std,6.384744
min,4.547924
25%,24.307847
50%,28.336862
75%,31.674308
max,39.476013


# Q3

In [38]:
def normalize(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (NumPy array, list, tuple, ...).

    Returns:
        A NumPy array equal to ``v`` divided by its L2 norm. If the norm is
        zero (all-zero or empty input) the values are returned unscaled, so
        the result contains zeros instead of NaNs.
    """
    # Accept plain sequences as well as arrays; `v * v` fails on a Python list.
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # Dividing by a zero norm would produce NaNs (and a RuntimeWarning)
        # that silently propagate into every downstream similarity score.
        norm = 1.0
    return v / norm

In [39]:
def compute_similarity_normalized(record):
    """Return the dot product of the normalized embeddings of a record's answers.

    Args:
        record: Mapping-like object with 'answer_orig' and 'answer_llm' entries
            (for example a DataFrame row).

    Returns:
        The dot product of the two embeddings after each has been passed
        through ``normalize``.
    """
    def _unit_embedding(text):
        # Embed the text, then pass the vector through normalize().
        return normalize(embedding_model.encode(text))

    text_orig = record['answer_orig']
    text_llm = record['answer_llm']

    unit_llm = _unit_embedding(text_llm)
    unit_orig = _unit_embedding(text_orig)
    return unit_llm.dot(unit_orig)

In [40]:
# Similarity of the normalized answer embeddings, one score per row.
# df.iterrows() is a generator without a length, so tqdm is given an explicit
# total; otherwise the progress bar cannot show completion or an ETA.
evaluations_norm = [
    compute_similarity_normalized(record)
    for _, record in tqdm(df.iterrows(), total=len(df))
]

# Summary statistics of the scores.
eval_df_norm = pd.DataFrame(evaluations_norm)
eval_df_norm.describe()

0it [00:00, ?it/s]

Unnamed: 0,0
count,300.0
mean,0.728392
std,0.157755
min,0.125357
25%,0.651273
50%,0.763761
75%,0.836235
max,0.958796


# Q4

In [41]:
# Scorer instance reused by the later ROUGE cells.
rouge_scorer = Rouge()

# Score the LLM answer against the original answer for the row at position 10.
r = df.iloc[10]
llm_text, orig_text = r['answer_llm'], r['answer_orig']
all_scores = rouge_scorer.get_scores(llm_text, orig_text)
scores = all_scores[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

# Q5

In [42]:
def average_f_score(s):
    """Average the 'f' values of the 'rouge-1', 'rouge-2' and 'rouge-l' entries.

    Args:
        s: Mapping with 'rouge-1', 'rouge-2' and 'rouge-l' keys, each holding
            a mapping that has an 'f' entry.

    Returns:
        The arithmetic mean of the three 'f' values.
    """
    metrics = ('rouge-1', 'rouge-2', 'rouge-l')
    f_values = [s[metric]['f'] for metric in metrics]
    return np.average(f_values)

In [43]:
# Mean of the rouge-1, rouge-2 and rouge-l F-scores held in `scores` (row 10 of df).
average_f_score(scores)

0.35490034990035496

# Q6

In [44]:
# 'rouge-2' entry of the ROUGE scores for every row (LLM answer vs. original answer).
# df.iterrows() is a generator without a length, so tqdm is given an explicit
# total; otherwise the progress bar cannot show completion or an ETA.
r2_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]['rouge-2']
    for _, record in tqdm(df.iterrows(), total=len(df))
]

# Average of the 'f' column across all rows.
df1 = pd.DataFrame(r2_scores)
np.average(df1['f'])

0it [00:00, ?it/s]

0.20696501983423318