In [1]:
import requests
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from openai import OpenAI
import seaborn as sns
import matplotlib.pyplot as plt
import json
import numpy as np
from rouge import Rouge

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Read the gpt-4o-mini results CSV straight from GitHub (`?raw=1` asks for the
# raw file rather than the HTML page) and keep only the first 300 rows.
github_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'
url = github_url + '?raw=1'
df = pd.read_csv(url).iloc[:300]
df.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [3]:
# Identifier of the sentence-transformers model used for all embeddings below.
model_name = 'multi-qa-mpnet-base-dot-v1'
# Build the model once; later cells call `embedding_model.encode(...)` on it.
embedding_model = SentenceTransformer(model_name)

In [4]:
# LLM-generated answer of the first record.
answer_llm = df['answer_llm'].iloc[0]

What's the first value of the resulting vector?

In [5]:
# Embed the first LLM answer, then display the first component of the vector.
embedding = embedding_model.encode(answer_llm)
embedding[0]

-0.42244664

In [6]:
def compute_similarity(answer_orig, answer_llm):
    """Return the dot product between the embeddings of two answers.

    Both texts are embedded with the module-level `embedding_model`; the
    vectors are not normalized, so the result is a raw dot product.
    """
    llm_vec = embedding_model.encode(answer_llm)
    orig_vec = embedding_model.encode(answer_orig)
    return llm_vec.dot(orig_vec)

In [7]:
# Raw dot-product similarity between the original and the LLM answer, one
# value per row of `df`.
evaluations = [
    compute_similarity(row['answer_orig'], row['answer_llm'])
    for _, row in tqdm(df.iterrows(), total=len(df))
]


  0%|          | 0/300 [00:00<?, ?it/s]

In [8]:
# 75th percentile of the similarity scores collected in `evaluations`.
percentile_75 = np.percentile(evaluations, q=75)
print(f"The 75th percentile of evaluations is: {percentile_75}")

The 75th percentile of evaluations is: 31.674304008483887


In [9]:
def normalize_vector(v):
    """Scale a vector to unit Euclidean (L2) length.

    Args:
        v: Array-like of numbers (NumPy array, list, tuple). It is converted
            with `np.asarray`, so plain Python sequences are accepted.

    Returns:
        A NumPy array equal to `v / ||v||`. A vector whose norm is zero
        (all zeros, or empty) has no direction; it is returned as zeros
        rather than as NaNs.
    """
    v = np.asarray(v)
    norm = np.sqrt((v * v).sum())
    if norm == 0:
        # 0 / 0 would yield NaNs (plus a RuntimeWarning) that silently poison
        # every similarity computed from the result; dividing by 1 keeps the
        # zero vector as zeros.
        norm = 1.0
    return v / norm

In [10]:
def compute_similarity_normalized(answer_orig, answer_llm):
    """Return the dot product of the L2-normalized embeddings of two answers.

    Each text is embedded with the module-level `embedding_model` and passed
    through `normalize_vector` before the dot product is taken.
    """
    llm_unit = normalize_vector(embedding_model.encode(answer_llm))
    orig_unit = normalize_vector(embedding_model.encode(answer_orig))
    return np.dot(llm_unit, orig_unit)

In [11]:
# Recompute the per-row similarity, this time on normalized embeddings.
# Note: this replaces the raw dot-product values stored in `evaluations`.
evaluations = [
    compute_similarity_normalized(row['answer_orig'], row['answer_llm'])
    for _, row in tqdm(df.iterrows(), total=len(df))
]

  0%|          | 0/300 [00:00<?, ?it/s]

In [12]:
# 75th percentile of the similarity scores currently held in `evaluations`.
percentile_75 = np.percentile(evaluations, q=75)
print(f"The 75th percentile of evaluations is: {percentile_75}")


The 75th percentile of evaluations is: 0.8362347632646561


In [14]:
# Shared ROUGE scorer instance used by the ROUGE helper functions below.
rouge_scorer = Rouge()

In [15]:
def compute_rouge_score(answer_llm, answer_orig):
    """Score `answer_llm` (hypothesis) against `answer_orig` (reference).

    Delegates to the module-level `rouge_scorer` and returns the first entry
    of the list produced by `get_scores`.
    """
    per_pair_scores = rouge_scorer.get_scores(answer_llm, answer_orig)
    return per_pair_scores[0]


In [17]:
# Pick the record labelled 10 and pull out both answers in one lookup.
index = 10
answer_llm, answer_orig = df.loc[index, ['answer_llm', 'answer_orig']]

# ROUGE-1 / ROUGE-2 / ROUGE-L for that single pair.
scores = compute_rouge_score(answer_llm, answer_orig)
print(f"ROUGE scores at index {index}: {scores}")

ROUGE scores at index 10: {'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}


In [21]:
def compute_rouge_scores(answer_llm, answer_orig):
    """Score `answer_llm` (hypothesis) against `answer_orig` (reference).

    This is the same operation as `compute_rouge_score`, defined earlier in
    this notebook, so it delegates to that function rather than duplicating
    its body. Both names stay available to existing callers.

    Returns:
        Whatever `compute_rouge_score` returns for the pair: the first entry
        of the scorer's `get_scores` result.
    """
    return compute_rouge_score(answer_llm, answer_orig)

In [22]:
def compute_average_rouge(scores):
    """Return the mean of the ROUGE-1, ROUGE-2 and ROUGE-L F1 values.

    `scores` maps 'rouge-1', 'rouge-2' and 'rouge-l' to dicts that each hold
    an 'f' entry.
    """
    f1_rouge_1, f1_rouge_2, f1_rouge_l = (
        scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')
    )
    return (f1_rouge_1 + f1_rouge_2 + f1_rouge_l) / 3


In [23]:
# Select the record labelled 10 again and extract both answers in one lookup.
index = 10
answer_llm, answer_orig = df.loc[index, ['answer_llm', 'answer_orig']]

In [24]:
# Per-metric ROUGE scores for the selected pair, then the mean of their F1s.
scores = compute_rouge_scores(answer_llm, answer_orig)
average_rouge_score = compute_average_rouge(scores)

# One print call, one line per message.
print(
    f"ROUGE scores at index {index}: {scores}",
    f"Average ROUGE score (ROUGE-1, ROUGE-2, ROUGE-L) at index {index}: {average_rouge_score}",
    sep="\n",
)


ROUGE scores at index 10: {'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}
Average ROUGE score (ROUGE-1, ROUGE-2, ROUGE-L) at index 10: 0.35490034990035496


In [26]:
# Four independent accumulators: per-record ROUGE-1, ROUGE-2 and ROUGE-L F1
# values plus their average.
rouge_1_scores, rouge_2_scores, rouge_l_scores, rouge_avg_scores = [], [], [], []

In [27]:
# Compute ROUGE F1 scores for every record in `df`.
#
# The accumulators are (re)created in this cell so that it is idempotent:
# running it a second time replaces the results instead of appending another
# copy of every score to lists left over from the previous run.
rouge_1_scores = []
rouge_2_scores = []
rouge_l_scores = []
rouge_avg_scores = []

for index, record in tqdm(df.iterrows(), total=len(df)):
    scores = compute_rouge_scores(record['answer_llm'], record['answer_orig'])

    rouge_1_scores.append(scores['rouge-1']['f'])
    rouge_2_scores.append(scores['rouge-2']['f'])
    rouge_l_scores.append(scores['rouge-l']['f'])
    # Reuse the helper so the averaging rule lives in one place.
    rouge_avg_scores.append(compute_average_rouge(scores))


  0%|          | 0/300 [00:00<?, ?it/s]

In [28]:
# Assemble the per-record score lists into one table, one column per metric.
rouge_scores_df = pd.DataFrame(
    dict(
        zip(
            ['rouge-1', 'rouge-2', 'rouge-l', 'rouge-avg'],
            [rouge_1_scores, rouge_2_scores, rouge_l_scores, rouge_avg_scores],
        )
    )
)

In [29]:
# Mean of the ROUGE-2 F1 column over all scored records.
rouge_2_column = rouge_scores_df['rouge-2']
average_rouge_2 = rouge_2_column.mean()

print(f"The average ROUGE-2 score across all records is: {average_rouge_2}")

The average ROUGE-2 score across all records is: 0.20696501983423318
