### Question 1. Getting the embeddings model: First value (1 point)

In [None]:
import pandas as pd

# Location of the evaluation results file, relative to the notebook's
# working directory (a local path, despite the variable name).
url = "data/results-gpt4o-mini.csv"

# Load the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Restrict the working set to the first 300 rows.
df = df.head(300)

In [46]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   answer_llm   300 non-null    object
 1   answer_orig  300 non-null    object
 2   document     300 non-null    object
 3   question     300 non-null    object
 4   course       300 non-null    object
dtypes: object(5)
memory usage: 11.8+ KB


In [None]:
from sentence_transformers import SentenceTransformer
# Name of the sentence-transformers checkpoint used for every embedding below.
model_name = 'multi-qa-mpnet-base-dot-v1'

# Instantiate the encoder once; later cells reuse `embedding_model`.
embedding_model = SentenceTransformer(model_name_or_path=model_name)

In [48]:
# Take the LLM answer from the first row of the frame.
answer_llm = df['answer_llm'].iloc[0]

# Encode that single answer with the embedding model.
embeddings = embedding_model.encode(answer_llm)

# Report the leading component of the resulting vector.
first_value = embeddings[0]
print("First value of the resulting vector:", first_value)

First value of the resulting vector: -0.42244658


### Question 2. Dot product: 75th percentile (1 point)

In [30]:
import numpy as np
# Encode both answer columns, passing each one as a plain list of strings.
llm_answers = df['answer_llm'].to_list()
orig_answers = df['answer_orig'].to_list()
v_llm = embedding_model.encode(llm_answers)
v_orig = embedding_model.encode(orig_answers)

# Row-wise dot product: multiply element-wise, then sum along axis 1.
dot_products = (v_llm * v_orig).sum(axis=1)

# 75th percentile of the per-row dot-product scores.
percentile_75 = np.percentile(dot_products, q=75)
print("75th percentile of the scores:", percentile_75)

75th percentile of the scores: 31.674304008483887


### Question 3. Cosine 75th percentile (1 point)

In [49]:
def normalize(v):
    """Scale vectors to unit L2 length along the last axis.

    Parameters
    ----------
    v : array-like
        Either a single vector of shape ``(dim,)`` or a batch of vectors
        of shape ``(n, dim)``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``v`` in which every vector has been
        divided by its L2 norm. An all-zero vector is returned as zeros
        instead of NaN.
    """
    v = np.asarray(v)
    # Reduce over the last axis so both 1-D vectors and 2-D batches work;
    # keepdims lets the norm broadcast back against `v`.
    norm = np.sqrt((v * v).sum(axis=-1, keepdims=True))
    # A zero vector has no direction: divide it by 1 so it stays zero rather
    # than becoming NaN and contaminating downstream statistics.
    norm[norm == 0] = 1
    return v / norm

# Unit-normalise both embedding matrices.
v_llm_norm, v_orig_norm = normalize(v_llm), normalize(v_orig)

# With unit-length rows, the row-wise dot product is the cosine similarity.
cosine_similarities = (v_llm_norm * v_orig_norm).sum(axis=1)

# 75th percentile of the per-row cosine similarities.
percentile_75 = np.percentile(cosine_similarities, q=75)
print("75th percentile of the cosine similarities:", percentile_75)

75th percentile of the cosine similarities: 0.8362347483634949


### Question 4. Rouge 1 F (1 point)

In [40]:
from rouge import Rouge

# Row at position 10 (noted by the author as doc_id=5170565b).
sample_row = df.iloc[10]

rouge_scorer = Rouge()

# Score the LLM answer (hypothesis) against the original answer (reference)
# and keep the first entry of the result.
hypothesis = sample_row['answer_llm']
reference = sample_row['answer_orig']
scores = rouge_scorer.get_scores(hypothesis, reference)[0]

# F1 component of the ROUGE-1 metric for this pair.
rouge_1_f1_score = scores['rouge-1']['f']
print("ROUGE-1 F1 score:", rouge_1_f1_score)

ROUGE-1 F1 score: 0.45454544954545456


### Question 5. Average Rouge (1 point)

In [41]:
# Recompute the ROUGE scores for `sample_row` in this cell so the result does
# not depend on which cell last assigned the shared `scores` variable.
scores = rouge_scorer.get_scores(sample_row['answer_llm'], sample_row['answer_orig'])[0]

# F1 component of each ROUGE variant for the sample row.
f1_rouge_1 = scores['rouge-1']['f']
f1_rouge_2 = scores['rouge-2']['f']
f1_rouge_l = scores['rouge-l']['f']

# Unweighted mean of the three F1 values.
average_f1_score = (f1_rouge_1 + f1_rouge_2 + f1_rouge_l) / 3

print("Average F1 score:", average_f1_score)

Average F1 score: 0.35490034990035496


### Question 6. Average Rouge 2 (1 point)

In [50]:
# ROUGE-2 F1 for every (LLM answer, original answer) pair in the frame.
# A comprehension keeps the per-row values local, so the single-row `scores`
# and `f1_rouge_2` variables set by earlier cells are not overwritten, and it
# avoids the per-row Series construction of `iterrows`.
rouge_2_f1_scores = [
    rouge_scorer.get_scores(llm_answer, orig_answer)[0]['rouge-2']['f']
    for llm_answer, orig_answer in zip(df['answer_llm'], df['answer_orig'])
]

# Keep the per-row values in a frame and average them.
rouge_2_df = pd.DataFrame(rouge_2_f1_scores, columns=['rouge_2_f1'])
average_rouge_2_f1 = rouge_2_df['rouge_2_f1'].mean()

print("Average ROUGE-2 F1 score across all records:", average_rouge_2_f1)

Average ROUGE-2 F1 score across all records: 0.20696501983423318
