# Evaluate Finetuned Embed Model

In [1]:
import os
import pandas as pd
import torch
import gc

In [2]:
from pydantic import BaseModel

class Config(BaseModel):
    testing: bool = False

    data_fp: str = "../data/yelp_dataset/sample/sample_100_biz/denom_review.parquet"
    storage_context_persist_dp: str = "./data/finetune_embedding/storage_context"
    gen_qa_embedding_pairs_dp: str = "./data/finetune_embedding/gen_qa_embedding_pairs"
    ft_model_dp: str = "./data/finetune_embedding/finetuned_model"
    eval_dp: str = "./data/finetune_embedding/eval"

    num_questions_per_chunk: int = 2

cfg = Config()

In [3]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path
import shutil

def evaluate_st(
    dataset,
    model_id,
    name,
    output_path: str = None
):
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    evaluator = InformationRetrievalEvaluator(
        queries, corpus, relevant_docs, name=name
    )

    model = SentenceTransformer(model_id)

    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    Path(output_path).mkdir(exist_ok=True, parents=True)
    
    result = evaluator(model, output_path=output_path)

    # Clear GPU memory
    del model
    torch.cuda.empty_cache()
    gc.collect()

    return result

In [4]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

val_output_path = f"{cfg.gen_qa_embedding_pairs_dp}/val/qa_finetuned_dataset.json"
val_dataset = EmbeddingQAFinetuneDataset.from_json(val_output_path)

In [5]:
%%time
name = 'finetuned'
model_id = cfg.ft_model_dp
output_csv_filename = f"Information-Retrieval_evaluation_{name}_results.csv"
output_path = f"{cfg.eval_dp}/{name}"
evaluate_st(val_dataset, model_id, name, output_path=output_path)
ft_eval_results = pd.read_csv(f"{output_path}/{output_csv_filename}")

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





CPU times: user 22.2 s, sys: 1.07 s, total: 23.3 s
Wall time: 21.2 s


In [6]:
ft_eval_results.T

Unnamed: 0,0
epoch,-1.0
steps,-1.0
cos_sim-Accuracy@1,0.3984
cos_sim-Accuracy@3,0.544364
cos_sim-Accuracy@5,0.612347
cos_sim-Accuracy@10,0.693327
cos_sim-Precision@1,0.3984
cos_sim-Recall@1,0.3984
cos_sim-Precision@3,0.181455
cos_sim-Recall@3,0.544364


In [7]:
%%time
name = 'bge'
model_id = "BAAI/bge-large-en"
output_csv_filename = f"Information-Retrieval_evaluation_{name}_results.csv"
output_path = f"{cfg.eval_dp}/{name}"
evaluate_st(val_dataset, model_id, name, output_path=output_path)
bge_eval_results = pd.read_csv(f"{output_path}/{output_csv_filename}")

CPU times: user 1min 5s, sys: 828 ms, total: 1min 5s
Wall time: 1min 8s


In [8]:
bge_eval_results.T

Unnamed: 0,0
epoch,-1.0
steps,-1.0
cos_sim-Accuracy@1,0.169708
cos_sim-Accuracy@3,0.265434
cos_sim-Accuracy@5,0.31892
cos_sim-Accuracy@10,0.395151
cos_sim-Precision@1,0.169708
cos_sim-Recall@1,0.169708
cos_sim-Precision@3,0.088478
cos_sim-Recall@3,0.265434


In [9]:
name = 'arctic'
model_id = "Snowflake/snowflake-arctic-embed-m-v1.5"
output_csv_filename = f"Information-Retrieval_evaluation_{name}_results.csv"
output_path = f"{cfg.eval_dp}/{name}"
evaluate_st(val_dataset, model_id, name, output_path=output_path)
arctic_eval_results = pd.read_csv(f"{output_path}/{output_csv_filename}")

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m-v1.5 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
arctic_eval_results

Unnamed: 0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
0,-1,-1,0.122719,0.187453,0.226943,0.284679,0.122719,0.122719,0.062484,0.187453,...,0.122719,0.062484,0.187453,0.045389,0.226943,0.028468,0.284679,0.167267,0.194887,0.174883


In [11]:
eval_df = pd.concat([
    pd.Series(arctic_eval_results.loc[0], name='arctic'),
    pd.Series(bge_eval_results.loc[0], name='bge'),
    pd.Series(ft_eval_results.loc[0], name='ft'),
], axis=1)

(
    eval_df
    .style
    .background_gradient(axis=1, low=0, high=1)
)

Unnamed: 0,arctic,bge,ft
epoch,-1.0,-1.0,-1.0
steps,-1.0,-1.0,-1.0
cos_sim-Accuracy@1,0.122719,0.169708,0.3984
cos_sim-Accuracy@3,0.187453,0.265434,0.544364
cos_sim-Accuracy@5,0.226943,0.31892,0.612347
cos_sim-Accuracy@10,0.284679,0.395151,0.693327
cos_sim-Precision@1,0.122719,0.169708,0.3984
cos_sim-Recall@1,0.122719,0.169708,0.3984
cos_sim-Precision@3,0.062484,0.088478,0.181455
cos_sim-Recall@3,0.187453,0.265434,0.544364


# Sanity check

In [12]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.schema import MetadataMode

val_docstore = SimpleDocumentStore.from_persist_dir(persist_dir=f"{cfg.storage_context_persist_dp}/val")

In [13]:
model_id = "Snowflake/snowflake-arctic-embed-m-v1.5"
base_model = SentenceTransformer(model_id)

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



Some weights of BertModel were not initialized from the model checkpoint at Snowflake/snowflake-arctic-embed-m-v1.5 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
i = 1
sanity_node = list(val_dataset.relevant_docs.values())[i][0]
sanity_question = list(val_dataset.queries.values())[i]

In [15]:
print(f"""
Generated Question:
{sanity_question}

From Context:
{val_docstore.get_node(sanity_node).get_content(metadata_mode=MetadataMode.LLM)}
"""
)


Generated Question:
What are some highly recommended ice cream shops known for unique flavors?

From Context:
review_stars: 5
biz_name: Mike's Ice Cream
biz_address: 129 2nd Ave N
biz_city: Nashville
biz_state: TN
biz_categories: ['Ice Cream & Frozen Yogurt', 'Coffee & Tea', 'Restaurants', 'Sandwiches', 'Food']

Some unique flavors too. Not much else to say but an amazing ice cream store that others should try to emulate.



In [16]:
import numpy as np
from numpy.linalg import norm

def cosine_sim(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

def calculate_sim(model, text1, text2):
    text1_embeddings = model.encode(text1)
    text2_embeddings = model.encode(text2)

    return cosine_sim(text1_embeddings, text2_embeddings)

calculate_sim(base_model, sanity_question, sanity_node)

0.47004718

In [17]:
model_id = cfg.ft_model_dp
ft_model = SentenceTransformer(model_id)

You try to use a model that was created with version 2.7.0.dev0, however, your version is 2.7.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [18]:
calculate_sim(ft_model, sanity_question, sanity_node)

0.09254191

## Check score between review text and review text + metadata

In [25]:
review_text = "Some unique flavors too. Not much else to say but an amazing ice cream store that others should try to emulate."

review_text_metadata = """
review_stars: 5
biz_name: Mike's Ice Cream
biz_address: 129 2nd Ave N
biz_city: Nashville
biz_state: TN
biz_categories: ['Ice Cream & Frozen Yogurt', 'Coffee & Tea', 'Restaurants', 'Sandwiches', 'Food']

Some unique flavors too. Not much else to say but an amazing ice cream store that others should try to emulate.
"""

In [26]:
calculate_sim(base_model, review_text, review_text_metadata)

0.54210067

In [27]:
calculate_sim(ft_model, review_text, review_text_metadata)

0.5803141