# POC semantic search E2E analysis

The goal of this POC notebook is to import a model from hugging face, apply it to the dataset, evaluate on metrics, benchmark and create some visualisations. Essentially make sure the semantic search is working end to end.

In [15]:
import plotly.express as px
import pandas as pd
import numpy as np
from ranx import Qrels, Run, evaluate
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
import random

In [3]:
df_examples = pd.read_parquet('../data/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('../data/shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("../data/shopping_queries_dataset_sources.csv")

In [4]:
# https://github.com/amazon-science/esci-data: suggested filter for task 1: Query-Product Ranking 
# Query-Product Ranking: Given a user specified query and a list of matched products, the goal of this 
# task is to rank the products so that the relevant products are ranked above the non-relevant ones.
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

df_task_1 = df_examples_products[df_examples_products["small_version"] == 1]
df_task_1_train = df_task_1[df_task_1["split"] == "train"]
df_task_1_test = df_task_1[df_task_1["split"] == "test"]

In [29]:
# use code provided on huggingface to get started on multilingual text 
# semantic search siamese bert

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('SeyedAli/Multilingual-Text-Semantic-Search-Siamese-BERT-V1')

df_task_1_train_light = df_task_1_train[df_task_1_train['query_id'].isin([1,2,3,4])]

query_embeddings = model.encode(df_task_1_train_light['query'].tolist(), convert_to_tensor=True)
example_embeddings = model.encode(df_task_1_train_light['product_title'].tolist(), convert_to_tensor=True)

In [49]:
# calculate cosine similarity and get diagonal
similarities_mx = cosine_similarity(np.array(query_embeddings), np.array(example_embeddings))
similarities_diag = np.diag(similarities_mx)

  similarities_mx = cosine_similarity(np.array(query_embeddings), np.array(example_embeddings))


In [44]:
# apply esci mapping to esci label
esci_weighting = {
    'E': 3,
    'S': 2,
    'C': 1,
    'I': 0
}

df_task_1_train_light['relevance'] = df_task_1_train_light['esci_label'].map(esci_weighting)

In [45]:
# update index of subset df
df_task_1_train_light = df_task_1_train_light.reset_index(drop=True)
df_task_1_train_light.index = df_task_1_train_light.index + 1

In [58]:
qrels_dict = {}
run_dict = {}
top_n = 5

for query_id, group in df_task_1_train_light.groupby("query_id"):
    query_id_str = str(query_id)
    # get actuals
    qrels_dict[query_id_str] = {str(example): int(relevance) for example, relevance in zip(group["example_id"], group["relevance"])}
    
    # get scores paired to each example
    examples = group["example_id"].tolist()
    example_score_pairs = list(zip(examples, similarities_diag[:len(examples)]))
    
    # filter for top_n examples per query
    example_score_pairs_top_k = sorted(example_score_pairs, key=lambda x: x[1], reverse=True)[:top_n]

    # get predicted
    run_dict[query_id_str] = {str(example): score for example, score in example_score_pairs}
    
    print(f"Query ID: {query_id}")
    for example, score in example_score_pairs_top_k:
        print(f"{score:.2f}, example id {example}")

qrels = Qrels(qrels_dict)
run = Run(run_dict)

results = evaluate(qrels, run, metrics=["ndcg@10"])
print(results)

Query ID: 1
0.50, example id 17
0.49, example id 21
0.45, example id 28
0.44, example id 31
0.42, example id 26
Query ID: 3
0.66, example id 103
0.65, example id 89
0.63, example id 90
0.63, example id 88
0.62, example id 93
0.72157852690807
