# Compare Candidate Search Approaches

#### Imports

In [5]:
import polars as pl

from sentence_transformers import SentenceTransformer, util

from sklearn.metrics import DistanceMetric
import numpy as np
import matplotlib.pyplot as plt

#### Load data

In [3]:
# video data to search over; later cells read its 'title', 'transcript' and 'video_id' columns
df = pl.read_parquet('data/video-transcripts.parquet')
# evaluation set; later cells read its 'query' and 'video_id' columns
df_eval = pl.read_csv('data/eval-raw.csv')
# NOTE: only the last expression of the cell is displayed, so head() shows nothing here
df.head()
df.shape


(83, 4)

#### Embed titles and transcripts

In [4]:
# define "parameters"
# dataframe columns whose text gets embedded
column_to_embed_list = ['title', 'transcript']
# names passed to SentenceTransformer(...) in the embedding cells
model_name_list = ["all-MiniLM-L6-v2", "multi-qa-distilbert-cos-v1", "multi-qa-mpnet-base-dot-v1"]  # embedding models


In [7]:
# generate embeddings for each combination of column and model

# initialize dict to keep track of all text embeddings
text_embedding_dict = {}

for model_name in model_name_list:

    # define embedding model
    model = SentenceTransformer(model_name)

    for column_name in column_to_embed_list:

        # define text embedding identifier
        key_name = model_name + "_" + column_name
        print(key_name)

        # generate embeddings for text under column_name
        %time embedding_arr = model.encode(df[column_name].to_list())
        print('')

        # append embeddings to dict
        text_embedding_dict[key_name] = embedding_arr



all-MiniLM-L6-v2_title
CPU times: total: 2.78 s
Wall time: 832 ms

all-MiniLM-L6-v2_transcript
CPU times: total: 47 s
Wall time: 9.9 s

multi-qa-distilbert-cos-v1_title
CPU times: total: 6.14 s
Wall time: 1.54 s

multi-qa-distilbert-cos-v1_transcript
CPU times: total: 2min 4s
Wall time: 42.1 s



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.6.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





model.safetensors:  65%|######4   | 283M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

multi-qa-mpnet-base-dot-v1_title
CPU times: total: 12.9 s
Wall time: 2.88 s

multi-qa-mpnet-base-dot-v1_transcript
CPU times: total: 4min 6s
Wall time: 1min 23s



#### Embed queries

In [8]:
query_embedding_dict = {}

for model_name in model_name_list:

    # define embedding model
    model = SentenceTransformer(model_name)
    print(model_name)

    # embed query text
    %time embedding_arr = model.encode(df_eval['query'].to_list())
    print('')

    # append embedding to dict
    query_embedding_dict[model_name] = embedding_arr



all-MiniLM-L6-v2
CPU times: total: 4.72 s
Wall time: 1.03 s

multi-qa-distilbert-cos-v1
CPU times: total: 9.97 s
Wall time: 3.25 s



You try to use a model that was created with version 3.0.0.dev0, however, your version is 2.6.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





multi-qa-mpnet-base-dot-v1
CPU times: total: 40.2 s
Wall time: 8.9 s



#### Evaluate search methods

In [9]:
def returnVideoID_index(df: "pl.DataFrame", df_eval: "pl.DataFrame", query_n: int) -> int:
    """
        Function to return the index of a dataframe corresponding to the nth row in evaluation dataframe

        Args:
            df: dataframe with a 'video_id' column to search.
            df_eval: evaluation dataframe with a 'video_id' column.
            query_n: row number in df_eval whose video ID is looked up.

        Returns:
            Position of the first row of df whose 'video_id' equals
            df_eval['video_id'][query_n].

        Raises:
            IndexError: if that video ID does not appear in df.
    """

    # read the target once instead of once per row of df
    target_video_id = df_eval['video_id'][query_n]

    # stop at the first match rather than collecting every match
    for row_idx, video_id in enumerate(df['video_id']):
        if video_id == target_video_id:
            return row_idx

    # same exception type as before, but with a message that names the missing ID
    raise IndexError(f"video_id {target_video_id!r} (evaluation row {query_n}) not found in df")

In [10]:
def evalTrueRankings(dist_arr_isorted: np.ndarray, df:  pl.dataframe.frame.DataFrame, df_eval:  pl.dataframe.frame.DataFrame) -> np.ndarray:
    """
        Function to return the ranking of the "true" video ID for each evaluation query

        Column q of dist_arr_isorted is searched for the row index that
        returnVideoID_index(df, df_eval, q) returns; the position at which it
        first appears in that column is the ranking for query q.
        The result is a float array of shape (1, number of columns).
    """

    # one query per column of the sorted-index array
    n_queries = dist_arr_isorted.shape[1]

    # position of the "true" row index within each query's column
    rank_per_query = [
        np.flatnonzero(dist_arr_isorted[:, q] == returnVideoID_index(df, df_eval, q))[0]
        for q in range(n_queries)
    ]

    # single-row float array, one entry per query
    return np.array([rank_per_query], dtype=np.float64)

In [11]:
# initialize distance metrics to experiment
# names passed to sklearn's DistanceMetric.get_metric in the evaluation cells
dist_name_list = ['euclidean', 'manhattan', 'chebyshev']
# names of the similarity functions called on sentence_transformers.util in the evaluation cell
sim_name_list = ['cos_sim', 'dot_score']

In [13]:
# evaluate all possible combinations of model, columns to embed, and distance metrics

# initialise list to store results; each entry is [method_name, ranking for query 0, ranking for query 1, ...]
eval_results = []

# loop through all models
for model_name in model_name_list:

    # look up the precomputed query embeddings for this model
    query_embedding = query_embedding_dict[model_name]

    # loop through text columns
    for column_name in column_to_embed_list:

        # look up the precomputed text embeddings for this model/column pair
        embedding_arr = text_embedding_dict[model_name+'_'+column_name]

        # loop through distance metrics
        for dist_name in dist_name_list:

            # compute distance between video text and query
            dist = DistanceMetric.get_metric(dist_name)
            dist_arr = dist.pairwise(embedding_arr, query_embedding)

            # sort indexes of distance array
            dist_arr_isorted = np.argsort(dist_arr, axis=0)

            # define label for search method
            method_name = "_".join([model_name, column_name, dist_name])

            # evaluate the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)

        # loop through sbert similarity scores
        for sim_name in sim_name_list:

            # textual description of the call below; no longer executed, but still
            # assigned because a later cell displays `cmd`
            cmd = "dist_arr = -util." + sim_name + "(embedding_arr, query_embedding)"

            # look the similarity function up by name instead of exec-ing generated source;
            # the score is negated so the same ascending argsort as for the distances can be used
            dist_arr = -getattr(util, sim_name)(embedding_arr, query_embedding)

            # sort indexes of distance array
            dist_arr_isorted = np.argsort(dist_arr, axis=0)

            # define label for search method
            method_name = "_".join([model_name, column_name, sim_name.replace("_","-")])

            # define the ranking of the ground truth
            true_rank_arr = evalTrueRankings(dist_arr_isorted, df, df_eval)

            # store results
            eval_list = [method_name] + true_rank_arr.tolist()[0]
            eval_results.append(eval_list)



In [14]:
# show the last similarity command string built by the evaluation loop
cmd

'dist_arr = -util.dot_score(embedding_arr, query_embedding)'

In [None]:
# compute rankings for title + transcripts embedding
for model_name in model_name_list:

    # look up the precomputed embeddings for this model
    embedding_arr1 = text_embedding_dict[model_name+'_title']
    embedding_arr2 = text_embedding_dict[model_name+'_transcript']
    query_embedding = query_embedding_dict[model_name]


    for dist_name in dist_name_list:

        # combined distance: title-to-query distance plus transcript-to-query distance
        dist = DistanceMetric.get_metric(dist_name)
        dist_arr = dist.pairwise(embedding_arr1, query_embedding) + dist.pairwise(embedding_arr2, query_embedding)

        # sort indexes of distance array
        dist_arr_isorted = np.argsort(dist_arr, axis=0)

        