# Semantic Search Function

#### Imports

In [3]:
import polars as pl
import gradio as gr
from sklearn.metrics import DistanceMetric
import numpy as np
from sentence_transformers import SentenceTransformer

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\adlercohen\AppData\Roaming\Python\Python311\site-packages\torch\lib\shm.dll" or one of its dependencies.

#### Load data, model and metric

In [9]:
# Open the video index lazily; rows are only read when .collect() is called later on
%time df = pl.scan_parquet('data/video-index.parquet')

CPU times: total: 0 ns
Wall time: 28.6 ms


In [10]:
# Sentence-transformers checkpoint used to embed search queries
model_name = 'all-MiniLM-L6-v2'
%time model = SentenceTransformer(model_name)



CPU times: total: 688 ms
Wall time: 7.24 s


In [11]:
# Distance metric used to compare the query embedding with the indexed embeddings
dist_name = 'manhattan'
%time dist = DistanceMetric.get_metric(dist_name)

CPU times: total: 0 ns
Wall time: 997 µs


#### Search function

In [13]:
def returnSearchResults(query: str, index: pl.LazyFrame, threshold: float = 400, top_k: int = 5) -> np.ndarray:
    """
        Function to return indexes of top search results

        Relies on the notebook-level `model` (query encoder) and `dist`
        (distance metric) objects.

        Parameters
        ----------
        query : text to search for
        index : lazy frame holding the video index; columns 4..387 and 388..end
            are used as two embedding blocks (presumably title and transcript
            embeddings -- confirm against the index schema)
        threshold : results whose summed distance is not below this value are
            dropped (default 400, eye-balled for manhattan distance)
        top_k : maximum number of results to return (default 5)

        Returns
        -------
        1-D array of row positions in `index`, closest match first; empty when
        nothing is below `threshold`.
    """

    # embed query as a single row so it can be compared pairwise
    query_embedding = model.encode(query).reshape(1, -1)

    # materialise the embedding columns once instead of collecting per block
    column_names = index.columns
    embeddings = index.select(column_names[4:]).collect()

    # compute distances between query and each embedding block, then sum them
    first_block_dist = dist.pairwise(embeddings.select(column_names[4:388]), query_embedding)
    second_block_dist = dist.pairwise(embeddings.select(column_names[388:]), query_embedding)
    dist_arr = (first_block_dist + second_block_dist).flatten()

    # evaluate videos close to query based on threshold
    idx_below_threshold = np.argwhere(dist_arr < threshold).flatten()

    # order the surviving videos from closest to farthest
    idx_sorted = np.argsort(dist_arr[idx_below_threshold])

    # return indexes of the top_k closest search results
    return idx_below_threshold[idx_sorted][:top_k]

In [14]:
# try the search function on a sample query
query = "LLM"
idx_result = returnSearchResults(query, df)

# show id and title of the matching videos
print(
    df.select(['video_id', 'title'])
    .collect()[idx_result]
)

shape: (5, 2)
┌─────────────┬───────────────────────────────────┐
│ video_id    ┆ title                             │
│ ---         ┆ ---                               │
│ str         ┆ str                               │
╞═════════════╪═══════════════════════════════════╡
│ ytmK_ErTWss ┆ LLMs EXPLAINED in 60 seconds #ai  │
│ ZLbVdvOoTKM ┆ How to Build an LLM from Scratch… │
│ Ylz779Op9Pw ┆ How to Improve LLMs with RAG (Ov… │
│ tFHeUSJAYbE ┆ A Practical Introduction to Larg… │
│ eC6Hd1hFvos ┆ Fine-tuning Large Language Model… │
└─────────────┴───────────────────────────────────┘


In [15]:
# same results as a plain dict of lists (column name -> values)
(
    df.select(['title', 'video_id'])
    .collect()[idx_result]
    .to_dict(as_series=False)
)

{'title': ['LLMs EXPLAINED in 60 seconds #ai',
  'How to Build an LLM from Scratch | An Overview',
  'How to Improve LLMs with RAG (Overview + Python Code)',
  'A Practical Introduction to Large Language Models (LLMs)',
  'Fine-tuning Large Language Models (LLMs) | w/ Example Code'],
 'video_id': ['ytmK_ErTWss',
  'ZLbVdvOoTKM',
  'Ylz779Op9Pw',
  'tFHeUSJAYbE',
  'eC6Hd1hFvos']}

#### Interface

In [16]:
def pseudoSearchAPI(query: str):
    """
        Look up `query` in the notebook-level index `df` and return the
        titles and video ids of the matching rows as a dict of lists.
    """

    # row positions of the search results
    hit_rows = returnSearchResults(query, df)

    # pull only the columns needed for the response
    titles_and_ids = df.select(['title', 'video_id']).collect()

    return titles_and_ids[hit_rows].to_dict(as_series=False)

In [17]:
def formatResultText(title: str, video_id: str):
    """
        Build the markdown snippet for a single search result: two HTML line
        breaks, the title as a level-1 heading, a whitespace-only line and a
        link to https://youtu.be/<video_id>.
    """

    pieces = [
        "<br> <br>",
        f"# {title}<br>",
        "    ",  # whitespace-only separator line
        f"🔗 [Video Link](https://youtu.be/{video_id})",
    ]

    return "\n".join(pieces)
