# Semantic Search Function

Code authored by: Shaw Talebi <br>

Video link: https://youtu.be/6qCrvlHRhcM

### imports

In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer
from sklearn.metrics import DistanceMetric
import numpy as np
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


### load data, model, and metric

In [2]:
# Open the video index lazily; the search code later selects columns from it and calls .collect().
%time df = pl.scan_parquet('data/video-index.parquet')

CPU times: total: 0 ns
Wall time: 3 ms


In [3]:
# Sentence-embedding model; returnSearchResults uses model.encode() to embed the search query.
model_name = 'all-MiniLM-L6-v2'
%time model = SentenceTransformer(model_name)

CPU times: total: 3.16 s
Wall time: 4.74 s


In [4]:
# Distance metric used by returnSearchResults to compare the query embedding with the indexed embeddings.
dist_name = 'manhattan'
%time dist = DistanceMetric.get_metric(dist_name)

CPU times: total: 0 ns
Wall time: 0 ns


### search function

In [5]:
def returnSearchResults(
    query: str,
    index: pl.LazyFrame,
    threshold: float = 40,
    top_k: int = 5,
) -> np.ndarray:
    """
    Function to return indexes of top search results using semantic search.

    The query is embedded with the module-level ``model`` and compared, using
    the module-level ``dist`` metric, against the title embeddings and the
    transcript embeddings stored in ``index``. The two distances are summed
    per row and the rows with the smallest combined distance are returned.

    Args:
        query (str): Search query to encode and compare against index
        index (pl.LazyFrame): LazyFrame containing embeddings and metadata.
            Columns at positions 4..387 are read as title embeddings and
            columns from position 388 onward as transcript embeddings.
        threshold (float): Rows whose combined distance is strictly below
            this value count as matches. The default of 40 was chosen for
            the Manhattan distance.
        top_k (int): Maximum number of row indices to return.

    Returns:
        np.ndarray: Row indices into ``index``, ordered from closest to
        farthest. If at least one row is below ``threshold``, only such rows
        are returned (at most ``top_k``); otherwise the ``top_k`` closest
        rows overall are returned.
    """
    # Resolve the schema once; the embedding columns are located by position
    col_names = index.collect_schema().names()
    title_emb_cols = col_names[4:388]  # Title embeddings
    transcript_emb_cols = col_names[388:]  # Transcript embeddings

    # Embed the query as a single-row 2D array, as pairwise() expects
    query_embedding = model.encode(query).reshape(1, -1)

    # Materialize all embedding columns with a single collect()
    embeddings_df = index.select(title_emb_cols + transcript_emb_cols).collect()
    title_embeddings = embeddings_df.select(title_emb_cols)
    transcript_embeddings = embeddings_df.select(transcript_emb_cols)

    # Combined distance per row, flattened once to a 1D array
    distances = (
        dist.pairwise(title_embeddings, query_embedding)
        + dist.pairwise(transcript_embeddings, query_embedding)
    ).flatten()

    # Row indices whose combined distance is below the threshold
    idx_below_threshold = np.flatnonzero(distances < threshold)

    # If no results found below threshold, return top_k closest overall
    if idx_below_threshold.size == 0:
        return np.argsort(distances)[:top_k]

    # Order the matching rows by distance and keep the closest top_k
    idx_sorted = np.argsort(distances[idx_below_threshold])
    return idx_below_threshold[idx_sorted][:top_k]


In [6]:
# Example usage: run a search, then look up the matching rows in the index.
query = "LLM"
idx_result = returnSearchResults(query, df)

# Collect only the columns needed for display, then pick the result rows by index
results = df.select(['video_id', 'title']).collect()[idx_result]
print(results)

shape: (5, 2)
┌─────────────┬─────────────────────────────────┐
│ video_id    ┆ title                           │
│ ---         ┆ ---                             │
│ str         ┆ str                             │
╞═════════════╪═════════════════════════════════╡
│ ytmK_ErTWss ┆ LLMs EXPLAINED in 60 seconds #… │
│ ZLbVdvOoTKM ┆ How to Build an LLM from Scrat… │
│ Ylz779Op9Pw ┆ How to Improve LLMs with RAG (… │
│ tFHeUSJAYbE ┆ A Practical Introduction to La… │
│ eC6Hd1hFvos ┆ Fine-tuning Large Language Mod… │
└─────────────┴─────────────────────────────────┘


In [7]:
# Same lookup, converted to a plain dict of lists keyed by column name.
df.select(['title', 'video_id']).collect()[idx_result].to_dict(as_series=False)

{'title': ['LLMs EXPLAINED in 60 seconds #ai',
  'How to Build an LLM from Scratch | An Overview',
  'How to Improve LLMs with RAG (Overview + Python Code)',
  'A Practical Introduction to Large Language Models (LLMs)',
  'Fine-tuning Large Language Models (LLMs) | w/ Example Code'],
 'video_id': ['ytmK_ErTWss',
  'ZLbVdvOoTKM',
  'Ylz779Op9Pw',
  'tFHeUSJAYbE',
  'eC6Hd1hFvos']}

### interface

In [8]:
def pseudoSearchAPI(query: str):
    """
    Run the search locally against the global index ``df``.

    Args:
        query (str): Search query.

    Returns:
        dict: ``{'title': [...], 'video_id': [...]}`` for the rows selected
        by ``returnSearchResults``, in the order it returned them.
    """
    # Row positions of the best matches
    top_rows = returnSearchResults(query, df)

    # Only the two display columns are materialized
    display_columns = df.select(['title', 'video_id']).collect()

    return display_columns[top_rows].to_dict(as_series=False)

In [9]:
def formatResultText(title: str, video_id: str):
    """
    Build the Markdown text shown next to a search result.

    Args:
        title (str): Video title, rendered as a level-1 heading.
        video_id (str): YouTube video id, appended to ``https://youtu.be/``
            to form the link target.

    Returns:
        str: Markdown made of two leading ``<br>`` tags, the title heading,
        a blank line, and a "Video Link" hyperlink.
    """
    return (
        f"<br> <br>\n"
        f"# {title}<br>\n"
        "\n"
        f"🔗 [Video Link](https://youtu.be/{video_id})"
    )

In [10]:
def formatVideoEmbed(video_id: str):
    """
    Build the HTML ``<iframe>`` that embeds the YouTube player for a video.

    Args:
        video_id (str): YouTube video id, appended to the
            ``https://www.youtube.com/embed/`` URL.

    Returns:
        str: A 576x324 iframe element pointing at the video's embed URL.
    """
    # Alternatives that were considered instead of the player iframe: showing
    # the thumbnail https://img.youtube.com/vi/<video_id>/0.jpg, either inside
    # an iframe or as an <img> wrapped in a link to the video.
    iframe_open = '<iframe width="576" height="324" src="https://www.youtube.com/embed/'
    iframe_close = '"></iframe>'

    return iframe_open + video_id + iframe_close

In [11]:
import requests
import json
def searchAPI(query, timeout=10):
    """
    Send the query to the remote search service and return its JSON response.

    Args:
        query: Search text, sent as the ``query`` URL parameter.
        timeout: Seconds to wait for the service before giving up, so that an
            unreachable server cannot block the caller indefinitely.

    Returns:
        The decoded JSON body on success. If the request fails, returns a
        non-success status, times out, or the body is not valid JSON, the
        error is printed and ``{"title": [], "video_id": []}`` is returned.
    """
    # URL of the AWS-hosted search service
    url = 'http://34.203.203.145:80/search'

    # Request parameters
    params = {"query": query}

    # Send the GET request to the service
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise if the request did not succeed
        return json.loads(response.text)
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers json.JSONDecodeError from a malformed body
        print(f"Error making request: {e}")
        return {"title": [], "video_id": []}  # Empty result on error

In [None]:
def searchResults(query):
    """
    Run a search and build the Gradio component updates for the result slots.

    Args:
        query: Search text passed to ``searchAPI``.

    Returns:
        list: Exactly ``2 * 5`` components, alternating ``gr.HTML`` (video
        embed) and ``gr.Markdown`` (title and link), one pair per result slot.
        Slots without a result are hidden; when there are no results at all,
        the first slot shows a "No results" message instead.
    """
    # Number of (HTML, Markdown) result slots; the demo layout creates five rows
    num_slots = 5

    # Remote API call (pseudoSearchAPI(query) performs the same search locally)
    response = searchAPI(query)

    # Pair each title with its video id and never exceed the available slots,
    # so the number of returned components always matches the wired outputs
    results = list(zip(response['title'], response['video_id']))[:num_slots]

    # initialize list of outputs
    output_list = []

    # display search results
    for title, video_id in results:
        output_list.append(gr.HTML(value=formatVideoEmbed(video_id), visible=True))
        output_list.append(gr.Markdown(value=formatResultText(title, video_id), visible=True))

    # if no search results display "No results." text in the first slot
    if not results:
        output_list.append(gr.HTML(visible=False))
        output_list.append(gr.Markdown(value="No results. Try rephrasing your query.", visible=True))

    # make the remaining (empty) result slots invisible
    while len(output_list) < 2 * num_slots:
        output_list.append(gr.HTML(visible=False))
        output_list.append(gr.Markdown(visible=False))

    return output_list

In [13]:
# demo
output_list = []

with gr.Blocks() as demo:
    gr.Markdown("# YouTube Search")

    # query box and search button side by side
    with gr.Row():
        inp = gr.Textbox(placeholder="What are you looking for?", label="Query", scale=3)
        btn = gr.Button("Search")

    # five result slots, each row holding a video embed and its text
    for _ in range(5):
        with gr.Row():
            output_list.append(gr.HTML())
            output_list.append(gr.Markdown())

    # Register both events only after the output components exist: the outputs
    # list is read when the listener is registered, so wiring the button before
    # the slots are created would leave it with no outputs to update.
    btn.click(fn=searchResults, inputs=inp, outputs=output_list)
    inp.submit(fn=searchResults, inputs=inp, outputs=output_list)

demo.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


