# Semantic Search Function

#### Imports

In [3]:
import polars as pl
import gradio as gr
from sklearn.metrics import DistanceMetric
import numpy as np
from sentence_transformers import SentenceTransformer

#### Load data, model and metric

In [4]:
# Lazily scan the parquet video index; rows are only materialised where the
# notebook later calls .collect() on (a selection of) this frame.
%time df = pl.scan_parquet('data/video-index.parquet')

CPU times: total: 0 ns
Wall time: 34 ms


In [5]:
# Sentence-transformers checkpoint used by returnSearchResults to embed queries.
# NOTE(review): presumably the same model that produced the embeddings stored
# in the index -- confirm against the index-building code.
model_name = 'all-MiniLM-L6-v2'
%time model = SentenceTransformer(model_name)



CPU times: total: 578 ms
Wall time: 7.41 s


In [6]:
# Distance metric used to compare the query embedding with the indexed
# embeddings. The threshold inside returnSearchResults was eyeballed for
# Manhattan distance, so revisit it if this metric is changed.
dist_name = 'manhattan'
%time dist = DistanceMetric.get_metric(dist_name)

CPU times: total: 0 ns
Wall time: 1.01 ms


#### Search function

In [8]:
def returnSearchResults(query: str, index: pl.LazyFrame, threshold: float = 400, top_k: int = 5) -> np.ndarray:
    """
    Return the row indexes of the top search results for a query.

    The query is embedded with the notebook-level `model`; its distance to two
    blocks of embedding columns in `index` is computed with the notebook-level
    `dist` metric, the two distances are summed per row, and the closest rows
    under `threshold` are returned.

    Args:
        query: free-text search query.
        index: lazy video index. Columns 4..387 and 388.. are treated as two
            embedding blocks -- presumably title and transcript embeddings
            (TODO confirm against the code that builds the index).
        threshold: rows whose summed distance is not strictly below this value
            are discarded. The default of 400 was eyeballed for Manhattan
            distance.
        top_k: maximum number of results to return.

    Returns:
        1-D integer array of row positions in `index`, closest first. Holds at
        most `top_k` entries and is empty when nothing passes the threshold.
    """

    # embed query as a single-row matrix so it can be used with dist.pairwise
    query_embedding = model.encode(query).reshape(1, -1)

    # column layout of the index: non-embedding columns first, then the two
    # embedding blocks back to back
    first_embedding_col = 4
    second_block_col = 388
    block_width = second_block_col - first_embedding_col

    # materialise only the embedding columns, and only once per query
    embedding_cols = index.columns[first_embedding_col:]
    embeddings = index.select(embedding_cols).collect().to_numpy()

    # compute distances between query and each embedding block, summed per row
    dist_arr = (
        dist.pairwise(embeddings[:, :block_width], query_embedding)
        + dist.pairwise(embeddings[:, block_width:], query_embedding)
    ).flatten()

    # evaluate videos close to query based on threshold
    idx_below_threshold = np.flatnonzero(dist_arr < threshold)

    # order the surviving rows from closest to farthest
    idx_sorted = np.argsort(dist_arr[idx_below_threshold])

    # return indexes of the top_k closest search results
    return idx_below_threshold[idx_sorted][:top_k]

In [9]:
# Smoke test: run one sample query against the index ...
query = "LLM"
idx_result = returnSearchResults(query, df)

# ... and show the id and title of the rows it selected
print(df.select(['video_id', 'title']).collect()[idx_result])

shape: (5, 2)
┌─────────────┬───────────────────────────────────┐
│ video_id    ┆ title                             │
│ ---         ┆ ---                               │
│ str         ┆ str                               │
╞═════════════╪═══════════════════════════════════╡
│ ytmK_ErTWss ┆ LLMs EXPLAINED in 60 seconds #ai  │
│ ZLbVdvOoTKM ┆ How to Build an LLM from Scratch… │
│ Ylz779Op9Pw ┆ How to Improve LLMs with RAG (Ov… │
│ tFHeUSJAYbE ┆ A Practical Introduction to Larg… │
│ eC6Hd1hFvos ┆ Fine-tuning Large Language Model… │
└─────────────┴───────────────────────────────────┘


In [10]:
# Same rows as a plain dict of lists -- the expression pseudoSearchAPI uses to build its response
df.select(['title', 'video_id']).collect()[idx_result].to_dict(as_series=False)

{'title': ['LLMs EXPLAINED in 60 seconds #ai',
  'How to Build an LLM from Scratch | An Overview',
  'How to Improve LLMs with RAG (Overview + Python Code)',
  'A Practical Introduction to Large Language Models (LLMs)',
  'Fine-tuning Large Language Models (LLMs) | w/ Example Code'],
 'video_id': ['ytmK_ErTWss',
  'ZLbVdvOoTKM',
  'Ylz779Op9Pw',
  'tFHeUSJAYbE',
  'eC6Hd1hFvos']}

#### Interface

In [11]:
def pseudoSearchAPI(query: str):
    """
    Stand-in for a search API call.

    Looks up the best-matching rows for `query` in the notebook-level index
    `df` and returns them as a dict with two parallel lists, keyed 'title'
    and 'video_id'.
    """
    # row positions of the best matches (up to 5 with the default settings)
    hit_rows = returnSearchResults(query, df)

    # materialise just the two columns the interface needs
    titles_and_ids = df.select(['title', 'video_id']).collect()

    return titles_and_ids[hit_rows].to_dict(as_series=False)

In [12]:
def formatResultText(title: str, video_id: str):
    """
    Build the Markdown shown next to a search result.

    The text is two leading line breaks, the video title as a level-1
    heading, and a link to the video on youtu.be.
    """
    heading = f"# {title}<br>"
    link = f"🔗 [Video Link](https://youtu.be/{video_id})"

    # the whitespace-only third line separates the heading from the link
    return "\n".join(["<br> <br>", heading, "    ", link])


In [14]:
def formatVideoEmbed(video_id: str) -> str:
    """
    Return the HTML for an embedded YouTube player showing `video_id`.

    Args:
        video_id: YouTube video id, inserted verbatim into the embed URL.

    Returns:
        A 576x324 <iframe> element pointing at
        https://www.youtube.com/embed/<video_id>.
    """
    # The id must sit inside the quoted src attribute, and height needs its
    # '=' -- otherwise the browser sees a player URL with no video id.
    # (Thumbnail-image links via img.youtube.com were considered as an
    # alternative to the embedded player.)
    return '<iframe width="576" height="324" src="https://www.youtube.com/embed/' + video_id + '"></iframe>'


In [13]:
def searchResults(query):
    """
    Gradio callback: run a search and return updates for the result slots.

    The returned list always holds 5 (HTML, Markdown) pairs, flattened into
    10 components: one visible pair per hit, followed by hidden pairs for the
    unused slots. When there are no hits at all, the first slot shows a
    "No results" message instead of a video.
    """
    # pseudo API call
    response = pseudoSearchAPI(query)

    updates = []

    # one visible (player, text) pair per search result
    for video_id, title in zip(response['video_id'], response['title']):
        updates.append(gr.HTML(value= formatVideoEmbed(video_id), visible=True))
        updates.append(gr.Markdown(value= formatResultText(title, video_id), visible=True))

    # number of result slots (out of 5) that are still unfilled
    unused_slots = 5 - len(response['title'])

    # nothing found: use the first slot for the "No results." message
    if unused_slots == 5:
        updates.append(gr.HTML(visible=False))
        updates.append(gr.Markdown(value= "No results. Try rephrasing your query.", visible=True))
        unused_slots -= 1

    # hide every remaining slot
    for _ in range(unused_slots):
        updates.append(gr.HTML(visible=False))
        updates.append(gr.Markdown(visible=False))

    return updates

#### Demo

In [15]:
# Components that searchResults updates: 5 (HTML, Markdown) pairs, in slot order
output_list = []

with gr.Blocks() as demo:
    gr.Markdown("# Youtube Search")

    # query box and search button side by side
    with gr.Row():
        inp = gr.Textbox(placeholder="What are you looking for?", label="Query", scale=3)
        btn = gr.Button("Search")

    # one row per result slot: embedded player on the left, title/link text on the right
    for i in range(5):
        with gr.Row():
            output_list.append(gr.HTML())
            output_list.append(gr.Markdown())

    # Register the listeners only once every output component exists.
    # Depending on the Gradio version, the outputs list may be resolved when
    # the listener is registered, so wiring the button while the list is still
    # empty can leave it with no outputs to update.
    btn.click(fn=searchResults, inputs=inp, outputs=output_list)
    inp.submit(fn=searchResults, inputs=inp, outputs=output_list)

demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


