In [32]:
### IMPORT MODULES AND SET CONSTANTS

from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Similarity cut-off for search results.
# NOTE(review): the only reference to THRESHOLD visible in this notebook is a
# commented-out filter inside get_videos — confirm whether it should be re-enabled.
THRESHOLD = 0.2

# Sentence-embedding model used to encode the user's query in get_videos.
# NOTE(review): get_videos compares query embeddings against the dataset column
# 'ada_v2'; the column name suggests those vectors may come from a different
# model. Cosine similarity is only meaningful when both embeddings come from the
# same model — TODO confirm.
model = SentenceTransformer("google/embeddinggemma-300m")
# Location of the pre-computed embedding index (JSON) read by load_dataset.
# NOTE(review): absolute, environment-specific path; the notebook will not run
# elsewhere without editing it.
path = "/workspaces/generative-ai-for-beginners/08-building-search-applications/embedding_index_3m.json"


In [33]:
### LOAD DATASET 
##### import the embedding JSON as a pandas DataFrame

def load_dataset(path: str) -> pd.core.frame.DataFrame:
    """Read a JSON file into a pandas DataFrame.

    Args:
        path: Location of the JSON file, passed straight to ``pd.read_json``.

    Returns:
        The DataFrame produced by ``pd.read_json`` with its default settings.
    """
    return pd.read_json(path)

In [34]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a: First vector (list, tuple or NumPy array of numbers).
        b: Second vector (list, tuple or NumPy array of numbers).

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. If either vector has zero norm
        (including empty input) the similarity is undefined, and 0.0 is
        returned instead of NaN so that callers can still sort on the score.

    Note:
        When the vectors differ in length, the shorter one is right-padded
        with zeros. This keeps the computation from failing, but embeddings of
        different dimensionality usually come from different models, in which
        case the resulting score carries little meaning.
    """
    # Coerce once so list inputs (e.g. parsed from JSON) and arrays behave alike.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Zero-pad the shorter vector so the dot product is defined.
    if len(a) > len(b):
        b = np.pad(b, (0, len(a) - len(b)), 'constant')
    elif len(a) < len(b):
        a = np.pad(a, (0, len(b) - len(a)), 'constant')

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(a, b) / denominator

In [35]:
### GET VIDEOS 
###### embed the user's input
###### define cosine similarity to compare 2 embeddings
###### create new column which is the result of comparing the input to each embedded text using cosine similarity
###### filter out results below the threshold (currently disabled in the code)
###### select the top `number_results` most similar outputs (5 in this notebook)

def get_videos(query: str, data: pd.core.frame.DataFrame, number_results: int) -> pd.core.frame.DataFrame:
    """Rank the rows of ``data`` by similarity to ``query``.

    The query is encoded with the module-level ``model`` and compared, via
    ``cosine_similarity``, against the value stored in each row's ``ada_v2``
    column (presumably a pre-computed embedding — verify against the dataset).

    Args:
        query: Free-text search string.
        data: Frame with an ``ada_v2`` column; it is not modified.
        number_results: How many of the top-ranked rows to keep.

    Returns:
        A new DataFrame with an added ``similarity`` column, sorted by that
        column in descending order and truncated to ``number_results`` rows.
    """
    query_embedding = model.encode(query)

    def score(stored_embedding):
        return cosine_similarity(stored_embedding, query_embedding)

    scored = data.assign(similarity=data['ada_v2'].apply(score))
    # The THRESHOLD cut-off is not applied here: the original filter
    # (similarity > THRESHOLD) was left commented out.
    ranked = scored.sort_values(by='similarity', ascending=False)
    return ranked.head(number_results)
    

In [36]:
### DISPLAY RESULTS
##### output with following format for each video result : - video name, url, ..

def display_results(videos: pd.core.frame.DataFrame, query: str) -> str:
    """Print a text report of the search results and return it.

    Args:
        videos: One row per result; each row must expose ``title``,
            ``videoId``, ``seconds``, ``summary``, ``speaker`` and
            ``similarity``.
        query: The search string the results were produced for; echoed in
            the report header.

    Returns:
        The report that was printed. (The function is annotated ``-> str`` but
        previously returned ``None``.)
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = [f"Here are the results for : {query}"]
    for video in videos.itertuples():
        # The leading whitespace inside the literal is part of the output.
        parts.append(f"""
        - {video.title}
        URL : https://youtu.be/{video.videoId}?t={video.seconds}
        Summary : {video.summary}
        Speaker : {video.speaker}
        Similarity score : {round(video.similarity,3)}
        """)
    text = "".join(parts)
    print(text)
    return text


In [37]:
# Load the embedding index once; every query below reuses it.
data = load_dataset(path)

# Interactive search: keep prompting until the user types exactly "stop".
# NOTE: this cell blocks on input(), so it will pause a "Run All".
while (query := input("What re you interested in ?")) != "stop":
    videos = get_videos(query, data, 5)  # keep the 5 best matches
    display_results(videos, query)


Here are the results for : embedding
        - What's new with Speech: Custom Neural Voice now in GA
        URL : https://youtu.be/HG7HxkTYGzw?t=924
        Summary : The video discusses the process of creating a synthetic voice using a new microphone. It emphasizes the importance of obtaining consent from voice actors and ensuring their involvement in the training process. The video also highlights the use of speech-to-text and speaker recognition technologies to validate the accuracy and authenticity of the voice recordings. Finally, it mentions the recommended minimum number of lines for training and the availability of pre-built test lines.
        Speaker : Seth, Edward, Sarah
        Similarity score : 0.038
        
        - AI Show Custom Skills In Azure Cognitive Search
        URL : https://youtu.be/fHLCE-NZeb4?t=735
        Summary : The video demonstrates how to connect an Azure Function to the rest of a pipeline for consuming data. The Azure Function is a custom skill th