In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from openai import AzureOpenAI

In [2]:
# Tunable search settings used by the cells below.
DATASET_NAME = "embedding_index_3m.json"
SIMILARITIES_RESULTS_THRESHOLD = 0.75

# Load settings from a .env file (if any) before the environment lookups below.
load_dotenv()

# NOTE(review): no endpoint is passed here, so it presumably comes from the
# environment as well -- confirm against the SDK configuration.
client = AzureOpenAI(
    api_version="2023-05-15",
    api_key=os.environ['AZURE_OPENAI_API_KEY'],  # this is also the default, it can be omitted
)

# Name of the embeddings deployment passed as `model` on every embeddings call.
model = os.environ['AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT']

In [3]:
def load_dataset(source: str) -> pd.core.frame.DataFrame:
    """Read the JSON dataset at `source` into a DataFrame.

    The "text" column is removed when it is present, and every missing value
    in the remaining columns is replaced with an empty string.
    """
    frame = pd.read_json(source)
    without_text = frame.drop(columns="text", errors="ignore")
    return without_text.fillna("")

In [4]:
# Load the embedding index once; the interactive search cell reuses this frame.
pd_vectors = load_dataset(DATASET_NAME)
# Trailing expression: shown as the cell's output (first rows, up to 20).
pd_vectors.head(20)

Unnamed: 0,speaker,title,videoId,start,seconds,summary,ada_v2
0,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:00:00,0,Join Seth Juarez as he discusses ethical conce...,"[0.004357332363724, -0.028409153223037, 0.0111..."
1,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:03:07,187,"In this video, the speaker discusses the chall...","[-0.0038613036740570003, -0.004626247566193000..."
2,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:06:13,373,The video discusses the limitations of general...,"[0.00287682027556, -0.012365541420876001, 0.02..."
3,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:09:21,561,The video discusses the importance of consider...,"[0.015913352370262, 0.000721095071639, 0.02349..."
4,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:12:24,744,The video discusses the importance of understa...,"[5.447878720588051e-06, -0.011837740428745, 0...."
5,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:15:27,927,The quality of speech recognition can be signi...,"[-0.003642795607447, 0.026637941598892004, 0.0..."
6,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:18:30,1110,The video discusses the different types of mic...,"[-0.0029493020847430004, -0.003765580477192000..."
7,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:21:33,1293,Human beings have differences and judge each o...,"[0.0036908765323460002, 0.005433321464806001, ..."
8,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:24:35,1475,The conversation highlights the challenge of b...,"[0.009933217428624, -0.009805609472095, -0.001..."
9,"Seth Juarez, Josh Lovejoy, Sarah Bird",You're Not Solving the Problem You Think You'r...,-tJQm4mSh1s,00:27:44,1664,The video discusses the importance of embracin...,"[0.006041203159838001, -0.005441128276288, 0.0..."


In [5]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two 1-D vectors.

    If the vectors differ in length, the shorter one is right-padded with
    zeros so both have the same size before the dot product is taken.

    Args:
        a: first vector (sequence or NumPy array of numbers).
        b: second vector (sequence or NumPy array of numbers).

    Returns:
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero
        magnitude (all zeros or empty), since the ratio is undefined there.
    """
    if len(a) > len(b):
        b = np.pad(b, (0, len(a) - len(b)), 'constant')
    elif len(b) > len(a):
        a = np.pad(a, (0, len(b) - len(a)), 'constant')

    magnitude = np.linalg.norm(a) * np.linalg.norm(b)
    if magnitude == 0:
        # Avoid 0/0 -> NaN (plus a RuntimeWarning); treat as "no similarity".
        return 0.0
    return np.dot(a, b) / magnitude

def get_videos(
    query: str, dataset: pd.core.frame.DataFrame, rows: int
) -> pd.core.frame.DataFrame:
    """Return the dataset entries most similar to `query`.

    The query is embedded through the module-level `client` / `model`, scored
    against every vector in the "ada_v2" column with `cosine_similarity`, and
    entries scoring below SIMILARITIES_RESULTS_THRESHOLD are discarded.

    Args:
        query: free-text search string to embed.
        dataset: frame holding an "ada_v2" column of embedding vectors; it is
            not modified.
        rows: maximum number of matches to return.

    Returns:
        A new DataFrame with the dataset's columns plus "similarity", sorted
        by descending similarity and truncated to `rows` entries.
    """
    response = client.embeddings.create(input=query, model=model)
    # Convert once: the query vector is identical for every row being scored.
    query_vector = np.array(response.data[0].embedding)

    # assign() returns a new frame, so the caller's dataset stays untouched.
    scored = dataset.assign(
        similarity=dataset["ada_v2"].apply(
            lambda vector: cosine_similarity(query_vector, np.array(vector))
        )
    )

    # Keep only sufficiently similar entries, best matches first.
    matches = scored[scored["similarity"] >= SIMILARITIES_RESULTS_THRESHOLD]
    return matches.sort_values(by="similarity", ascending=False).head(rows)

In [6]:

def display_results(videos: pd.core.frame.DataFrame, query: str):
    """Print a readable listing of the videos matched for `query`.

    For every row of `videos` this prints the title, the first 15 words of
    the summary, a youtu.be link built from "videoId" and "seconds", the
    similarity score and the speakers.
    """
    print(f"\nVideos similar to '{query}':")
    for _, video in videos.iterrows():
        # The "t" query parameter carries the row's "seconds" value.
        link = f"https://youtu.be/{video['videoId']}?t={video['seconds']}"
        print(f" - {video['title']}")
        # Keep only the first 15 whitespace-separated words of the summary.
        preview = " ".join(video["summary"].split()[:15])
        print(f"   Summary: {preview}...")
        print(f"   YouTube: {link}")
        print(f"   Similarity: {video['similarity']}")
        print(f"   Speakers: {video['speaker']}")

In [7]:
# Interactive search: keep prompting until the user types "exit".
while True:
    # Strip stray whitespace so " exit " and padded queries behave as intended.
    query = input("Enter a query: ").strip()
    if query.lower() == "exit":
        break
    if not query:
        # Nothing to embed; the embeddings endpoint rejects empty input,
        # which would otherwise abort the whole loop with an API error.
        continue
    videos = get_videos(query, pd_vectors, 5)
    display_results(videos, query)


Videos similar to 'machine learning':
 - Automation to accelerate apply and outperform your machine learning models
   Summary: Nilesh Acharya, a Senior Program Manager at Microsoft, explains how automated machine learning (AutoML) can...
   YouTube: https://youtu.be/l8c-4iDPE0M?t=0
   Similarity: 0.8707700349040124
   Speakers: Nilesh Acharya
 - Teach a Bot with Project Conversation Learner
   Summary: The video discusses a machine learning process that is driven by code and allows for...
   YouTube: https://youtu.be/9DJcWyRkqBI?t=181
   Similarity: 0.8610728904105737
   Speakers: Jason Williams
 - Learn how to be a ML Hero using AutoML
   Summary: Aniththa Umamahesan, a Program Manager on the Azure Machine Learning Team at Microsoft, explains how...
   YouTube: https://youtu.be/vCVsrP1NLuw?t=1
   Similarity: 0.857715868735773
   Speakers: Aniththa Umamahesan
 - Automated Machine Learning on Azure
   Summary: In this episode of The AI Show, Kiana from the Azure Machine Learning team 