In [1]:
from typing import List, Tuple
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import CouldNotRetrieveTranscript
import urllib.request
import re, os
import numpy as np
import pandas as pd
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

In [2]:
# Load variables from a local .env file into the process environment so the
# os.getenv lookups for the Azure credentials in the next cell can find them.
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Local Ollama chat model. It is created here but is not installed as the
# default LLM below.
local_llm = Ollama(model="llama3.2", request_timeout=60.0)

# Alternative local embedding backend served by Ollama, kept for reference:
# local_embed_model = OllamaEmbedding(
#     model_name="mxbai-embed-large",
#     base_url="http://localhost:11434",
#     ollama_additional_kwargs={"mirostat": 0},
# )

# Local sentence-transformers model; get_top_n_similar_videos uses it to embed
# the stored summaries.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
local_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Azure OpenAI chat model. Key and endpoint are read from the environment; a
# missing variable yields None rather than an error at this point.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    engine="gpt-35-turbo",
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT'),
    api_key=os.environ.get('AZURE_OPENAI_API_KEY'),
    api_version="2024-02-01",
)

# Azure OpenAI embedding model, configured from the same environment variables.
embed_model = AzureOpenAIEmbedding(
    model="text-embedding-3-large",
    deployment_name="text-embedding-3-large",
    azure_endpoint=os.environ.get('AZURE_OPENAI_ENDPOINT'),
    api_key=os.environ.get('AZURE_OPENAI_API_KEY'),
    api_version="2024-02-01",
)

# Install the Azure models as the llama_index-wide defaults.
Settings.embed_model = embed_model
Settings.llm = llm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_youtube_videos(search_keyword: str) -> List[str]:
    """Search YouTube and return links to videos that have a usable transcript.

    The YouTube results page for ``search_keyword`` is downloaded and scanned
    for video ids. Each distinct id is checked once: it is kept only when its
    transcript can be retrieved and has more than two entries. Scanning stops
    as soon as 10 videos have been accepted.

    Args:
        search_keyword: Free-text search query. It is URL-encoded before being
            placed in the query string, so characters such as ``&``, ``+`` or
            non-ASCII text are sent as part of the query.

    Returns:
        Up to 10 ``https://www.youtube.com/watch?v=<id>`` links, in the order
        the ids first appear in the results page.

    Raises:
        urllib.error.URLError: If the results page cannot be fetched.
    """
    from urllib.parse import quote_plus

    max_results = 10
    search_url = "https://www.youtube.com/results?search_query=" + quote_plus(search_keyword)

    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(search_url) as response:
        page_html = response.read().decode()

    accepted_ids = []
    # Every id occurs several times in the page; remember rejected ids too so
    # a failed transcript lookup is not repeated for each duplicate match.
    checked_ids = set()
    for video_id in re.findall(r"watch\?v=([A-Za-z0-9_-]{11})", page_html):
        if len(accepted_ids) == max_results:
            break
        if video_id in checked_ids:
            continue
        checked_ids.add(video_id)

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        except CouldNotRetrieveTranscript:
            # No retrievable transcript: the video cannot be summarised later.
            continue

        if len(transcript) > 2:
            accepted_ids.append(video_id)

    return ["https://www.youtube.com/watch?v=" + video_id for video_id in accepted_ids]

# Expose get_youtube_videos as a FunctionTool with an explicit description so
# it can be handed to an agent.
get_youtube_videos_tool = FunctionTool.from_defaults(
    fn=get_youtube_videos,
    description="Returns list of 10 youtube video links for given search keyword."
)

In [5]:
# ReAct agent whose only tool is the YouTube search tool. verbose=True prints
# the Thought / Action / Observation trace while the agent runs.
youtube_search_agent = ReActAgent.from_tools(
    tools=[get_youtube_videos_tool],
    llm=llm,
    verbose=True,
    context="Purpose: The primary role of this agent is to provide youtube video links for given query. Provide full link in answer."
)

In [6]:
# Ask the search agent for videos; response1.response is fed to the
# summarisation agent in a later cell.
response1 = youtube_search_agent.query("Can you give me 10 videos for Jupiter?")

> Running step 62bcf95e-dc5a-414a-92ca-b9ac814cd2c1. Step input: Can you give me 10 videos for Jupiter?
[1;3;38;5;200mThought: The user is asking for 10 YouTube videos related to the topic "Jupiter".
Action: get_youtube_videos
Action Input: {'search_keyword': 'Jupiter'}
[0m[1;3;34mObservation: ['https://www.youtube.com/watch?v=07Pmjxhuo4k', 'https://www.youtube.com/watch?v=XWVBBPcxoZE', 'https://www.youtube.com/watch?v=MHj0dksesho', 'https://www.youtube.com/watch?v=3vzagdNuOCo', 'https://www.youtube.com/watch?v=FZJ5w3xHrAc', 'https://www.youtube.com/watch?v=TDq3-wmihw4', 'https://www.youtube.com/watch?v=zmWcR-Mj2QQ', 'https://www.youtube.com/watch?v=jO9h2qez9Wg', 'https://www.youtube.com/watch?v=czrgVVRPzJY', 'https://www.youtube.com/watch?v=LmnRH4aWaug']
[0m> Running step 77e817d8-ea8a-4eab-9378-8545934ee1be. Step input: None
[1;3;38;5;200mThought: I have the YouTube video links related to "Jupiter".
Answer: Here are the YouTube video links related to Jupiter:
1. https://www.yout

In [7]:
# Show the search agent's final answer text.
print(response1.response)

Here are the YouTube video links related to Jupiter:
1. https://www.youtube.com/watch?v=07Pmjxhuo4k
2. https://www.youtube.com/watch?v=XWVBBPcxoZE
3. https://www.youtube.com/watch?v=MHj0dksesho
4. https://www.youtube.com/watch?v=3vzagdNuOCo
5. https://www.youtube.com/watch?v=FZJ5w3xHrAc
6. https://www.youtube.com/watch?v=TDq3-wmihw4
7. https://www.youtube.com/watch?v=zmWcR-Mj2QQ
8. https://www.youtube.com/watch?v=jO9h2qez9Wg
9. https://www.youtube.com/watch?v=czrgVVRPzJY
10. https://www.youtube.com/watch?v=LmnRH4aWaug


In [8]:
def summarize_text(text: str) -> str:
    """Return a concise summary of ``text`` produced by the configured chat LLM.

    A single user message containing the summarisation instruction followed by
    ``text`` is sent through ``llm.chat``; the content of the reply message is
    returned unchanged.
    """
    request = ChatMessage(
        role="user",
        content=f"Summarize the following text in a concise way:\n\n{text}",
    )
    reply = llm.chat([request])
    return reply.message.content

def youtube_links_to_summary(youtube_links: List[str]) -> bool:
    """Summarise the transcript of each link and store the results in a CSV file.

    Transcripts are loaded with YoutubeTranscriptReader, each one is summarised
    with summarize_text, and one row per document (``doc_id``, ``text``) is
    written to ``top_10_summaries.csv`` using ``|`` as the separator.

    Args:
        youtube_links: YouTube video URLs whose transcripts should be summarised.

    Returns:
        Always True once the file has been written.
    """
    transcripts = YoutubeTranscriptReader().load_data(ytlinks=youtube_links)

    rows = []
    for transcript in transcripts:
        rows.append({'doc_id': transcript.doc_id, 'text': summarize_text(transcript.text)})

    summary_table = pd.DataFrame(rows)
    summary_table.to_csv("top_10_summaries.csv", sep="|", index=False)
    return True

# def youtube_links_to_summary(youtube_links: List[str]) -> List[str]:
#     loader = YoutubeTranscriptReader()
#     documents = loader.load_data(ytlinks=youtube_links)
#     df = pd.DataFrame([{'doc_id':doc.doc_id,'text':summarize_text(doc.text)} for doc in documents])
#     df.to_csv("top_10_summaries.csv",index=False, sep="|")
#     return df.text.to_list()

# Expose youtube_links_to_summary as a tool. The active description matches the
# variant that writes the summaries to a file and returns True; the commented
# description belongs to the commented-out variant that returns the summaries.
summarize_youtube_video_tool = FunctionTool.from_defaults(
    fn=youtube_links_to_summary,
    # description="Returns array of summaries of all youtube links."
    description="Returns True if summaries are stored in file."
)

In [9]:
# ReAct agent whose only tool stores a summary for every supplied video link in
# a CSV file; the tool signals success by returning True. The context prompt
# previously read "Except True", a typo for "Expect True".
summarize_youtube_agent = ReActAgent.from_tools(
    tools=[summarize_youtube_video_tool],
    llm=llm,
    verbose=True,
    # context="Purpose: The primary role of this agent is to give summary of each youtube video link given in input."
    context="Purpose: The primary role of this agent is to save summary of each youtube video link in csv file. Expect True on successful execution."
)

In [11]:
# Pass the search agent's answer text (which contains the links) to the
# summarisation agent. The commented line is the query for the variant that
# returns the summaries instead of saving them.
response2 = summarize_youtube_agent.query("Save summary for each youtube links given here: "+response1.response)
# response2 = summarize_youtube_agent.query("Give summary for each youtube links given here: "+response1.response)

> Running step b27af1dc-62cd-44cd-bc38-d9f4da35cf21. Step input: Save summary for each youtube links given here: Here are the YouTube video links related to Jupiter:
1. https://www.youtube.com/watch?v=07Pmjxhuo4k
2. https://www.youtube.com/watch?v=XWVBBPcxoZE
3. https://www.youtube.com/watch?v=MHj0dksesho
4. https://www.youtube.com/watch?v=3vzagdNuOCo
5. https://www.youtube.com/watch?v=FZJ5w3xHrAc
6. https://www.youtube.com/watch?v=TDq3-wmihw4
7. https://www.youtube.com/watch?v=zmWcR-Mj2QQ
8. https://www.youtube.com/watch?v=jO9h2qez9Wg
9. https://www.youtube.com/watch?v=czrgVVRPzJY
10. https://www.youtube.com/watch?v=LmnRH4aWaug
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: youtube_links_to_summary
Action Input: {'youtube_links': ['https://www.youtube.com/watch?v=07Pmjxhuo4k', 'https://www.youtube.com/watch?v=XWVBBPcxoZE', 'https://www.youtube.com/watch?v=MHj0dksesho', 'https://www.youtube.com/watch?v=

In [12]:
# Show the summarisation agent's final answer text.
print(response2.response)

The summaries for the YouTube video links related to Jupiter have been successfully saved.


In [4]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Args:
        vec1: First vector (any sequence or array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of dividing by zero and propagating ``nan`` into
        later sorting/ranking code.
    """
    first = np.asarray(vec1)
    second = np.asarray(vec2)
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        return 0.0
    return np.dot(first, second) / norm_product

def get_top_n_similar_videos(top_n: int) -> List[dict]:
    """Return the ``top_n`` most similar pairs of summarised videos.

    Summaries are read from ``top_10_summaries.csv`` (``|``-separated, columns
    ``doc_id`` and ``text``), embedded with ``local_embed_model``, and compared
    pairwise with cosine_similarity. The upper-triangular similarity matrix
    (zeros on and below the diagonal), labelled with the video links, is
    written to ``similarity_mat.csv``.

    Args:
        top_n: Number of pairs requested. Values below zero are treated as
            zero, and values above the number of distinct pairs are clamped.

    Returns:
        A list of dicts with keys ``video1``, ``video2`` and ``similarity``,
        ordered from most to least similar. Only distinct pairs
        (``video1`` listed before ``video2`` in the CSV) are ever returned.
    """
    summaries = pd.read_csv("top_10_summaries.csv", sep="|")
    links = ["https://www.youtube.com/watch?v=" + str(doc_id) for doc_id in summaries["doc_id"]]
    embeddings = [local_embed_model.get_text_embedding(text) for text in summaries["text"]]

    # Size the matrix from the data instead of assuming exactly 10 summaries.
    n_videos = len(links)
    sim_matrix = np.zeros(shape=(n_videos, n_videos))
    rows, cols = np.triu_indices(n_videos, k=1)
    for i, j in zip(rows, cols):
        sim_matrix[i, j] = cosine_similarity(embeddings[i], embeddings[j])

    sim_df = pd.DataFrame(sim_matrix, columns=links, index=links)
    sim_df.to_csv("similarity_mat.csv")

    # Rank only the real (i < j) pairs. Ranking the whole flattened matrix lets
    # the zero-filled diagonal / lower triangle compete with genuine scores and
    # show up as bogus pairs when top_n is large or similarities are negative.
    pair_scores = sim_matrix[rows, cols]
    n_pairs = min(max(int(top_n), 0), len(pair_scores))
    best = np.argsort(-pair_scores, kind="stable")[:n_pairs]

    return [
        {
            'video1': links[rows[k]],
            'video2': links[cols[k]],
            'similarity': float(pair_scores[k]),
        }
        for k in best
    ]

# Expose get_top_n_similar_videos as a tool with an explicit description.
most_similar_videos_tool = FunctionTool.from_defaults(
    fn=get_top_n_similar_videos,
    description="Returns list of pairs of youtube links with most similarity."
)

In [5]:
# ReAct agent whose only tool is the similarity-ranking tool. verbose=True
# prints the Thought / Action / Observation trace while the agent runs.
similarity_agent = ReActAgent.from_tools(
    tools=[most_similar_videos_tool],
    llm=llm,
    verbose=True,
    context="Purpose: The primary role of this agent is to give n pairs of most similar youtube videos."
)

In [6]:
# Ask for the five most similar pairs; the tool reads the summaries CSV written
# by the summarisation step.
response3 = similarity_agent.query("Give me 5 pairs of most similar videos.")

> Running step dc82661c-6685-422d-934f-19b3d5a18503. Step input: Give me 5 pairs of most similar videos.
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: get_top_n_similar_videos
Action Input: {'top_n': 5}
[0m[1;3;34mObservation: [{'video1': 'https://www.youtube.com/watch?v=3vzagdNuOCo', 'video2': 'https://www.youtube.com/watch?v=FZJ5w3xHrAc', 'similarity': 0.6166502990482879}, {'video1': 'https://www.youtube.com/watch?v=3vzagdNuOCo', 'video2': 'https://www.youtube.com/watch?v=TDq3-wmihw4', 'similarity': 0.5854569089183563}, {'video1': 'https://www.youtube.com/watch?v=FZJ5w3xHrAc', 'video2': 'https://www.youtube.com/watch?v=TDq3-wmihw4', 'similarity': 0.5145864511244012}, {'video1': 'https://www.youtube.com/watch?v=TDq3-wmihw4', 'video2': 'https://www.youtube.com/watch?v=zmWcR-Mj2QQ', 'similarity': 0.47643341857437804}, {'video1': 'https://www.youtube.com/watch?v=07Pmjxhuo4k', 'video2': 'https://www.you

In [7]:
# Show the similarity agent's final answer text.
print(response3.response)

Here are the 5 pairs of most similar videos:
1. Video 1: [https://www.youtube.com/watch?v=3vzagdNuOCo], Video 2: [https://www.youtube.com/watch?v=FZJ5w3xHrAc], Similarity: 0.6167
2. Video 1: [https://www.youtube.com/watch?v=3vzagdNuOCo], Video 2: [https://www.youtube.com/watch?v=TDq3-wmihw4], Similarity: 0.5855
3. Video 1: [https://www.youtube.com/watch?v=FZJ5w3xHrAc], Video 2: [https://www.youtube.com/watch?v=TDq3-wmihw4], Similarity: 0.5146
4. Video 1: [https://www.youtube.com/watch?v=TDq3-wmihw4], Video 2: [https://www.youtube.com/watch?v=zmWcR-Mj2QQ], Similarity: 0.4764
5. Video 1: [https://www.youtube.com/watch?v=07Pmjxhuo4k], Video 2: [https://www.youtube.com/watch?v=LmnRH4aWaug], Similarity: 0.4694


In [10]:
# Reload the similarity matrix written by get_top_n_similar_videos; the first
# CSV column (the video links) becomes the index.
df = pd.read_csv('similarity_mat.csv',index_col=[0])

In [25]:
# Flatten the upper triangle of the similarity matrix into one record per
# distinct video pair. The bounds come from the loaded matrix rather than a
# hard-coded 10, so the cell also works when a different number of videos was
# summarised.
records = [
    {'video1': df.index[i], 'video2': df.columns[j], 'score': df.iloc[i, j]}
    for i in range(len(df))
    for j in range(i + 1, len(df))
]
rec_df = pd.DataFrame(records)

In [26]:
# Ten highest-scoring pairs, for comparison with the agent's answer above.
rec_df.sort_values('score',ascending=False).head(10)

Unnamed: 0,video1,video2,score
24,https://www.youtube.com/watch?v=3vzagdNuOCo,https://www.youtube.com/watch?v=FZJ5w3xHrAc,0.61665
25,https://www.youtube.com/watch?v=3vzagdNuOCo,https://www.youtube.com/watch?v=TDq3-wmihw4,0.585457
30,https://www.youtube.com/watch?v=FZJ5w3xHrAc,https://www.youtube.com/watch?v=TDq3-wmihw4,0.514586
35,https://www.youtube.com/watch?v=TDq3-wmihw4,https://www.youtube.com/watch?v=zmWcR-Mj2QQ,0.476433
8,https://www.youtube.com/watch?v=07Pmjxhuo4k,https://www.youtube.com/watch?v=LmnRH4aWaug,0.46936
32,https://www.youtube.com/watch?v=FZJ5w3xHrAc,https://www.youtube.com/watch?v=jO9h2qez9Wg,0.461005
27,https://www.youtube.com/watch?v=3vzagdNuOCo,https://www.youtube.com/watch?v=jO9h2qez9Wg,0.451152
26,https://www.youtube.com/watch?v=3vzagdNuOCo,https://www.youtube.com/watch?v=zmWcR-Mj2QQ,0.448302
31,https://www.youtube.com/watch?v=FZJ5w3xHrAc,https://www.youtube.com/watch?v=zmWcR-Mj2QQ,0.425237
11,https://www.youtube.com/watch?v=XWVBBPcxoZE,https://www.youtube.com/watch?v=FZJ5w3xHrAc,0.418047


In [27]:
# Load the hand-assigned scores; the cells below read its 'video1' and 'score'
# columns.
manual_scores = pd.read_csv('manual_scoring.csv')

In [34]:
# Sanity check that the manual scores are row-aligned with rec_df.
# NOTE(review): only 'video1' is compared; 'video2' is not checked, so this
# does not by itself prove that each row refers to the same pair — confirm
# against manual_scoring.csv.
(rec_df['video1']==manual_scores['video1']).all()

True

In [32]:
# Mean absolute difference between computed and manual scores. The divisor is
# the number of pairs in rec_df (45 for 10 videos) instead of a hard-coded 45.
np.absolute(rec_df['score'] - manual_scores['score']).sum()/len(rec_df)

0.219500281744659