In [1]:
from typing import List, Tuple
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.core.agent import ReActAgent
from llama_index.core.tools import FunctionTool
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import CouldNotRetrieveTranscript
import urllib.request
import re, os, yt_dlp
import numpy as np
import pandas as pd
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

### Loading environment variables

In [2]:
# Populate the process environment from a local .env file (if one is present) so the
# os.getenv lookups in the model configuration can find the Azure OpenAI credentials.
from dotenv import load_dotenv
load_dotenv()

True

#### Declaring LLM and Embedding Models
Using local models for heavy tasks due to cost constraints

In [3]:
# Locally hosted models: an Ollama chat model and a sentence-transformers embedder.
local_llm = Ollama(model="llama3.2", request_timeout=60.0)
local_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Connection settings shared by both Azure OpenAI clients; the key and endpoint
# come from the environment so no credentials live in the notebook.
_azure_connection = {
    "api_key": os.getenv('AZURE_OPENAI_API_KEY'),
    "api_version": "2024-02-01",
    "azure_endpoint": os.getenv('AZURE_OPENAI_ENDPOINT'),
}

llm = AzureOpenAI(
    engine="gpt-35-turbo",
    model="gpt-35-turbo",
    **_azure_connection,
)

embed_model = AzureOpenAIEmbedding(
    model="text-embedding-3-large",
    deployment_name="text-embedding-3-large",
    **_azure_connection,
)

# Register the Azure models as the llama_index-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model

#### Function for parsing top 10 videos from YouTube

In [9]:
def get_youtube_videos(search_keyword: str, results: int = 10) -> List[str]:
    """Search YouTube and return links to videos that have a usable transcript.

    The search-results page is scraped for video IDs in page order. A video is
    kept only when its transcript can be retrieved and contains more than two
    entries other than the ``[Music]`` placeholder.

    Args:
        search_keyword: Free-text search query; it is URL-encoded before use.
        results: Maximum number of links to return. Non-positive values yield
            an empty list.

    Returns:
        Up to ``results`` ``https://www.youtube.com/watch?v=...`` URLs. Fewer
        are returned when the page does not contain enough qualifying videos.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote_plus

    if results <= 0:
        return []

    # quote_plus turns spaces into '+' (as before) and also escapes characters
    # such as '&', '#' or non-ASCII text that would otherwise corrupt the URL.
    search_url = "https://www.youtube.com/results?search_query=" + quote_plus(search_keyword)
    with urllib.request.urlopen(search_url) as response:
        page = response.read().decode()

    accepted = []
    # Every ID appears several times in the page; remembering the ones already
    # examined avoids re-downloading transcripts for videos that were rejected.
    seen = set()
    # YouTube video IDs are 11 characters drawn from letters, digits, '_' and '-'.
    for video_id in re.findall(r"watch\?v=([A-Za-z0-9_-]{11})", page):
        if len(accepted) >= results:
            break
        if video_id in seen:
            continue
        seen.add(video_id)

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
        except CouldNotRetrieveTranscript:
            # No transcript available for this video: skip it.
            continue

        valid_chunks = sum(1 for chunk in transcript if chunk['text'] != '[Music]')
        if valid_chunks > 2:
            accepted.append(video_id)

    return ["https://www.youtube.com/watch?v=" + video_id for video_id in accepted]

# Expose the YouTube search helper to llama_index agents as a callable tool.
get_youtube_videos_tool = FunctionTool.from_defaults(
    description="Returns list of 10 youtube video links for given search keyword.",
    fn=get_youtube_videos,
)

#### Function for parsing title of given youtube video

In [6]:
def get_youtube_title(youtube_url: str) -> str:
    """Look up the title of a YouTube video without downloading it.

    Args:
        youtube_url: URL of the video to inspect.

    Returns:
        The video's title, ``'Title not found'`` when the extracted metadata
        has no title, or an ``"Error occurred: ..."`` message when the lookup
        raises.
    """
    with yt_dlp.YoutubeDL({}) as downloader:
        try:
            # download=False asks yt_dlp for metadata only.
            metadata = downloader.extract_info(youtube_url, download=False)
            title = metadata.get('title', 'Title not found')
        except Exception as err:
            title = f"Error occurred: {str(err)}"
        return title

#### Saving video list for repeated usage
Skip this step if already saved

In [None]:
# One-off step: search YouTube, resolve every title, and cache the result on disk
# so later cells can reload it without hitting the network again.
video_list = get_youtube_videos('jupiter')

video_rows = []
for vid in video_list:
    video_rows.append({'title': get_youtube_title(vid), 'link': vid})

df = pd.DataFrame(video_rows)
df.to_csv("video_list.csv", index=False)

#### Loading video list and getting titles

In [12]:
# Reload the cached video list (columns: title, link) written by the saving step
# and display it.
df = pd.read_csv("video_list.csv")
df

Unnamed: 0,title,link
0,Coldplay - JUPiTER (Official Lyric Visualiser),https://www.youtube.com/watch?v=07Pmjxhuo4k
1,Thoughts about the new Jupiter 110? #jupiter110,https://www.youtube.com/watch?v=MHj0dksesho
2,You can see Jupiter! 🪐,https://www.youtube.com/watch?v=FZJ5w3xHrAc
3,The Largest Planet In The Universe,https://www.youtube.com/watch?v=zmWcR-Mj2QQ
4,Jupiters Giant Red Spot is Acting Strange!,https://www.youtube.com/watch?v=TDq3-wmihw4
5,younger life in Jupiter #shorts #funny #anima...,https://www.youtube.com/watch?v=CFQfKhVweBI
6,Jupiter VS Saturn #shorts #space,https://www.youtube.com/watch?v=tvGgt-a445g
7,What is YOUR first impression of the new TVS J...,https://www.youtube.com/watch?v=czrgVVRPzJY
8,What If You Fell into Jupiter? #Shorts,https://www.youtube.com/watch?v=4lQAD07KRAg
9,Hello Jupiter #astrophotography #space #telesc...,https://www.youtube.com/watch?v=3vzagdNuOCo


#### Manual Observations
Noting similarity of all 45 pairs of videos

In [13]:
# Enumerate every unordered pair (i < j) of videos in the loaded list; with the
# 10 saved videos this gives the 45 pairs that are labelled by hand.
n_videos = len(df)
combinations = [
    {'v1_idx': i, 'v2_idx': j}
    for i in range(n_videos)
    for j in range(i + 1, n_videos)
]
comb_df = pd.DataFrame(combinations, columns=['v1_idx', 'v2_idx'])

# combinations.csv is edited by hand afterwards (the is_similar column), so it
# is only created when missing: re-running this cell must not wipe those
# labels. Delete the file manually to regenerate it for a new video list.
if not os.path.exists('combinations.csv'):
    comb_df.to_csv('combinations.csv', index=False)

I manually added an "is_similar" column to the CSV; the next cell reads the labelled file back in.

In [14]:
# Reload the pair list, which now includes the hand-entered is_similar column,
# and preview the first rows.
comb_df = pd.read_csv("combinations.csv")
comb_df.head()

Unnamed: 0,v1_idx,v2_idx,is_similar
0,0,1,0
1,0,2,0
2,0,3,0
3,0,4,0
4,0,5,0


In [15]:
# Class balance of the manual labels (0 = not similar, 1 = similar).
comb_df['is_similar'].value_counts()

is_similar
0    23
1    22
Name: count, dtype: int64

# Using plain LLM approach

Summary function using LLM

In [23]:
def summarize_text(text: str) -> str:
    """Ask the configured chat LLM (``llm``) for a concise summary of ``text``.

    Args:
        text: The text to summarise.

    Returns:
        The content of the model's reply message.
    """
    request = ChatMessage(
        role="user",
        content=f"Summarize the following text in a concise way:\n\n{text}",
    )
    reply = llm.chat([request])
    return reply.message.content

Getting summary for all links

In [27]:
# Load the transcript of every saved video, then summarise each transcript with the LLM.
loader = YoutubeTranscriptReader()
documents = loader.load_data(ytlinks=df['link'].tolist())

summary_rows = []
for doc in documents:
    summary_rows.append({'doc_id': doc.doc_id, 'text': summarize_text(doc.text)})
sum_df = pd.DataFrame(summary_rows)

In [28]:
# Display the per-video summaries (doc_id, text).
sum_df

Unnamed: 0,doc_id,text
0,07Pmjxhuo4k,The text is about a person struggling with the...
1,MHj0dksesho,The new Jitter 110 has a bigger seat and handl...
2,FZJ5w3xHrAc,Jupiter will be closer to Earth than it has be...
3,zmWcR-Mj2QQ,Jupiter is the largest planet in our solar sys...
4,TDq3-wmihw4,"Jupiter's giant red spot, a storm that has bee..."
5,CFQfKhVweBI,Living on Jupiter would make us younger becaus...
6,tvGgt-a445g,The text is about taking a step back in music.
7,czrgVVRPzJY,"The new TVs Jupyter 110 has a new design, LED ..."
8,4lQAD07KRAg,Falling into Jupiter would involve reaching hi...
9,3vzagdNuOCo,The text describes capturing Jupiter's Great R...


In [111]:
# Prompt template; {summaries} is replaced by one numbered line per video.
prompt = '''
This are 10 videos from youtube for keyword 'Jupiter' and their summaries:

{summaries}

Based on the summary, can you give me pairs of videos which are of similar topic out of all 45 possible pairs?
'''

# Number the summaries from 1 using the frame's index.
summary_lines = []
for i, row in sum_df.iterrows():
    summary_lines.append(f"{i+1}. {row['doc_id']}: {row['text']}")

full_prompt = prompt.format(summaries='\n'.join(summary_lines))
print(full_prompt)


This are 10 videos from youtube for keyword 'Jupiter' and their summaries:

1. 07Pmjxhuo4k: The text is about a person struggling with their identity and self-acceptance, but ultimately finding the message to never give up on love.
2. MHj0dksesho: The new Jitter 110 has a bigger seat and handlebar, making it comfortable for taller riders but potentially challenging for shorter riders. The reviewer, who is 5'9, finds it comfortable, while a colleague who is 5'5 finds flat footing difficult but has plenty of room. Overall, the reviewer is super comfortable with plenty of room.
3. FZJ5w3xHrAc: Jupiter will be closer to Earth than it has been in 59 years, and the speaker plans to point a dynamic laser at it. It will be slightly brighter than usual.
4. zmWcR-Mj2QQ: Jupiter is the largest planet in our solar system, with a size so big that all other planets could fit inside it. The largest exoplanet, hd100546b, is a gas giant with a mass of 752 Jupiters and a radius of 300,000 miles, locate

In [112]:
# Send the assembled prompt as a single user message and show the model's reply.
pairing_request = ChatMessage(role="user", content=full_prompt)
response = llm.chat([pairing_request])
print(response.message.content)

Sure! Here are some pairs of videos that are of similar topic:

1. Videos 2 and 8 both discuss the features and comfort of the Jupyter 110.
2. Videos 4 and 9 both discuss scientific facts and phenomena related to Jupiter.
3. Videos 5 and 10 both discuss the Great Red Spot on Jupiter and its characteristics.

I hope this helps!


# Agent approach

In [48]:
def youtube_links_to_summary(youtube_links: List[str]) -> bool:
    """Summarise the transcript of each video and store the summaries on disk.

    Transcripts are loaded with ``YoutubeTranscriptReader``, each one is passed
    through ``summarize_text``, and the resulting ``doc_id``/``text`` rows are
    written to ``top_10_summaries.csv`` using ``|`` as the separator.

    Args:
        youtube_links: YouTube video URLs to summarise.

    Returns:
        Always ``True`` once the file has been written.
    """
    transcripts = YoutubeTranscriptReader().load_data(ytlinks=youtube_links)

    rows = []
    for transcript in transcripts:
        rows.append({'doc_id': transcript.doc_id, 'text': summarize_text(transcript.text)})

    pd.DataFrame(rows).to_csv("top_10_summaries.csv", index=False, sep="|")
    return True

# Agent tool wrapper around the transcript-summarising step.
summarize_youtube_video_tool = FunctionTool.from_defaults(
    description="Returns True if summaries are stored in file.",
    fn=youtube_links_to_summary,
)

In [62]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length 1-D vectors.

    Args:
        vec1: First vector (any sequence or array of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero and propagating NaN into later comparisons.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Cosine-similarity score above which a pair of summaries is labelled similar.
SIMILARITY_THRESHOLD = 0.3


def get_top_n_similar_videos(top_n: int) -> List[dict]:
    """Rank every pair of summarised videos by embedding similarity.

    Reads the ``|``-separated summaries from ``top_10_summaries.csv``, embeds
    each summary with ``local_embed_model`` and scores every unordered pair
    with ``cosine_similarity``. All pairs, together with an ``is_similar``
    flag (score above ``SIMILARITY_THRESHOLD``), are written to
    ``similarity_mat.csv``.

    Args:
        top_n: Number of highest-scoring pairs to return. Values below 1
            yield an empty list.

    Returns:
        A list of dicts with keys ``video1``, ``video2`` (watch URLs) and
        ``similarity``, ordered from most to least similar.
    """
    # Force doc_id to str so an all-digit video ID is not parsed as a number.
    summaries = pd.read_csv("top_10_summaries.csv", sep="|", dtype={'doc_id': str})
    documents = [
        Document(
            doc_id=row.doc_id,
            text=row.text,
            metadata={'video_id': row.doc_id},
            embedding=local_embed_model.get_text_embedding(row.text),
        )
        for row in summaries.itertuples()
    ]

    # Score each unordered pair once, in the same (i < j) order as the manually
    # labelled combinations file so the two can be compared row by row.
    similarities = [
        {
            'v1_idx': i,
            'v2_idx': j,
            'score': cosine_similarity(documents[i].embedding, documents[j].embedding),
        }
        for i in range(len(documents))
        for j in range(i + 1, len(documents))
    ]

    # Explicit columns keep the frame well-formed when there are fewer than two summaries.
    pair_scores = pd.DataFrame(similarities, columns=['v1_idx', 'v2_idx', 'score'])
    pair_scores['is_similar'] = (pair_scores['score'] > SIMILARITY_THRESHOLD).astype(int)
    pair_scores.to_csv("similarity_mat.csv", index=False)

    # Clamp at zero: head() with a negative count would return "all but the last n" rows.
    ranked = pair_scores.sort_values('score', ascending=False).head(max(top_n, 0))
    most_similar_pairs = [{
            'video1': "https://www.youtube.com/watch?v="+documents[t.v1_idx].doc_id,
            'video2': "https://www.youtube.com/watch?v="+documents[t.v2_idx].doc_id,
            'similarity': t.score
    } for t in ranked.itertuples()]
    return most_similar_pairs

# Agent tool wrapper around the pairwise-similarity ranking step.
most_similar_videos_tool = FunctionTool.from_defaults(
    description="Returns list of pairs of youtube links with most similarity.",
    fn=get_top_n_similar_videos,
)

In [63]:
# ReAct agent that can summarise the videos and then rank their pairwise similarity.
agent_tools = [summarize_youtube_video_tool, most_similar_videos_tool]
agent = ReActAgent.from_tools(
    tools=agent_tools,
    llm=llm,
    context="Purpose: The primary role of this agent is to figure out which pairs of videos are similar to each other based on their summary.",
    verbose=True,
)

In [64]:
# Hand the saved video links to the agent and let it pick the tools to call.
similarity_request = f'Find similar pairs of videos based on summary from following list: {df.link.to_list()}'
answer = agent.query(similarity_request)

> Running step 5c8cc06b-721b-4516-991c-68f3c38f7e70. Step input: Find similar pairs of videos based on summary from following list: ['https://www.youtube.com/watch?v=07Pmjxhuo4k', 'https://www.youtube.com/watch?v=MHj0dksesho', 'https://www.youtube.com/watch?v=FZJ5w3xHrAc', 'https://www.youtube.com/watch?v=zmWcR-Mj2QQ', 'https://www.youtube.com/watch?v=TDq3-wmihw4', 'https://www.youtube.com/watch?v=CFQfKhVweBI', 'https://www.youtube.com/watch?v=tvGgt-a445g', 'https://www.youtube.com/watch?v=czrgVVRPzJY', 'https://www.youtube.com/watch?v=4lQAD07KRAg', 'https://www.youtube.com/watch?v=3vzagdNuOCo']
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: youtube_links_to_summary
Action Input: {'youtube_links': ['https://www.youtube.com/watch?v=07Pmjxhuo4k', 'https://www.youtube.com/watch?v=MHj0dksesho', 'https://www.youtube.com/watch?v=FZJ5w3xHrAc', 'https://www.youtube.com/watch?v=zmWcR-Mj2QQ', 'https://www.youtube

### Calculate accuracy of the agent's response

In [89]:
# Load the pair scores and is_similar predictions that get_top_n_similar_videos
# wrote to similarity_mat.csv.
agent_df = pd.read_csv("similarity_mat.csv")

In [90]:
from sklearn.metrics import confusion_matrix, classification_report

# Compare the hand-made labels with the agent's thresholded predictions, row by row.
manual_labels = comb_df['is_similar']
agent_labels = agent_df['is_similar']
confusion_matrix(y_true=manual_labels, y_pred=agent_labels)

array([[22,  1],
       [ 5, 17]], dtype=int64)

In [91]:
# Per-class precision / recall / F1 of the agent's predictions against the manual labels.
report = classification_report(y_true=comb_df['is_similar'], y_pred=agent_df['is_similar'])
print(report)

              precision    recall  f1-score   support

           0       0.81      0.96      0.88        23
           1       0.94      0.77      0.85        22

    accuracy                           0.87        45
   macro avg       0.88      0.86      0.86        45
weighted avg       0.88      0.87      0.87        45

