In [13]:
import re
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
from llama_index.core import VectorStoreIndex, Settings, load_index_from_storage, StorageContext, Document
from llama_index.core.llms import ChatMessage
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import CouldNotRetrieveTranscript

In [3]:
# Chat model used by summarize_text.
llm = Ollama(model="gemma", request_timeout=60.0)

# Embedding model used to vectorise the transcripts; it talks to the
# Ollama endpoint at base_url.
embed_model = OllamaEmbedding(
    base_url="http://localhost:11434",
    model_name="mxbai-embed-large",
    ollama_additional_kwargs={"mirostat": 0},
)

# Register both models on llama_index's global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

In [4]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the cosine is undefined, and ``0.0`` is returned instead
        of the NaN (plus RuntimeWarning) that a 0/0 division would produce.
    """
    vec1 = np.asarray(vec1, dtype=float)
    vec2 = np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        # At least one input is the zero vector: no direction to compare.
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [5]:
def get_youtube_videos(search_keyword: str, max_results: int = 10) -> list:
    """Search YouTube and return watch URLs of videos that have a transcript.

    The results page is fetched over HTTP and scanned for ``watch?v=<id>``
    occurrences. Ids are considered in page order; each distinct id is kept
    only if ``YouTubeTranscriptApi.get_transcript`` succeeds for it (the
    transcript itself is discarded here, the call is only an availability
    check).

    Args:
        search_keyword: Free-text search query.
        max_results: Maximum number of URLs to return (default 10).

    Returns:
        A list of at most ``max_results`` unique
        ``https://www.youtube.com/watch?v=<id>`` URLs.
    """
    # quote_plus turns spaces into '+' and also escapes '&', '#', non-ASCII, ...
    query = urllib.parse.quote_plus(search_keyword)
    search_url = "https://www.youtube.com/results?search_query=" + query
    # The context manager closes the HTTP response once the page is read.
    with urllib.request.urlopen(search_url) as response:
        page = response.read().decode()

    # Restrict the capture to the 11-character id alphabet so stray quotes
    # or backslashes from the page source cannot end up in an id.
    candidate_ids = re.findall(r"watch\?v=([A-Za-z0-9_-]{11})", page)

    checked_ids = set()  # every id already probed, with or without transcript
    unique_ids = []
    for video_id in candidate_ids:
        if len(unique_ids) >= max_results:
            break
        if video_id in checked_ids:
            # Skip repeats, including ids whose transcript lookup already failed.
            continue
        checked_ids.add(video_id)
        try:
            YouTubeTranscriptApi.get_transcript(video_id)
        except CouldNotRetrieveTranscript:
            continue
        unique_ids.append(video_id)

    return ["https://www.youtube.com/watch?v=" + video_id for video_id in unique_ids]

In [6]:
def summarize_text(text: str) -> str:
    """Ask the module-level ``llm`` for a concise summary of ``text``.

    Args:
        text: The text to summarise; it is embedded verbatim in the prompt.

    Returns:
        The content of the message returned by ``llm.chat``.
    """
    prompt = f"Summarize the following text in a concise way:\n\n{text}"

    response = llm.chat([
        ChatMessage(role="user", content=prompt)
    ])

    # The reply text lives on the response's message; the chat response
    # object itself does not expose a `.content` attribute.
    return response.message.content

In [7]:
# Look up videos for the query, then load one document per video link
# through the transcript reader.
loader = YoutubeTranscriptReader()
top_10_videos = get_youtube_videos("samsung galaxy s24 ultra")
documents = loader.load_data(ytlinks=top_10_videos)

In [10]:
# Persist each document's id and text to a pipe-separated CSV so the data can
# be reloaded without fetching it again. pandas is imported with the other
# dependencies in the import cell at the top of the notebook.
transcript_rows = [{'doc_id': doc.doc_id, 'text': doc.text} for doc in documents]
pd.DataFrame(transcript_rows).to_csv("top_10_summaries.csv", index=False, sep="|")

In [12]:
# Reload the pipe-separated dump of document ids and texts.
df = pd.read_csv(
    "top_10_summaries.csv",
    sep="|",
)

In [17]:
# Rebuild one Document per CSV row; the row's doc_id is used both as the
# document id and as the 'video_id' metadata entry.
new_documents = [
    Document(doc_id=row.doc_id, text=row.text, metadata={'video_id': row.doc_id})
    for row in df.itertuples()
]

In [19]:
# Compute an embedding of each document's text and store it on the document.
for document in documents:
    text_vector = embed_model.get_text_embedding(document.text)
    document.embedding = text_vector

In [48]:
# Pairwise cosine similarity between document embeddings. The matrix is sized
# from the actual number of documents rather than assuming exactly ten, so a
# shorter or longer result list neither leaves phantom rows nor overflows.
# Only the strict upper triangle (j > i) is filled; the diagonal and lower
# triangle stay 0, so every pair appears exactly once.
n_docs = len(documents)
sim_matrix = np.zeros(shape=(n_docs, n_docs))
for i in range(n_docs):
    for j in range(i + 1, n_docs):
        sim_matrix[i, j] = cosine_similarity(documents[i].embedding, documents[j].embedding)

In [49]:
# Display the pairwise similarity matrix as this cell's output.
sim_matrix

array([[0.        , 0.73586613, 0.46613015, 0.46254255, 0.50276201,
        0.43542267, 0.44174074, 0.58531675, 0.4279344 , 0.407983  ],
       [0.        , 0.        , 0.4860926 , 0.43430609, 0.55194662,
        0.42539777, 0.4352589 , 0.61435845, 0.45117613, 0.40781358],
       [0.        , 0.        , 0.        , 0.45185586, 0.41174182,
        0.50243459, 0.47889434, 0.42675861, 0.44447204, 0.46983459],
       [0.        , 0.        , 0.        , 0.        , 0.38851148,
        0.7167457 , 0.78844252, 0.40494019, 0.60124663, 0.7353023 ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.39361684, 0.4031558 , 0.68485239, 0.38440715, 0.37667721],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.84314474, 0.37441555, 0.56031629, 0.89880822],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.41566068, 0.58526229, 0.8912586 ],
       [0.        , 0.        , 0.       

In [50]:
# Flat positions of the four largest similarity values, ordered from the
# highest similarity to the lowest.
flat_sim_matrix = sim_matrix.flatten()
top4_unsorted = np.argpartition(flat_sim_matrix, -4)[-4:]  # top four, arbitrary order
ascending_order = np.argsort(flat_sim_matrix[top4_unsorted])
indices = top4_unsorted[ascending_order][::-1]

In [52]:
# Turn each selected flat index back into a (row, column) pair of document
# positions and record the two video URLs together with their similarity.
pair_positions = [np.unravel_index(flat_idx, sim_matrix.shape) for flat_idx in indices]
most_similar_pairs = [
    {
        'video1': "https://www.youtube.com/watch?v=" + documents[row].doc_id,
        'video2': "https://www.youtube.com/watch?v=" + documents[col].doc_id,
        'similarity': sim_matrix[row][col],
    }
    for row, col in pair_positions
]

In [53]:
# Display the selected video pairs and their similarity scores.
most_similar_pairs

[{'video1': 'https://www.youtube.com/watch?v=KQV7jcBcCvU',
  'video2': 'https://www.youtube.com/watch?v=Od9F_9guCrY',
  'similarity': 0.8988082210616966},
 {'video1': 'https://www.youtube.com/watch?v=CcrotrFYWc8',
  'video2': 'https://www.youtube.com/watch?v=Od9F_9guCrY',
  'similarity': 0.8912586011331761},
 {'video1': 'https://www.youtube.com/watch?v=KQV7jcBcCvU',
  'video2': 'https://www.youtube.com/watch?v=CcrotrFYWc8',
  'similarity': 0.8431447377059343},
 {'video1': 'https://www.youtube.com/watch?v=1Z1xsD3xxIU',
  'video2': 'https://www.youtube.com/watch?v=CcrotrFYWc8',
  'similarity': 0.7884425226777243}]

In [36]:
# (row, column) position of the single largest entry in the similarity matrix.
np.unravel_index(np.argmax(sim_matrix), sim_matrix.shape)

(5, 9)

In [37]:
# Derive the positions of the most similar pair from the matrix itself rather
# than hard-coding indices read off an earlier output, so the cell stays
# correct when the documents or their order change.
best_row, best_col = np.unravel_index(sim_matrix.argmax(), sim_matrix.shape)
documents[best_row].doc_id, documents[best_col].doc_id

('KQV7jcBcCvU', 'Od9F_9guCrY')

In [38]:
# most_similar_pairs is ordered from highest to lowest similarity, so its
# first entry is the best match. Reading it from there avoids depending on
# variables that no cell in this notebook defines.
best_pair = most_similar_pairs[0]
print([best_pair['video1'], best_pair['video2']])
print(best_pair['similarity'])

['https://www.youtube.com/watch?v=KQV7jcBcCvU', 'https://www.youtube.com/watch?v=Od9F_9guCrY']
0.8988082210616966
