In [21]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv
!pip install -q langchain-groq
!pip install -q -U langchain langchain-community



In [22]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

In [23]:
video_id = "watch?v=EN7xrLlBu_k"#"Gfr50f6ZBvo" # only the ID, not full URL
try:
    # If you don’t care which language, this returns the “best” one
    transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

    # Flatten it to plain text
    transcript = " ".join(chunk["text"] for chunk in transcript_list)
    print(transcript)

except TranscriptsDisabled:
    print("No captions available for this video.")

VideoUnavailable: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=watch?v=EN7xrLlBu_k! This is most likely caused by:

The video is no longer available

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [None]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [None]:
#embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
#vector_store = FAISS.from_documents(chunks, embeddings)

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_core.embeddings import Embeddings
from typing import List
import numpy as np

# 1. Create a custom LangChain-compatible Embedding class for BGE
class BGEEmbeddings(Embeddings):
    def __init__(self):
        self.model = SentenceTransformer("BAAI/bge-small-en-v1.5")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        embeddings = self.model.encode(texts, normalize_embeddings=True)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        embedding = self.model.encode(text, normalize_embeddings=True)
        return embedding.tolist()

# 2. Initialize the BGE embedding object
embeddings = BGEEmbeddings()

# 3. Create FAISS vector store (assuming `chunks` is a list of LangChain Documents)
vector_store = FAISS.from_documents(
    documents=chunks,
    embedding=embeddings  # Pass the LangChain-compatible embedding object
)

# 4. Save/load the vector store (optional)
#vector_store.save_local("faiss_bge_index")
# loaded_store = FAISS.load_local("faiss_bge_index", embeddings=embeddings)

In [None]:
vector_store.index_to_docstore_id

In [None]:
#vector_store.get_by_ids(['99932a7a-a67a-486f-84e0-0759380d4130'])

In [None]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
retriever

In [None]:
retriever.invoke('What is deepmind')

In [None]:
from huggingface_hub import login
login(token="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")


In [None]:
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq

# Set up LLM using Groq
llm = ChatGroq(
    #groq_api_key="your_groq_api_key",
    #model_name="llama-4-scout-17b-16e"
    groq_api_key="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",
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

# Define prompt
prompt = PromptTemplate(
    template="""
You are a helpful assistant.
Answer ONLY from the provided transcript context.
If the context is insufficient, just say you don't know.

{context}
Question: {question}
""",
    input_variables=["context", "question"]
)

# Sample question and context
#question = "is the topic of nuclear fusion discussed in this video?"
#retrieved_docs = retriever.invoke(question)  # <-- your retrieval logic here
#context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

#final_prompt = prompt.invoke({"context": context_text, "question": question})
#answer = llm.invoke(final_prompt)

#print(answer.content)


In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [None]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [None]:
#parallel_chain.invoke('who is Demis')

In [None]:
parser = StrOutputParser()

In [None]:
main_chain = parallel_chain | prompt | llm | parser

In [None]:
main_chain.invoke('is the topic of nuclear fusion discussed in this video?')#Can you summarize the video')