In [None]:
import getpass
import os

# Never store a real key in the notebook itself. Reuse OPENAI_API_KEY when the
# environment already provides it (so an existing key is not overwritten);
# otherwise ask for it interactively without echoing it to the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

## Install libraries

In [None]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

## Step 1a - Indexing (Document Ingestion)

In [None]:
!pip install yt-dlp langchain

from langchain.text_splitter import RecursiveCharacterTextSplitter
import re
import glob

video_url = "https://www.youtube.com/watch?v=Gfr50f6ZBvo"

# Download subtitles in SRT format (no video)
!yt-dlp --skip-download --write-auto-sub --sub-lang en --convert-subs srt "{video_url}" -o "subs.%(ext)s"

# Detect the .srt file automatically.
# The yt-dlp call above writes to "subs.%(ext)s", so only files with the "subs"
# stem are candidates; a bare "*.srt" would also match unrelated subtitle files
# in the working directory. Sorting makes the pick deterministic, because
# glob returns matches in arbitrary order.
srt_files = sorted(glob.glob("subs*.srt"))
if not srt_files:
    raise FileNotFoundError("No SRT file found. Check if the video has English subtitles.")
srt_file = srt_files[0]
print(f"Found subtitle file: {srt_file}")

# Function to strip timestamps and numbers from SRT
def srt_to_transcript(srt_path):
    """Convert an SRT subtitle file into a single plain-text transcript string.

    Cue indexes and timestamp lines are removed, blank lines are dropped, and
    the remaining caption lines are joined with single spaces.

    Auto-generated captions repeat the same line across consecutive cues
    (each cue re-shows the previous line before adding the next one), so a
    naive flattening yields every phrase two or three times. To avoid that,
    a caption line identical to the previously kept line is skipped.

    Args:
        srt_path: Path to a UTF-8 encoded .srt file.

    Returns:
        The transcript as one string, or "" if the file has no caption text.
    """
    with open(srt_path, "r", encoding="utf-8") as f:
        srt_content = f.read()

    # Remove numeric indexes and timestamps
    text = re.sub(r"\d+\n\d{2}:\d{2}:\d{2},\d{3} --> .*", "", srt_content)

    caption_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            # Blank separators and whitespace-only filler cues carry no text.
            continue
        if caption_lines and caption_lines[-1] == line:
            # Same line carried over from the previous cue. This also merges a
            # phrase that is genuinely spoken twice in a row on separate lines.
            continue
        caption_lines.append(line)

    # Merge lines into one paragraph
    return " ".join(caption_lines)

# Flatten the detected subtitle file into one plain-text string.
transcript = srt_to_transcript(srt_file)

# Split into chunks for your pipeline.
# chunk_size=1000 with chunk_overlap=200 — presumably measured in characters
# (the splitter's default length function); TODO confirm against the library docs.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

# Quick sanity check of the split.
# NOTE(review): chunks[0] will fail if no chunks were produced (e.g. empty transcript).
print(f"Total chunks: {len(chunks)}")
print(f"First chunk:\n{chunks[0].page_content}")


[youtube] Extracting URL: https://www.youtube.com/watch?v=Gfr50f6ZBvo
[youtube] Gfr50f6ZBvo: Downloading webpage
[youtube] Gfr50f6ZBvo: Downloading tv client config
[youtube] Gfr50f6ZBvo: Downloading tv player API JSON
[youtube] Gfr50f6ZBvo: Downloading ios player API JSON
[youtube] Gfr50f6ZBvo: Downloading m3u8 information
[info] Gfr50f6ZBvo: Downloading subtitles: en
[info] Testing format 616
[info] Gfr50f6ZBvo: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: subs.en.vtt
[download] Destination: subs.en.vtt
[K[download] 100% of    1.22MiB in [1;37m00:00:00[0m at [0;32m5.04MiB/s[0m
[SubtitlesConvertor] Converting subtitles
Deleting original file subs.en.vtt (pass -k to keep)
Found subtitle file: subs.en.srt
Total chunks: 512
First chunk:
the following is a conversation with the following is a conversation with   the following is a conversation with demus hasabis demus hasabis   demus hasabis ceo and co-founder of deepmind ceo and co-founder of deepmind   ceo an

## Step 1b - Indexing (Text Splitting)

In [None]:
len(chunks)

512

In [None]:
chunks[500]

Document(metadata={}, page_content="to me it sure as heck sounds suspicious to me it sure as heck sounds   suspicious to me it sure as heck sounds this puzzle sure sounds like something this puzzle sure sounds like something   this puzzle sure sounds like something we talked about earlier what it takes to we talked about earlier what it takes to   we talked about earlier what it takes to to design a game to design a game   to design a game that's really fun to play for prolonged that's really fun to play for prolonged   that's really fun to play for prolonged periods of time periods of time   periods of time and it does seem like this puzzle like and it does seem like this puzzle like   and it does seem like this puzzle like you mentioned the more you learn about you mentioned the more you learn about   you mentioned the more you learn about it the more you realize how little you it the more you realize how little you   it the more you realize how little you know know   know so it humb

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [None]:
# Embedding model used for the transcript chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Build the FAISS vector store from the chunk documents using that embedding model.
vector_store = FAISS.from_documents(chunks, embeddings)

In [None]:
vector_store.index_to_docstore_id

In [None]:
vector_store.get_by_ids(['18b5fb9e-3d04-462e-92ab-8f59545fb28b'])

[Document(id='18b5fb9e-3d04-462e-92ab-8f59545fb28b', metadata={}, page_content="to me it sure as heck sounds suspicious to me it sure as heck sounds   suspicious to me it sure as heck sounds this puzzle sure sounds like something this puzzle sure sounds like something   this puzzle sure sounds like something we talked about earlier what it takes to we talked about earlier what it takes to   we talked about earlier what it takes to to design a game to design a game   to design a game that's really fun to play for prolonged that's really fun to play for prolonged   that's really fun to play for prolonged periods of time periods of time   periods of time and it does seem like this puzzle like and it does seem like this puzzle like   and it does seem like this puzzle like you mentioned the more you learn about you mentioned the more you learn about   you mentioned the more you learn about it the more you realize how little you it the more you realize how little you   it the more you realiz

## Step 2 - Retrieval

In [None]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [None]:
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7facd1278a50>, search_kwargs={'k': 4})

In [None]:
retriever.invoke('What is deepmind')

[Document(id='ff7ff45b-6b59-48cd-a91a-ff28651a8bab', metadata={}, page_content='the following is a conversation with the following is a conversation with   the following is a conversation with demus hasabis demus hasabis   demus hasabis ceo and co-founder of deepmind ceo and co-founder of deepmind   ceo and co-founder of deepmind a company that has published and builds a company that has published and builds   a company that has published and builds some of the most incredible artificial some of the most incredible artificial   some of the most incredible artificial intelligence systems in the history of intelligence systems in the history of   intelligence systems in the history of computing including alfred zero that computing including alfred zero that   computing including alfred zero that learned learned   learned all by itself to play the game of gold all by itself to play the game of gold   all by itself to play the game of gold better than any human in the world and better than

## Step 3 - Augmentation

In [None]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

In [None]:
# Prompt for the answering step. It has two placeholders:
#   {context}  - the retrieved transcript text the model must answer from
#   {question} - the user's question
# The instructions tell the model to answer only from the context and to say
# it doesn't know when the context is insufficient.
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [None]:
question          = "is the topic of nuclear fusion discussed in this video? if yes then what was discussed"
retrieved_docs    = retriever.invoke(question)

In [None]:
retrieved_docs

[Document(id='19c51167-6ded-460d-92b4-d18d750adfb1', metadata={}, page_content="yeah contain it and hold it in structure   yeah contain it and hold it in structure and there's different shapes that are and there's different shapes that are   and there's different shapes that are better for for the energy productions better for for the energy productions   better for for the energy productions called droplets and and and so on so um called droplets and and and so on so um   called droplets and and and so on so um so that was huge and now we're looking so that was huge and now we're looking   so that was huge and now we're looking we're talking to lots of fusion startups we're talking to lots of fusion startups   we're talking to lots of fusion startups to see what's the next problem we can to see what's the next problem we can   to see what's the next problem we can tackle uh in the fusion area tackle uh in the fusion area   tackle uh in the fusion area so another fascinating place so a

In [None]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"yeah contain it and hold it in structure   yeah contain it and hold it in structure and there's different shapes that are and there's different shapes that are   and there's different shapes that are better for for the energy productions better for for the energy productions   better for for the energy productions called droplets and and and so on so um called droplets and and and so on so um   called droplets and and and so on so um so that was huge and now we're looking so that was huge and now we're looking   so that was huge and now we're looking we're talking to lots of fusion startups we're talking to lots of fusion startups   we're talking to lots of fusion startups to see what's the next problem we can to see what's the next problem we can   to see what's the next problem we can tackle uh in the fusion area tackle uh in the fusion area   tackle uh in the fusion area so another fascinating place so another fascinating place   so another fascinating place in a paper title\n\nwha

In [None]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [None]:
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      yeah contain it and hold it in structure   yeah contain it and hold it in structure and there's different shapes that are and there's different shapes that are   and there's different shapes that are better for for the energy productions better for for the energy productions   better for for the energy productions called droplets and and and so on so um called droplets and and and so on so um   called droplets and and and so on so um so that was huge and now we're looking so that was huge and now we're looking   so that was huge and now we're looking we're talking to lots of fusion startups we're talking to lots of fusion startups   we're talking to lots of fusion startups to see what's the next problem we can to see what's the next problem we can   to see what's the next problem we can tackle uh i

## Step 4 - Generation

In [None]:
answer = llm.invoke(final_prompt)
print(answer.content)

Yes, the topic of nuclear fusion is discussed in this video. The discussion includes the containment and shaping of plasma for energy production, the challenges associated with material science and engineering in building fusion reactors, and the collaboration with domain experts to tackle problems in the fusion area. Additionally, there is mention of a nature paper published regarding the control of plasma shapes and the use of a test reactor for experiments related to fusion.


## Building a Chain

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Each document's ``page_content`` is taken in order, and consecutive
    documents are separated by a blank line.
    """
    page_texts = [document.page_content for document in retrieved_docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [None]:
# Build both prompt inputs from the single question string passed to invoke():
#   'context'  -> run the retriever on the question, then join the hits into
#                 one string with format_docs
#   'question' -> RunnablePassthrough (presumably forwards the input unchanged —
#                 confirm against the library docs)
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [None]:
parallel_chain.invoke('who is Demis')

{'context': "the following is a conversation with the following is a conversation with   the following is a conversation with demus hasabis demus hasabis   demus hasabis ceo and co-founder of deepmind ceo and co-founder of deepmind   ceo and co-founder of deepmind a company that has published and builds a company that has published and builds   a company that has published and builds some of the most incredible artificial some of the most incredible artificial   some of the most incredible artificial intelligence systems in the history of intelligence systems in the history of   intelligence systems in the history of computing including alfred zero that computing including alfred zero that   computing including alfred zero that learned learned   learned all by itself to play the game of gold all by itself to play the game of gold   all by itself to play the game of gold better than any human in the world and better than any human in the world and   better than any human in the world an

In [None]:
parser = StrOutputParser()

In [None]:
# End-to-end pipeline: build the {context, question} inputs, fill the prompt,
# call the chat model, then run the reply through the StrOutputParser.
main_chain = parallel_chain | prompt | llm | parser

In [None]:
main_chain.invoke('Can you summarize the video')

'The video discusses the use of quantum chemistry and computational data to improve the efficiency of learning functionals. It touches on the potential for simulating complex biological systems, including the human brain and body, through multi-protein interaction systems. Additionally, there is a brief tangent about impressive AI in game design and the challenges of creating such AI systems. The overarching theme is about utilizing advanced computational methods for significant scientific and technological advancements.'