In [None]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "SECRET_KEY"

In [5]:
!pip install -q youtube-transcript-api langchain langchain_huggingface langchain-community faiss-cpu tiktoken python-dotenv

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled

## **Step 1a : Indexing (Document Ingestion)**

In [81]:
from logging import exception
video_id = "uD4izuDMUQA"

try:
   # fetch transcropt from youtube id
  fetch_transcript_list = YouTubeTranscriptApi().fetch(video_id=video_id, languages=["en"])

  transcript = ""

  for snippet in fetch_transcript_list:
    transcript = transcript + snippet.text + " "

  print(transcript)


except TranscriptsDisabled:
  print("no transcript availabale for this video")


Supported By: Supported By:
Protocol Labs Protocol Labs Protocol Labs
What does our future hold? Everything has its wonders, even darkness and silence... Everything has its wonders, even darkness and silence...
- Helen Keller What does the future look like? How will the universe meet its end? We may never be truly certain. But science has begun to paint a stunning picture of how the future might unfold. Let's take a journey to the end of time. We will travel through time exponentially, doubling our speed every 5 seconds. The vision of the future will surely evolve
as we probe for more clues. But one thing is clear: The universe has only just begun. 2019
[Anthrpocene era] 2020
[Anthrpocene era] 2021
[Anthrpocene era] 2021
The Holocene has ended.
[Anthrpocene era] 2021
The Holocene has ended.
[Anthrpocene era] 2022
The Holocene has ended.
[Anthrpocene era] 2023
The Holocene has ended.
[Anthrpocene era] 2023
[Anthrpocene era] What we do now, What we do now, What we do now, What we do now,

## **Step 1b - Indexing (Text Splitting)**

In [82]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size =500,
    chunk_overlap=100
)
chunks = splitter.create_documents([transcript])
print(len(chunks))

62


## **Step 1c & 1d - (Embeddings and vector Store)**


In [83]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vectorstore = FAISS.from_documents(
    chunks,
    embeddings
)

In [39]:
vectorstore.index_to_docstore_id

{0: '96109d9c-c067-4a85-8133-e0b695d8b8cc',
 1: '6ca85c87-3e65-4e38-b4a7-b27bbc805e28',
 2: '736df920-b8ab-4dd6-9bfe-3726756a4758',
 3: '673a6995-533a-4c4c-b4cb-2d95d2bae3d4',
 4: '65ec200e-538d-489a-8bf5-4978101d6b3b',
 5: '7d66045d-fec6-4cc2-ae03-657e161f6b8e',
 6: '92eac9c7-75a2-4c8b-9424-890d9a454dc3',
 7: '03bbb65c-273e-4574-9893-c71d91ada0ef',
 8: '3f3a024b-71bb-4cc7-90b4-774fa05bc7f5',
 9: 'c0015421-1f64-4bcc-a6a3-359613f39249',
 10: '1272898f-c029-4be7-8a8a-c5d17ad22da8',
 11: '2f9dc0fe-1f10-4678-a14f-14eb856f0440',
 12: '6bc0507a-4932-4c99-bed2-8c74a26c0ba7',
 13: 'f938b4a5-1d32-493d-a4ca-ea14f1fa6a3c',
 14: '1b00e2ec-bead-4c5b-aaac-1058d445a6c7',
 15: 'eaf8d7b4-9c74-47dd-942c-80f31f769359',
 16: '0107a8c8-0eab-4abc-923c-b1686445d9a8',
 17: '86b56fc2-732b-4f62-826c-f893e01c4918',
 18: '26ae2f78-7289-4b5a-9294-c247f22f4501',
 19: '63bf95fa-aeab-446b-bef8-4b9586d364ef',
 20: '6eb06147-6efe-4480-ab1a-4417051c36c3',
 21: '97d89262-624e-4524-a49f-39eaee00122c',
 22: 'ec017a11-ddf3-

In [43]:
vectorstore.get_by_ids(['dd773bb8-dd99-4e89-8105-260ff9b6aed4'])

<class 'langchain_community.vectorstores.faiss.FAISS'>


## **Step 2 - Rerieval**

In [84]:
retriever = vectorstore.as_retriever(
    serach_type="similarity",
    search_kwargs={"k": 4}
)
print(retriever)

tags=['FAISS', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7854da866240> search_kwargs={'k': 4}


In [55]:
retriever.invoke("how many amazon leadership principle are there")

[Document(id='96109d9c-c067-4a85-8133-e0b695d8b8cc', metadata={}, page_content="[Music] the leadership principles here at Amazon have arguably been the most important part of our being able to scale our culture and Company the first 29 years that we've been a company it's helped us grow to the number of people we have and the number of geographies that we're in in the number of diverse businesses that we're in and build together as just one culture and these leadership principles have been crafted very carefully over many years and you'll note the leadership principles have"),
 Document(id='e8c6f11d-143a-4a9a-9c3f-236a4e3a4109', metadata={}, page_content="and commitment to their employees personal success whether that be at Amazon or elsewhere this is one of our um newer two leadership principles and it's really a leadership principle that's Broad and could be interpreted lots of different ways and indeed sometimes people do interpret a lot of different ways you know the as a company w

## **Step - 3 Augmentation**

In [85]:
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="text-generation",
    temperature=0.7,
    max_new_tokens=256,
    huggingfacehub_api_token=hf_token
)

model = ChatHuggingFace(llm=llm)

In [86]:
prompt_template = PromptTemplate (
    template = '''
    You are a helpfull assistent.
    Answer Only from the provided transcript context.
    If the context is insufficient, just say you don't know.

    {context}
    Question : {question}
    ''',
    input_variables=['context', 'question']
)

In [65]:
question = "what is cat ?"

retrieved_docs = retriever.invoke(question)

context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

# print(context_text)

final_prompt = prompt_template.invoke({"context" : context_text, "question" : question})

print(final_prompt)

text="\n    You are a helpfull assistent.\n    Answer Only from the provided transcript context.\n    If the context is insufficient, just say you don't know.\n\n    it means when we're debating something my idea has to be one the one that carries the day because after all I I would be right a lot and the reality is what we're all trying to do is we're trying to get to the best possible answer for customers whoever's idea it is and so what what we need to do when we're thinking about a hard issue is we need to get the right people in a room the right people to give feedback it's often why as Leaders we speak last in the room we want everybody's input we\n\nin some cases it's been whole cloth invention you know pioneering AWS and cloud computing is a good example of that or pioneering a a digital book reader or a you know a a device that does natural language understanding and automatic speech recognition um like what we did with devices for Alexa is another example and in many other ca

## **Step 4 - Generation**

In [66]:
result = model.invoke(final_prompt)
print(result)

content=' I\'m sorry, but the context provided does not mention or discuss anything about a "cat." The text is about debating ideas, leading teams, learning, and inventing/reinventing businesses.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 43, 'prompt_tokens': 447, 'total_tokens': 490}, 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='run--c36fb075-a61e-4ce8-9a2e-5a5978742c8c-0' usage_metadata={'input_tokens': 447, 'output_tokens': 43, 'total_tokens': 490}


## **Building Chains**

In [67]:
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


In [70]:
parser = StrOutputParser()

In [73]:
def format_docs (retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [90]:
parallel_chain = RunnableParallel ({
    'context' : retriever | RunnableLambda(format_docs),
    'question' : RunnablePassthrough()
})

merge_chain = prompt_template | model | parser

main_chain = parallel_chain | merge_chain

result = main_chain.invoke("what is cat")

print(result)

 Based on the context provided, "cat" is not mentioned. The text discusses various concepts related to astronomy, including dark energy, the expansion of the universe, and the fate of stars.
