In [None]:
# importing necessary libraries
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters.sentence_transformers import SentenceTransformersTokenTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain_chroma import Chroma

In [2]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail early with an actionable message: assigning None into os.environ
    # would otherwise raise an opaque "str expected, not NoneType" TypeError.
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to your .env file or export it before running this notebook."
    )

# Mirror the key under GOOGLE_API_KEY; presumably this is the variable the
# Google GenAI clients created below look for (verify against the library docs).
os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY

In [4]:
# Gemini clients: an embedding model (handed to the vector store) and a chat
# model (used to answer questions).

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-pro",
)


In [None]:
# Chroma collection "test_db", persisted under ./test_db and embedded with
# the Gemini embedding model configured above.

vectorstore = Chroma(
    collection_name="test_db",
    persist_directory="./test_db",
    embedding_function=embedding_model,
)



In [None]:
# youtube videos to transcribe 

# https://www.youtube.com/watch?v=gM_7DbppaaI
# https://www.youtube.com/watch?v=-AuK92Jq4yQ

# The video ID is the value of the `v` query parameter, e.g. -AuK92Jq4yQ in the URL above.

# https://www.youtube.com/watch?v=_6BrRB8VCvo&t=319s
# https://www.youtube.com/watch?v=4mA54Uy3YGY


In [6]:
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID contained in ``url``.

    Supports the standard watch form (``https://www.youtube.com/watch?v=VIDEO_ID``,
    with ``v`` in any position among the query parameters) and short links
    (``https://youtu.be/VIDEO_ID``). Other query parameters and any
    ``#fragment`` are ignored.

    Args:
        url: A YouTube URL.

    Returns:
        The video ID string.

    Raises:
        ValueError: If no non-empty video ID can be found in ``url``.
    """
    # Local import keeps this cell runnable on its own on a fresh kernel.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url.strip())
    host = (parsed.hostname or "").lower()

    if host == "youtu.be":
        # Short links carry the ID as the first path segment.
        video_id = parsed.path.strip("/").split("/")[0]
    else:
        # parse_qs matches the exact parameter name "v" (a plain substring
        # split would also hit e.g. "rv=") and drops blank values.
        candidates = parse_qs(parsed.query).get("v", [])
        video_id = candidates[0] if candidates else ""

    if not video_id:
        raise ValueError("Invalid YouTube URL format. Expected something like: https://www.youtube.com/watch?v=VIDEO_ID")
    return video_id


In [22]:
# Video to index (the playlist/index parameters are ignored by the ID extraction).
url = "https://www.youtube.com/watch?v=lUe_RI_m-Vg&list=PL8dPuuaLjXtOAKed_MxxWBNaPno5h3Zs8&index=4"
yt_video_id = extract_video_id(url=url)

In [23]:
# Fetch the transcript for the selected video and format it as text.
ytt_api = YouTubeTranscriptApi()
fetched_transcript = ytt_api.fetch(yt_video_id)

text_formatter = TextFormatter()
formatted_transcript = text_formatter.format_transcript(fetched_transcript)
formatted_transcript

"As any teacher will tell you, when you’re\ndealing with certain elements that are being\nfeisty and fidgety and basically not cooperating,\nthere’s pretty much only one thing you can\ndo: you gotta keep ‘em separated.\nAnd there’s a whole system of biological\ntissue that’s dedicated to doing just that\n-- creating order where there would otherwise\nbe total mayhem.\nBecause you and pretty much every other animal\nis made up of incredibly complex, feisty, fidgety\nsystems that need to be kept apart to some\nextent if they’re going to get anything done.\nThink of it this way: Say all the middle-schoolers\nin your town wanted to have lunch together.\nAt the same time. On Taco Tuesday.\nIf you crammed everyone into one giant lunchroom,\nyou’d have a lot of interesting and talented\npeople in one place, yes, but you’d also\nnever get a handle on them with everyone shoved\nand talking, and jostling, and flirting, and farting, and\nand stepping on toes, and haggling over tater tots.\nIt’d b

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Wrap the whole transcript in one Document tagged with its video ID.
# Note: despite the plural name, `docs` is a single Document.
transcript_metadata = {"video_id": yt_video_id}
docs = Document(
    page_content=formatted_transcript,
    metadata=transcript_metadata,
)
docs

Document(metadata={'video_id': 'lUe_RI_m-Vg'}, page_content="As any teacher will tell you, when you’re\ndealing with certain elements that are being\nfeisty and fidgety and basically not cooperating,\nthere’s pretty much only one thing you can\ndo: you gotta keep ‘em separated.\nAnd there’s a whole system of biological\ntissue that’s dedicated to doing just that\n-- creating order where there would otherwise\nbe total mayhem.\nBecause you and pretty much every other animal\nis made up of incredibly complex, feisty, fidgety\nsystems that need to be kept apart to some\nextent if they’re going to get anything done.\nThink of it this way: Say all the middle-schoolers\nin your town wanted to have lunch together.\nAt the same time. On Taco Tuesday.\nIf you crammed everyone into one giant lunchroom,\nyou’d have a lot of interesting and talented\npeople in one place, yes, but you’d also\nnever get a handle on them with everyone shoved\nand talking, and jostling, and flirting, and farting, and\

In [19]:
# Preview the chunking: 600-character chunks sharing 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)
all_splits = text_splitter.split_documents([docs])
all_splits

[Document(metadata={'video_id': 'i5tR3csCWYo'}, page_content='Check out this amoeba.\nPretty nice. Kind of a rugged, no-frills life\nform.\nThe thing about amoebas is that they do everything\nin the same place. They take in and digest\ntheir food, and reject their waste, and get\nthrough everything else they need to do, all\nwithin a single cell.\nThey don’t need trillions of different cells\nworking together to keep them alive. They\ndon’t need a bunch of structures to keep\ntheir stomachs away from their hearts away\nfrom their lungs. They’re content to just\nblob around and live the simple life.\nBut we humans, along with the rest of the'),
 Document(metadata={'video_id': 'i5tR3csCWYo'}, page_content='blob around and live the simple life.\nBut we humans, along with the rest of the\nmulticellular animal kingdom, are substantially\nmore complex. We’re all about cell specialization,\nand compartmentalizing our bodies.\nEvery cell in your body has its own specific\njob description relat

In [20]:
def add_transcript_to_vectorstore(docs, video_id, chunk_size=600, chunk_overlap=100):
    """Chunk a transcript Document and add it to the global ``vectorstore`` once per video.

    Args:
        docs: A single Document holding the transcript (despite the plural
            name); it is wrapped in a list before splitting.
        video_id: YouTube video ID used as the de-duplication key.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        None. Prints whether the transcript was added or skipped.
    """
    # Existence check only: one matching record is enough, so avoid loading
    # the metadata of every stored chunk for this video.
    existing_docs = vectorstore.get(
        where={"video_id": video_id},
        limit=1,
        include=['metadatas'],
    )

    if existing_docs['ids']:
        print(f"Video {video_id} already exists in the collection.")
        return

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # chunk size (characters)
        chunk_overlap=chunk_overlap,  # chunk overlap (characters)
    )
    all_splits = text_splitter.split_documents([docs])

    if not all_splits:
        # Nothing to embed (e.g. empty transcript); don't report a false "Added".
        print(f"No transcript text to add for video {video_id}.")
        return

    # Stamp every chunk with the key queried by the existence check above, so
    # de-duplication keeps working even if the caller's metadata lacks it or
    # disagrees with `video_id`.
    for split in all_splits:
        split.metadata["video_id"] = video_id

    vectorstore.add_documents(all_splits)

    print(f"Added transcript for video {video_id} to the collection.")

In [25]:
# Index the current transcript (the call is a no-op if this video is already stored).
add_transcript_to_vectorstore(docs=docs, video_id=yt_video_id)

Added transcript for video lUe_RI_m-Vg to the collection.


In [None]:
# docs_to_delete = vectorstore.get(
#     where={"video_id": yt_video_id}
# )

In [None]:
# ids = docs_to_delete['ids']
# vectorstore.delete(ids=ids)

In [26]:
# Read back everything stored for this video: chunk texts, metadata and embeddings.
video_filter = {"video_id": yt_video_id}
retrieved_data = vectorstore.get(
    include=["embeddings", "metadatas", "documents"],
    where=video_filter,
)
retrieved_data

{'ids': ['b867110c-e24f-47b6-b2a3-57ad24e3cb9f',
  '35a1e96c-6087-44ca-9ce8-9f6caaeeedc9',
  'b1853b4a-69b5-4e83-ad58-02d56b678c4d',
  '0bcc573d-350e-45a2-901b-85afb6bb1d4f',
  '716e6af4-1bdc-4cfd-b596-5df749096b3c',
  'a2327daf-4bed-4cb3-9224-a7e60dd169f5',
  '252c6fbc-42d5-400e-9e33-586d2e8ba744',
  'fdc8aa73-0b6d-4ee2-9220-e9a6b3ddcf81',
  'ab95feab-2170-4ca9-9378-e32b50a6d004',
  '5bac68be-d2c8-4e15-a1e4-09f5c8b63591',
  'de400dfd-5236-493f-b05e-9d5d2b52a9ed',
  '4004839b-cd59-42e5-9f25-f32f7eeec968',
  'eb61afd7-0e34-4698-9cc1-310b4300cf21',
  '595ad680-d6f9-41f2-a18e-a7ddfc1d12ac',
  '940cc9a8-e4d6-461f-a35d-cf491462cad9',
  '19fce1e6-1cc0-4c67-a520-fb7ac8c21bc0',
  '16a845ed-b2ba-4e3f-a1e5-8d720f38d2da',
  '66ade472-3619-4232-9c36-3582156d9ab4',
  'f4cd6bcd-c597-400e-8b5f-7e07e46fabe0',
  '4143f135-9f13-4502-b59c-5009bb822e5c',
  '19b67e83-a32d-47dd-8b8a-ac5204556706',
  '00b7d928-7f46-4778-8bf5-4a463b698af4',
  'da45c4cf-510f-4578-93d3-a4fb5a38ebee',
  '3c4b1013-3787-4a0e-b505-

In [28]:
import pprint
# IDs of the stored chunks for this video.
chunk_ids = retrieved_data["ids"]
pprint.pprint(chunk_ids)

['b867110c-e24f-47b6-b2a3-57ad24e3cb9f',
 '35a1e96c-6087-44ca-9ce8-9f6caaeeedc9',
 'b1853b4a-69b5-4e83-ad58-02d56b678c4d',
 '0bcc573d-350e-45a2-901b-85afb6bb1d4f',
 '716e6af4-1bdc-4cfd-b596-5df749096b3c',
 'a2327daf-4bed-4cb3-9224-a7e60dd169f5',
 '252c6fbc-42d5-400e-9e33-586d2e8ba744',
 'fdc8aa73-0b6d-4ee2-9220-e9a6b3ddcf81',
 'ab95feab-2170-4ca9-9378-e32b50a6d004',
 '5bac68be-d2c8-4e15-a1e4-09f5c8b63591',
 'de400dfd-5236-493f-b05e-9d5d2b52a9ed',
 '4004839b-cd59-42e5-9f25-f32f7eeec968',
 'eb61afd7-0e34-4698-9cc1-310b4300cf21',
 '595ad680-d6f9-41f2-a18e-a7ddfc1d12ac',
 '940cc9a8-e4d6-461f-a35d-cf491462cad9',
 '19fce1e6-1cc0-4c67-a520-fb7ac8c21bc0',
 '16a845ed-b2ba-4e3f-a1e5-8d720f38d2da',
 '66ade472-3619-4232-9c36-3582156d9ab4',
 'f4cd6bcd-c597-400e-8b5f-7e07e46fabe0',
 '4143f135-9f13-4502-b59c-5009bb822e5c',
 '19b67e83-a32d-47dd-8b8a-ac5204556706',
 '00b7d928-7f46-4778-8bf5-4a463b698af4',
 'da45c4cf-510f-4578-93d3-a4fb5a38ebee',
 '3c4b1013-3787-4a0e-b505-a8d69306cca1']


In [31]:
# Text of the second stored chunk (index 1).
second_chunk_text = retrieved_data["documents"][1]
pprint.pprint(second_chunk_text)

('extent if they’re going to get anything done.\n'
 'Think of it this way: Say all the middle-schoolers\n'
 'in your town wanted to have lunch together.\n'
 'At the same time. On Taco Tuesday.\n'
 'If you crammed everyone into one giant lunchroom,\n'
 'you’d have a lot of interesting and talented\n'
 'people in one place, yes, but you’d also\n'
 'never get a handle on them with everyone shoved\n'
 'and talking, and jostling, and flirting, and farting, and\n'
 'and stepping on toes, and haggling over tater tots.\n'
 'It’d be like a John Hughes movie gone horribly\n'
 'wrong.\n'
 'So what you need is a solid system of organization')


In [32]:
# Metadata of the second stored chunk (index 1).
second_chunk_metadata = retrieved_data["metadatas"][1]
pprint.pprint(second_chunk_metadata)

{'video_id': 'lUe_RI_m-Vg'}


In [36]:
# Embedding vector of the second stored chunk (index 1).
second_chunk_embedding = retrieved_data["embeddings"][1]
pprint.pprint(second_chunk_embedding)

array([ 2.02767216e-02, -3.73526700e-02, -4.57797088e-02,  1.39983022e-03,
        4.26122360e-02,  5.33134327e-04, -1.96096525e-02, -1.66711975e-02,
        1.50419287e-02,  2.49343030e-02, -2.85947113e-03,  1.74481813e-02,
       -1.78330038e-02,  7.35757733e-03,  1.79889295e-02, -1.67594012e-02,
        2.37109270e-02, -1.86084863e-03,  1.45586412e-02, -3.12208477e-02,
       -2.71998961e-02,  2.01372635e-02,  1.47835566e-02, -2.33463962e-02,
        3.36043048e-03,  1.65260434e-02,  2.60578487e-02, -9.13023129e-02,
       -1.53100246e-03, -1.48780765e-02, -6.78033009e-03,  2.79478338e-02,
       -3.46727297e-02,  2.58520916e-02,  2.32001767e-02, -3.33851911e-02,
       -3.86952683e-02, -3.76281776e-02,  1.00376811e-02,  2.02598087e-02,
       -3.60782444e-02, -1.68623291e-02, -1.87314507e-02,  6.26044646e-02,
       -1.34810503e-03, -1.09708179e-02, -5.40625863e-02,  3.08256242e-02,
       -3.30382888e-03, -6.95325732e-02,  7.35818828e-03,  1.77978911e-02,
        4.97833565e-02, -

In [None]:
from langchain import hub

# Pull the "rlm/rag-prompt" prompt from the LangChain hub; it is invoked below
# with the keys "question" and "context".
prompt = hub.pull("rlm/rag-prompt")

def retrieve(query: str, k: int = 4, video_id: "str | None" = None):
    """Answer ``query`` from the transcript chunks most similar to it.

    Despite the name, this both retrieves context and generates the answer:
    it runs a similarity search on the global ``vectorstore``, joins the
    matching chunk texts, fills the global ``prompt`` and invokes the global ``llm``.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve. The default of 4 is intended to match
            the vector store's own default, keeping earlier behaviour.
        video_id: If given, only chunks whose ``video_id`` metadata equals this
            value are searched. ``None`` searches the whole collection, which
            may mix transcripts from different videos.

    Returns:
        The ``content`` of the model's response.
    """
    search_filter = {"video_id": video_id} if video_id is not None else None
    retrieved_docs = vectorstore.similarity_search(query, k=k, filter=search_filter)

    # Separate chunks with a blank line so the model can tell them apart.
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    messages = prompt.invoke({"question": query, "context": docs_content})
    response = llm.invoke(messages)
    return response.content




In [42]:
# Sample question answered from the indexed transcript.
query = "tell what kind of cell shape can you tell ?"
retrieve(query=query)

'There are three basic cell shapes: squamous, cuboidal, and columnar. Squamous cells are flat and scale-like, cuboidal cells are cube-ish, and columnar cells are tall and thick like columns.'