# **RAG with Open Model**

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from youtube_transcript_api.formatters import TextFormatter

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

import re
from typing import Optional, List

In [None]:
def extract_video_id(url):
    """Extract the YouTube video ID from a URL.

    Two URL shapes are recognised:
        * watch URLs, where the ID is the ``v`` query parameter
          (``https://www.youtube.com/watch?v=VIDEO_ID``); the parameter
          may appear anywhere in the query string.
        * short links (``https://youtu.be/VIDEO_ID``).

    Args:
        url: The YouTube URL.

    Returns:
        The video ID, or None if not found.
    """
    # Require "?" or "&" before "v=" so that only a genuine ``v`` query
    # parameter matches; an unanchored "v=" would also match the tail of
    # parameters such as "rev=" or "env=" and return their value instead.
    match = re.search(r"[?&]v=([a-zA-Z0-9_-]+)", url)
    if match:
        return match.group(1)

    # Short URL: https://youtu.be/VIDEO_ID
    match = re.search(r"youtu\.be/([a-zA-Z0-9_-]+)", url)
    if match:
        return match.group(1)
    return None

In [None]:
# Sample inputs: a watch URL with an extra playlist parameter, a short
# youtu.be link, a string that is not a URL, and a watch URL with a channel
# parameter.
url1 = "https://www.youtube.com/watch?v=VMj-3S1tku0&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ"
url2 = "https://youtu.be/dQw4w9WgXcQ"
url3 = "invalid-url"
url4 = "https://www.youtube.com/watch?v=aj_OGw-CQeA&ab_channel=Benjamin-DerDeutschlehrer"

video_id1 = extract_video_id(url1)
video_id2 = extract_video_id(url2)
video_id3 = extract_video_id(url3)
video_id4 = extract_video_id(url4)

print(f"Video ID from url1: {video_id1}")
print(f"Video ID from url2: {video_id2}")
print(f"Video ID from url3: {video_id3}")
# The label previously said "url3" for the fourth URL.
print(f"Video ID from url4: {video_id4}")

In [None]:
def get_transcript(video_id: str, languages: Optional[List[str]] = None):
    """
    Fetch the transcript of a YouTube video and print it, first as the raw
    transcript data and then flattened to plain text.

    Nothing is returned; failures are reported by printing a message.

    Args:
        video_id: The ID of the YouTube video.
        languages: A list of language codes to prioritize (e.g., ["en", "de", "hi"]).
            If None, no language preference is passed and the library's own
            default applies.
    """
    # Only forward ``languages`` when the caller supplied it: the library
    # iterates over the value, so passing None through makes the call fail
    # instead of falling back to its default.
    fetch_kwargs = {} if languages is None else {"languages": languages}

    try:
        # Get the transcript as a list of dictionaries
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, **fetch_kwargs)

        # Print the raw transcript data (list of dictionaries)
        print("Raw Transcript Data:")
        print(transcript_list)

        # Flatten the transcript to plain text and print it
        transcript_text = " ".join(chunk["text"] for chunk in transcript_list)
        print("\nPlain Text Transcript:")
        print(transcript_text)

    except TranscriptsDisabled:
        print("No captions available for this video.")
    except Exception as e:
        # Notebook-level boundary: report any other failure instead of raising.
        print(f"An error occurred: {e}")

In [None]:
# Language priority: English, then German. The German code is "de" ("ge" is
# not a German language code), matching the other calls in this notebook.
get_transcript(video_id1, languages=["en", "de"])

In [None]:
# Print the transcript for the video ID extracted from url4, with language
# priority English, then German.
get_transcript(video_id4, languages=["en", "de"])

In [None]:
def extract_transcript_from_url(
    youtube_url: str, languages: Optional[List[str]] = None
) -> Optional[str]:
    """
    Extract the plain-text transcript of a YouTube video from its URL.

    The video ID is taken from the ``v`` query parameter of a watch URL or
    from the path of a ``youtu.be`` short link, then the transcript is
    fetched and its segments are joined with single spaces.

    Args:
        youtube_url: The full YouTube video URL.
        languages: A list of language codes to prioritize (e.g., ["en", "de", "hi"]).
            If None, no language preference is passed and the library's own
            default applies.

    Returns:
        The transcript text as a string, or None on error (an explanatory
        message is printed in that case).
    """
    # Require "?" or "&" before "v=" so only a genuine ``v`` query parameter
    # matches; an unanchored "v=" also matches inside "rev=", "env=", etc.
    id_match = re.search(r"[?&]v=([a-zA-Z0-9_-]+)", youtube_url) or re.search(
        r"youtu\.be/([a-zA-Z0-9_-]+)", youtube_url
    )
    if id_match is None:
        print(f"Error: Could not extract video ID from URL: {youtube_url}")
        return None
    video_id = id_match.group(1)

    # Only forward ``languages`` when supplied: the library iterates over the
    # value, so passing None through makes the call fail.
    fetch_kwargs = {} if languages is None else {"languages": languages}

    try:
        # Get the transcript as a list of dictionaries
        transcript_list = YouTubeTranscriptApi.get_transcript(video_id, **fetch_kwargs)

        # Flatten the transcript to plain text and return it
        return " ".join(chunk["text"] for chunk in transcript_list)
    except TranscriptsDisabled:
        print("No captions available for this video.")
        return None
    except Exception as e:
        # Report any other failure and signal it with None, as documented.
        print(f"An error occurred: {e}")
        return None


In [None]:
# Transcript language priority: German first, then English.
languages = ["de", "en"]

In [None]:
# Reuse the shared `languages` priority list instead of repeating the literal,
# so both fetches stay in sync with it.
transcript = extract_transcript_from_url(url1, languages)
transcript_1 = extract_transcript_from_url(url4, languages)

In [None]:
# Show the transcript text fetched for url1 (None if extraction failed).
transcript

# **Step 1: Split the documents into chunks**

In [None]:
# extract_transcript_from_url returns None on failure; stop here with a clear
# message rather than passing None into the splitter.
if transcript is None:
    raise ValueError(
        "No transcript available to split; check the URL and caption languages."
    )

# Split the transcript into chunks of up to 1000 characters with a
# 200-character overlap between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [None]:
# Inspect the first chunk produced by the splitter.
chunks[0]

# **Step 2: Encoding**

In [None]:
# Pre-trained sentence-transformers model used to embed chunks and queries
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model options: run computations on the CPU
model_kwargs = dict(device="cpu")

# Encoding options: leave the embeddings un-normalized
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper configured with the model name and both option sets
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)

In [None]:
# Embed a sample sentence to check that the embedding model works.
text = "This is test docuement."
query_result = embeddings.embed_query(text)

# Length of the returned embedding
len(query_result)

# **Step 3: Database**

In [None]:
# Build a FAISS vector store from the transcript chunks, using `embeddings`
# as the embedding function.
db = FAISS.from_documents(chunks, embedding=embeddings)

In [None]:
question = "What is Micrograd?"

# Query the vector store directly with the question
searchDocs = db.similarity_search(question)

# Show the text of the first returned document
print(searchDocs[0].page_content)

In [None]:
# Show the store's mapping from index position to docstore ID.
db.index_to_docstore_id

In [None]:
# Docstore IDs are generated when the index is built, so an ID copied from an
# earlier run will not exist in this one. Look up the first ID stored in the
# current index instead (an empty index yields an empty lookup).
db.get_by_ids(list(db.index_to_docstore_id.values())[:1])

## **Retrieval**

In [None]:
# Similarity-search retriever over the vector store, returning 4 documents
retriever = db.as_retriever(
    search_kwargs={"k": 4},
    search_type="similarity",
)

In [None]:
# Show the documents the retriever returns for the current question.
retriever.invoke(question)

## **Augmentation**

In [None]:
# Prompt text with two placeholders: {context} for the retrieved transcript
# excerpts and {question} for the user's question.
template="""
      You are a helpful assistant.
      Answer ONLY from the following transcript context.
      If the context is insufficient to answer the question, just say "I don't know."
      If possible, cite the specific part of the context that supports your answer.

      Transcript Context:
      {context}

      Question: {question}

      Answer:
    """

In [None]:
# Wrap the template text in a PromptTemplate that takes `context` and `question`
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [None]:
# Replace the earlier question, then fetch the documents the retriever
# returns for it.
question          = "Is the topic of backward propagation discussed in this video? if yes then what was discussed"
retrieved_docs    = retriever.invoke(question)

In [None]:
# Inspect the retrieved documents.
retrieved_docs

In [None]:
# Concatenate the text of the retrieved documents, separated by blank lines
page_texts = (doc.page_content for doc in retrieved_docs)
context = "\n\n".join(page_texts)
context

In [None]:
# Fill the prompt's {context} and {question} placeholders.
final_prompt = prompt.invoke({"context": context, "question": question})

In [None]:
# Inspect the filled-in prompt.
final_prompt

# **Step 4: LLM**

![](https://miro.medium.com/v2/resize:fit:720/format:webp/1*tdPlvSI-dctBCbbRdG26jA.png)

In [None]:
# Text-generation pipeline for microsoft/phi-2, limited to 200 new tokens
llm_pipline = pipeline(
    task="text-generation",
    max_new_tokens=200,
    model="microsoft/phi-2",
)

In [None]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline.
llm = HuggingFacePipeline(pipeline=llm_pipline)

In [None]:
# Send the augmented prompt (instructions + retrieved context + question) to
# the model. Passing the bare question would bypass the retrieved context.
answer = llm.invoke(final_prompt)
print(answer)

# **Building a Chain**

In [None]:
def format_docs(retrieved_docs):
    """Join the ``page_content`` of each document, separated by blank lines.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the documents' texts joined by ``"\\n\\n"``.
    """
    page_texts = [doc.page_content for doc in retrieved_docs]
    return "\n\n".join(page_texts)

In [None]:
# Two branches fed by the same input: `context` runs the retriever and formats
# its documents into one string; `question` passes the input through unchanged.
context_branch = retriever | RunnableLambda(format_docs)
parallel_chain = RunnableParallel(
    context=context_branch,
    question=RunnablePassthrough(),
)

In [None]:
# Run the parallel step on a sample question to inspect its output.
parallel_chain.invoke('What is Micro Grad')

In [None]:
# Output parser used as the final step of the chain.
parser = StrOutputParser()

In [None]:
# Full pipeline: build context + question, fill the prompt, call the LLM,
# then parse the output.
main_chain = parallel_chain | prompt | llm | parser

In [None]:
# Ask a question through the full chain.
ans = main_chain.invoke('What is micrograd')

In [None]:
# Show the chain's answer.
ans