# QnA Bot from YouTube Playlist

This project enables users to generate transcripts for videos in a YouTube playlist, create embeddings, and ask questions about the video content.


## Features

- Extracts video links from a YouTube playlist.
- Generates transcripts for each video.
- Creates embeddings for the transcripts.
- Provides a Q&A bot to answer questions about the video content.

#### To run this code, follow these steps in order:

### Step 1:
Install the necessary dependencies:

In [None]:
!pip install youtube_transcript_api
!pip install llama-index-llms-gemini
!pip install llama-index-vector-stores-chroma
!pip install llama-index-embeddings-huggingface
!pip install llama-index-core
!pip install google-api-python-client
!pip install llama-hub-youtube-transcript
!pip install llama-index-readers-youtube-transcript

### Step 2:

Create a class and module that accepts a YouTube playlist URL and generates a list of individual YouTube video links from that playlist.

In [None]:
# Import necessary classes from different modules
import googleapiclient.discovery
from urllib.parse import parse_qs, urlparse
import os
from dotenv import load_dotenv

class YouTubePlaylist:
    """Resolve a YouTube playlist URL into watch links for each of its videos.

    The playlist ID is taken from the ``list`` query parameter of the URL and
    the YouTube Data API v3 client is built with the key stored in the
    ``YOUTUBE_API_KEY`` environment variable (loaded from a ``.env`` file).
    """

    def __init__(self, url):
        """Parse the playlist ID out of *url* and build the API client.

        Args:
            url: A YouTube URL carrying a ``list=<playlist id>`` query
                parameter.

        Raises:
            ValueError: If *url* has no non-empty ``list`` parameter, or if
                ``YOUTUBE_API_KEY`` is not set.
        """
        # Extract the playlist ID from the provided URL. Validate it before
        # touching the environment or the network so bad input fails fast.
        query = parse_qs(urlparse(url).query, keep_blank_values=True)
        playlist_ids = query.get("list")
        if not playlist_ids or not playlist_ids[0]:
            raise ValueError(
                f"No playlist ID ('list' parameter) found in URL: {url!r}"
            )
        self.playlist_id = playlist_ids[0]

        # Load the API key from the .env file
        load_dotenv()
        self.api_key = os.getenv("YOUTUBE_API_KEY")
        if not self.api_key:
            raise ValueError("API key for YouTube Data API is not set.")

        # Build the YouTube API client using the API key
        self.youtube = googleapiclient.discovery.build(
            "youtube", "v3", developerKey=self.api_key
        )

    def get_playlist_items(self):
        """Return one watch URL per item in the playlist.

        Pages through ``playlistItems().list`` until ``list_next`` yields no
        further request, then formats each item's video ID into a link.

        Returns:
            A list of ``https://www.youtube.com/watch?v=<id>&list=<playlist>``
            strings, in the order the API returned the items.
        """
        playlist_api = self.youtube.playlistItems()

        # Create an API request to get playlist items
        request = playlist_api.list(
            part="snippet",
            playlistId=self.playlist_id,
            maxResults=50,  # Maximum number of results to return per request
        )

        playlist_items = []
        while request is not None:
            # Execute the request and collect this page's items
            response = request.execute()
            playlist_items.extend(response["items"])
            # Get the next page of results, if available
            request = playlist_api.list_next(request, response)

        # Extract the video links from the playlist items
        return [
            f'https://www.youtube.com/watch?v={item["snippet"]["resourceId"]["videoId"]}&list={self.playlist_id}'
            for item in playlist_items
        ]


### Step 3:

Create a class that accepts a YouTube video link and returns the transcript of the video.

In [None]:
# Import necessary classes from different modules
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader

class Transcript_Generator:
    """Fetch the transcript of a single YouTube video."""

    def __init__(self, url):
        """Prepare a transcript reader and remember the video URL."""
        # Reader used later to load the transcript
        self.loader = YoutubeTranscriptReader()
        # YouTube link this instance is responsible for
        self.url = url

    def generate_Transcript(self):
        """Return the documents the reader loads for the stored URL."""
        # load_data expects a list of links, so wrap the single URL
        video_links = [self.url]
        return self.loader.load_data(ytlinks=video_links)

### Step 4:

Create a class that accepts a transcript, generates embeddings from it, and stores these embeddings in a ChromaDB database.

In [None]:
# Import necessary classes from different modules
from dotenv import load_dotenv
import os
import chromadb
from llama_index.core import (
    Settings, StorageContext, VectorStoreIndex
)
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

class Generator:
    """Embed transcript documents and persist them in a ChromaDB collection.

    Embeddings come from the ``BAAI/bge-base-en-v1.5`` HuggingFace model and
    are written to the ``quickstart`` collection of the persistent Chroma
    store at ``./chroma_db``. The Gemini LLM and the embedding model are also
    installed as the global llama-index ``Settings``.
    """

    def __init__(self, data):
        """Set up the models, the Chroma collection and the storage context.

        Args:
            data: The documents to index (as returned by the transcript
                reader).

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is not set.
        """
        # Load environment variables
        load_dotenv()
        self.api_key = os.getenv('GEMINI_API_KEY')

        # Fail early with a clear message, matching Retriever's check
        if not self.api_key:
            raise ValueError("API key for Gemini model is not set.")

        # Set embedding and language models
        self.embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
        self.llm = Gemini(api_key=self.api_key, model_name="models/gemini-pro")

        # Load documents
        self.documents = data

        # Create a client and a new collection (reused if it already exists)
        self.client = chromadb.PersistentClient(path='./chroma_db')
        self.chroma_collection = self.client.get_or_create_collection("quickstart")

        # Create a vector store
        self.vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)

        # Create a storage context
        self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)

        # Set Global settings
        Settings.llm = self.llm
        Settings.embed_model = self.embedding_model

    def generate_embeddings(self):
        """Build an index from the documents, storing it via the Chroma store.

        Returns:
            The created ``VectorStoreIndex`` (also kept on ``self.index``), so
            callers that assign the result get the index rather than ``None``.
        """
        self.index = VectorStoreIndex.from_documents(
            self.documents, storage_context=self.storage_context
        )
        return self.index

### Step 5:

Create a class to generate answers by querying the index with a given question.

In [None]:
# Import necessary classes from different modules
from dotenv import load_dotenv
import os
from llama_index.core import Settings, VectorStoreIndex, StorageContext
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

class Retriever:
    """Answer questions against the index stored in the ChromaDB collection."""

    def __init__(self):
        """Load the API key, configure the models and open the stored index.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is not set.
        """
        # Pull GEMINI_API_KEY from the environment (.env is loaded first)
        load_dotenv()
        gemini_key = os.getenv('GEMINI_API_KEY')
        self.api_key = gemini_key

        if not gemini_key:
            raise ValueError("API key for Gemini model is not set.")

        # HuggingFace model used for embeddings, Gemini used as the LLM
        embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
        self.embedding_model = embed_model
        language_model = Gemini(api_key=gemini_key, model_name="models/gemini-pro")
        self.llm = language_model

        # Register both models as the global llama-index settings
        Settings.llm = language_model
        Settings.embed_model = embed_model

        # Open the persistent Chroma store and its existing collection
        chroma_client = chromadb.PersistentClient(path='./chroma_db')
        self.client = chroma_client
        collection = chroma_client.get_collection("quickstart")
        self.chroma_collection = collection

        # Wrap the collection as a vector store and build a storage context
        store = ChromaVectorStore(chroma_collection=collection)
        self.vector_store = store
        self.storage_context = StorageContext.from_defaults(vector_store=store)

        # Rebuild the index on top of the vector store
        self.index = VectorStoreIndex.from_vector_store(store)

    def generate_answers(self, question):
        """Query the index with *question* and return the engine's response."""
        return self.index.as_query_engine().query(question)

### Step 6:

Run each of the classes in the specified order to ensure proper initialization and functionality.

In [None]:
# Import necessary classes from different modules
from Links_Generator import YouTubePlaylist
from Transcript_Generator import Transcript_Generator
from generator import Generator
from retriever import Retriever

# Prompt the user to enter the YouTube playlist URL
url = input("Enter the YouTube playlist URL: ")

# Initialize the YouTubePlaylist object with the provided URL
yt_playlist = YouTubePlaylist(url)

# Get the list of video links from the playlist
links = yt_playlist.get_playlist_items()

# Collect the transcript documents of every video so they can be embedded in
# one pass; building a Generator per video would reload the embedding model
# and reopen the Chroma store on each iteration.
documents = []

# Loop through each video link in the playlist, numbering videos from 1
for video_number, link in enumerate(links, start=1):
    # Generate the transcript for the current video
    transcript = Transcript_Generator(link).generate_Transcript()

    # Print the video number and its transcript
    print(f"Video {video_number}:\n {transcript}")

    documents.extend(transcript)

# Generate and store embeddings for all collected transcripts
if documents:
    Generator(documents).generate_embeddings()

# Initialize the Retriever object
init_retriever = Retriever()

# Continuously prompt the user for questions until they type 'exit'
while True:
    # Prompt the user to ask a question
    question = input("Ask any question (type 'exit' to quit): ")

    # Check if the user wants to exit (ignoring case and stray whitespace)
    if question.strip().lower() == "exit":
        break

    # Generate an answer for the user's question
    answer = init_retriever.generate_answers(question)

    # Print the answer
    print(answer)