# QnA Bot from YouTube Playlist

This project enables users to generate transcripts for videos in a YouTube playlist, create embeddings, and ask questions about the video content.


## Features

- Extracts video links from a YouTube playlist.
- Generates transcripts for each video.
- Creates embeddings for the transcripts.
- Provides a Q&A bot to answer questions about the video content.

#### To run this code, follow these steps one by one:

### Step 1:
Install the necessary dependencies:

In [None]:
# Install the third-party packages used by the cells below (run once per environment).
# NOTE(review): `speechrecognition` is installed here but is not imported by any
# visible cell — confirm it is still needed.
!pip install chromadb
!pip install youtube_transcript_api
!pip install llama-index-llms-gemini
!pip install llama-index-vector-stores-chroma
!pip install llama-index-embeddings-huggingface
!pip install llama-index-readers-youtube-transcript
!pip install llama-index-core
!pip install google-api-python-client
!pip install speechrecognition
!pip install gradio
!pip install openai-whisper

Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.1-py3-none-any.whl.metadata (26 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.30.4-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.26.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_pro

Collecting llama-index-vector-stores-chroma
  Downloading llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl.metadata (705 bytes)
Downloading llama_index_vector_stores_chroma-0.1.10-py3-none-any.whl (5.0 kB)
Installing collected packages: llama-index-vector-stores-chroma
Successfully installed llama-index-vector-stores-chroma-0.1.10
Collecting llama-index-embeddings-huggingface
  Downloading llama_index_embeddings_huggingface-0.2.2-py3-none-any.whl.metadata (769 bytes)
Collecting sentence-transformers>=2.6.1 (from llama-index-embeddings-huggingface)
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting minijinja>=1.0 (from huggingface-hub[inference]>=0.19.0->llama-index-embeddings-huggingface)
  Downloading minijinja-2.0.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers>=2.6.1->llama-index-embeddings-huggingface)
  Using cached nvidia_

### Step 2:

Define a function that accepts a YouTube playlist URL and generates a list of individual YouTube video links from that playlist.

In [None]:
import googleapiclient.discovery
from urllib.parse import parse_qs, urlparse

def get_playlist_items(url, api_key="your_youtube_api_key"):
    """Return the watch URLs of every video in a YouTube playlist.

    Args:
        url: A YouTube URL whose query string carries the playlist ID in its
            ``list`` parameter.
        api_key: Developer key passed to the YouTube Data API v3 client. The
            default is a placeholder and must be replaced with a real key.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<video>&list=<playlist>``
        strings, in the order the API returned the playlist items.

    Raises:
        KeyError: If ``url`` has no ``list`` query parameter.
    """
    # Extract the playlist ID from the provided URL.
    query = parse_qs(urlparse(url).query, keep_blank_values=True)
    playlist_id = query["list"][0]

    # Build the YouTube API client using the API key.
    youtube = googleapiclient.discovery.build(
        "youtube", "v3", developerKey=api_key
    )

    # The API returns at most 50 items per page, so follow the pagination
    # until list_next() reports that there is no further page.
    request = youtube.playlistItems().list(
        part="snippet",
        playlistId=playlist_id,
        maxResults=50,
    )

    playlist_items = []
    while request is not None:
        response = request.execute()
        playlist_items.extend(response["items"])
        request = youtube.playlistItems().list_next(request, response)

    # Use the local playlist_id here: this is a plain function, so there is
    # no `self` to read the ID from.
    return [
        f'https://www.youtube.com/watch?v={item["snippet"]["resourceId"]["videoId"]}&list={playlist_id}'
        for item in playlist_items
    ]

### Step 3:

Define a function that accepts a YouTube video link and returns the transcript of the video.

In [None]:
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader

def generate_Transcript(url):
    """Load the transcript of one YouTube video.

    Args:
        url: Link to a single YouTube video.

    Returns:
        Whatever ``YoutubeTranscriptReader.load_data`` returns for a
        one-element list containing ``url``.
    """
    video_links = [url]
    return YoutubeTranscriptReader().load_data(ytlinks=video_links)

### Step 4:

Define a function that accepts a transcript, generates embeddings from it, and stores these embeddings in a ChromaDB database.

In [None]:
import chromadb
from llama_index.core import (
    Settings, StorageContext, VectorStoreIndex
)
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

def generate_embeddings(data):
    """Embed transcript documents and persist them in the local Chroma store.

    Args:
        data: Documents to index, passed straight to
            ``VectorStoreIndex.from_documents``.

    Returns:
        The ``VectorStoreIndex`` built over ``data``. Earlier versions returned
        ``None``; callers that ignore the return value are unaffected.

    Side effects:
        Overwrites the global ``Settings.llm`` and ``Settings.embed_model``,
        and writes to the ``quickstart`` collection under ``./chroma_db``.
    """
    # Set embedding and language models.
    embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
    llm = Gemini(api_key="your_gemini_api_key", model_name="models/gemini-pro")

    # Open the on-disk Chroma database; get_or_create lets repeated calls
    # (one per video) keep adding to the same collection.
    client = chromadb.PersistentClient(path='./chroma_db')
    chroma_collection = client.get_or_create_collection("quickstart")

    # Wrap the collection so the index stores its vectors there.
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Set global settings before the index is built.
    Settings.llm = llm
    Settings.embed_model = embedding_model

    # Create an index from the documents, backed by the Chroma vector store.
    return VectorStoreIndex.from_documents(
        data, storage_context=storage_context
    )

### Step 5:

Define a function to generate answers by querying the index with a given question.

In [None]:
from llama_index.core import Settings, VectorStoreIndex, StorageContext
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

def generate_answers(question):
    """Answer a question from the transcripts stored in the Chroma collection.

    Args:
        question: The question text handed to the query engine.

    Returns:
        The object returned by ``query_engine.query(question)``.

    Side effects:
        Overwrites the global ``Settings.llm`` and ``Settings.embed_model``.
    """
    # Initialize the HuggingFace embedding model. It uses the same model name
    # as generate_embeddings so query vectors match the stored vectors.
    embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

    # Initialize the Gemini language model.
    llm = Gemini(api_key="your_gemini_api_key", model_name="models/gemini-pro")

    # Set global settings.
    Settings.llm = llm
    Settings.embed_model = embedding_model

    # Open the persisted Chroma database and fetch the existing collection.
    # get_collection (not get_or_create) is used, so the collection must
    # already have been created by generate_embeddings.
    client = chromadb.PersistentClient(path='./chroma_db')
    chroma_collection = client.get_collection("quickstart")

    # Rebuild the index directly on top of the stored vectors.
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    query_engine = index.as_query_engine()
    return query_engine.query(question)

### Step 6:

Wire the functions defined above into a Gradio interface and launch it. Run the cells from the previous steps first, in order, so that every function is defined before the interface starts.

In [None]:
import gradio as gr
import whisper
import time

# Load the Whisper "base" model once when this cell runs; generate_voice_answer
# reuses this shared instance to transcribe every recorded question.
whisper_model = whisper.load_model("base")

# Function to process YouTube playlist and generate transcripts
def process_playlist(url, progress=gr.Progress()):
    """Transcribe and index every video of a YouTube playlist.

    Args:
        url: URL of the YouTube playlist to process.
        progress: Gradio progress tracker. The ``gr.Progress()`` default is
            Gradio's documented idiom: when this function is bound directly as
            an event handler, Gradio swaps in a live tracker. When called any
            other way, the default instance is simply used as-is.

    Returns:
        The status string ``"Processing Completed."``.
    """
    links = get_playlist_items(url)
    num_videos = len(links)

    for i, link in enumerate(links, start=1):
        transcript = generate_Transcript(link)
        generate_embeddings(transcript)

        # Report "i of num_videos" done. The count belongs here, not in the
        # gr.Progress constructor, whose only argument is the track_tqdm flag.
        progress((i, num_videos), desc=f"Processed video {i} of {num_videos}")

        # One-second pause between videos, kept from the original code.
        # NOTE(review): presumably throttles requests — confirm it is needed.
        time.sleep(1)

    return "Processing Completed."

# Function to generate answers from text input
def generate_text_answer(question):
    """Answer a typed question, or end the session on the word "exit".

    Args:
        question: Text entered by the user.

    Returns:
        ``"Session terminated."`` when the input equals "exit" (compared
        case-insensitively); otherwise the result of ``generate_answers``.
    """
    wants_exit = question.lower() == "exit"
    return "Session terminated." if wants_exit else generate_answers(question)

# Function to generate answers from voice input
def generate_voice_answer(audio):
    """Transcribe a recorded question with Whisper and answer it.

    Args:
        audio: Path to the recorded audio file, or ``None``/empty when nothing
            was recorded.

    Returns:
        A short notice when there is no audio or no recognizable speech,
        ``"Session terminated."`` when the spoken text is "exit", and
        otherwise the result of ``generate_answers`` for the transcription.
    """
    # Guard against "Ask (Voice)" being clicked without a recording;
    # whisper.load_audio cannot handle a missing path.
    if not audio:
        return "No audio recorded. Please record a question and try again."

    waveform = whisper.load_audio(audio)
    result = whisper_model.transcribe(waveform, verbose=True)
    question = result["text"].strip()

    if not question:
        return "Could not understand the audio. Please try again."

    # Transcriptions may carry end punctuation ("Exit."), so compare a
    # normalized copy rather than the raw text.
    if question.strip(".!?,").strip().lower() == "exit":
        return "Session terminated."

    # Pass the transcription with its original casing, as the text-input
    # path does.
    return generate_answers(question)

# Gradio interface
def gradio_interface():
    """Build the Gradio Blocks UI for the YouTube QnA bot.

    The layout has a header row (logo and title), a playlist row (URL box and
    "Process Playlist" button), a status box, an answer box, and two question
    rows: one for microphone input and one for typed input.

    Returns:
        The assembled ``gr.Blocks`` app. The caller is responsible for
        launching it.
    """
    theme = gr.themes.Default(
        primary_hue=gr.themes.colors.red,
        secondary_hue=gr.themes.colors.pink,
    )

    with gr.Blocks(theme=theme) as demo:
        with gr.Row():
            gr.Image(value="./utils/logo.jpeg", label="YouTube QnA Bot", type="filepath", scale=1)
            gr.Label("YouTube QnA Bot", scale=4)

        with gr.Row():
            url_input = gr.Textbox(label="Enter YouTube playlist URL", placeholder="YouTube playlist URL")
            process_button = gr.Button("Process Playlist", variant="primary")

        # Shows the status string returned by process_playlist.
        status_output = gr.Textbox(label="Status", placeholder="Processing status will appear here", lines=2)

        # Bind process_playlist directly as the handler instead of going
        # through a wrapper that only forwarded the URL.
        process_button.click(fn=process_playlist, inputs=url_input, outputs=status_output)

        with gr.Row():
            answer_output = gr.Textbox(label="Answer", placeholder="Answer will appear here", lines=5)

        with gr.Row():
            audio_input = gr.Audio(sources="microphone", type="filepath", scale=4)
            ask_voice_button = gr.Button("Ask (Voice)", scale=1, variant="primary")

        with gr.Row():
            question_text_input = gr.Textbox(label="Enter your question", placeholder="Ask your question here", scale=4)
            ask_text_button = gr.Button("Ask (Text)", scale=1, variant="primary")

        # Both question paths write into the same answer box.
        ask_text_button.click(generate_text_answer, inputs=question_text_input, outputs=answer_output)
        ask_voice_button.click(generate_voice_answer, inputs=audio_input, outputs=answer_output)

    return demo

# Build the Blocks app and start serving it.
# NOTE(review): share=True presumably asks Gradio for a public share link —
# confirm that exposing this app outside the local machine is intended.
gradio_interface().launch(share=True)