<a href="https://colab.research.google.com/github/darinkist/Medium-Article-Transparent-Question-Answering-Bot/blob/main/CodeForArticleYouTubeExample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from langchain import OpenAI
import pandas as pd
from tqdm.notebook import tqdm
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import PromptTemplate
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import CharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi

# Expose the OpenAI API key through the process environment.
# Replace the placeholder with your own key before running the notebook.
os.environ.update({"OPENAI_API_KEY": "<YOUR-KEY>"})

In [None]:
# For debugging - in case you are interested in which prompts, and how many, are sent to OpenAI
# (a PromptLayer account is needed)
# import promptlayer
# from promptlayer.langchain.llms import OpenAI
# promptlayer.api_key = "<YOUR-KEY>"

# YouTube
## Option 1: YouTube transcripts without timestamps

In [None]:
# Video IDs whose transcripts are loaded; the trailing comments give each video's title.
yt_ids = [
    "OtD8wVaFm6E",  # XGBoost Part 1 (of 4): Regression
    "8b1JEDvenQU",  # XGBoost Part 2 (of 4): Classification
    "ZVFeW798-2I",  # XGBoost Part 3 (of 4): Mathematical Details
    "oRrKeUCEbq8",  # XGBoost Part 4 (of 4): Crazy Cool Optimizations
]

# The splitter settings are the same for every video, so it is built once
# up front instead of once per loop iteration.
splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=150, separator=" ")

# Collects the split transcript documents of all videos.
yt_docs = []

for yt_id in tqdm(yt_ids, desc="Retrieving transcripts"):
    # add_video_info=True is presumably what supplies the "title" and "author"
    # metadata that the following cell reads - confirm against the loader docs.
    yt_loader = YoutubeLoader(yt_id, add_video_info=True)
    yt_docs.extend(yt_loader.load_and_split(splitter))

In [None]:
# Manipulate / extend source attribute:
# turn each document's "source" into "<title> [<author>] https://youtu.be/<source>".
for doc in yt_docs:
    source = doc.metadata["source"]
    # Notebook cells get re-run; a source that already carries the link was
    # rewritten by an earlier run and must not receive a second prefix.
    if "https://youtu.be/" in source:
        continue
    doc.metadata["source"] = (
        f'{doc.metadata["title"]} [{doc.metadata["author"]}] https://youtu.be/{source}'
    )

# Vector store: embed the documents and index them with FAISS.
yt_store = FAISS.from_documents(yt_docs, OpenAIEmbeddings())

## Option 2: YouTube transcripts with timestamps

In [None]:
# Create transcript df
def create_transcript_df(yt_transcript: list, yt_id: str) -> pd.DataFrame:
    """Merge transcript snippets into 3-minute windows with timestamped links.

    Args:
        yt_transcript: Transcript records; each record must provide a "text"
            entry and a "start" entry (offset in seconds from the video start).
        yt_id: ID of the video, used to build the link for every window.

    Returns:
        A DataFrame with one row per 3-minute window and two columns:
        "text" (the window's snippets joined by single spaces) and
        "source" ("https://youtu.be/<yt_id>?t=<window start in seconds>").
        An empty transcript yields an empty DataFrame with the same columns.
    """
    # Without any records there is no "start" column to work with.
    if not yt_transcript:
        return pd.DataFrame(columns=["text", "source"])

    windows = (
        pd.DataFrame(yt_transcript)
        # Offsets become timestamps counted from the epoch so they can be resampled.
        .assign(start_dt=lambda x: pd.to_datetime(x["start"], unit="s"))
        .set_index("start_dt")
        .resample("3min")
        .agg({"text": " ".join})
        .reset_index()
    )

    # Full elapsed seconds since the video start. Reading only the minute
    # component would wrap around to zero once a video passes the one-hour mark.
    offset_seconds = (
        (windows["start_dt"] - pd.Timestamp(0)).dt.total_seconds().astype(int)
    )

    # "?t=" starts the query string; with "&t=" the offset would not be a
    # query parameter because nothing precedes it.
    return windows.assign(
        source=f"https://youtu.be/{yt_id}?t=" + offset_seconds.astype(str)
    ).drop(columns=["start_dt"])

In [None]:
# Videos to process; the trailing comments give each video's title.
yt_ids = [
    "OtD8wVaFm6E",  # XGBoost Part 1 (of 4): Regression
    "8b1JEDvenQU",  # XGBoost Part 2 (of 4): Classification
    "ZVFeW798-2I",  # XGBoost Part 3 (of 4): Mathematical Details
    "oRrKeUCEbq8",  # XGBoost Part 4 (of 4): Crazy Cool Optimizations
]

# One windowed transcript frame per video.
transcript_dfs = []
for yt_id in tqdm(yt_ids, desc="Fetching transcription"):
    yt_transcript = YouTubeTranscriptApi.get_transcript(yt_id)
    transcript_dfs.append(
        create_transcript_df(yt_transcript=yt_transcript, yt_id=yt_id)
    )

# Stack all videos into a single frame with a fresh 0..n-1 index.
transcripts_df = pd.concat(transcript_dfs, ignore_index=True)

In [None]:
# Splitter for the windowed transcript text.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=1200, chunk_overlap=150)

# Parallel lists: yt_docs[i] is a text chunk, yt_meta[i] is its metadata.
yt_docs, yt_meta = [], []

# Only the "text" and "source" columns are needed, so they are iterated directly.
rows = zip(transcripts_df["text"], transcripts_df["source"])
for text, source in tqdm(rows, total=len(transcripts_df)):
    splits = text_splitter.split_text(text)
    yt_docs.extend(splits)
    # One separate dict per chunk, so changing one chunk's metadata later
    # cannot silently change that of its siblings.
    yt_meta.extend({"source": source} for _ in splits)
    print(f"Split {source} into {len(splits)} chunks")

# Sanity check before anything is embedded: every chunk needs exactly one metadata entry.
assert len(yt_docs) == len(yt_meta)

# The vector store is built in the next cell. Building it here as well would
# embed every chunk twice.

In [None]:
# Embed the transcript chunks and index them with FAISS; each chunk keeps its
# timestamped "source" link as metadata.
yt_ts_store = FAISS.from_texts(
    texts=yt_docs,
    embedding=OpenAIEmbeddings(),
    metadatas=yt_meta,
)

# Question Answering Bot

In [None]:
from langchain.memory import ConversationBufferMemory

# Prompt text with three placeholders: the retrieved context, the chat history
# so far, and the current question.
template = """You are a chatbot having a conversation with a human.
    Given the following extracted parts of a long document and a question,
    create a final answer.
    {context}
    {chat_history}
    Human: {question}
    Chatbot:"""

question_prompt = PromptTemplate(
    template=template,
    input_variables=["chat_history", "question", "context"],
)

# Conversation memory: exposed to the prompt as "chat_history", reading the
# "question" input and the "answer" output of each call.
memory = ConversationBufferMemory(
    return_messages=True,
    output_key="answer",
    input_key="question",
    memory_key="chat_history",
)

In [None]:
# Build the question-answering chain that also reports its sources.
yt_ts_chain = RetrievalQAWithSourcesChain.from_llm(
    # temperature=0.0 keeps the answers as deterministic as the model allows.
    llm=OpenAI(temperature=0.0),
    # The number of chunks to retrieve has to be passed via search_kwargs;
    # a bare k=... keyword is not forwarded to the similarity search.
    retriever=yt_ts_store.as_retriever(search_kwargs={"k": 4}),
    memory=memory,
    question_prompt=question_prompt,
)

In [None]:
# The chain queried here was built on yt_ts_store (sources with timestamps).
# Build it on yt_store instead if you prefer sources without timestamps.

question = "What is the difference in building a tree for a regression case compared to a classification case?"

# return_only_outputs=True is passed so the inputs are left out of the result.
result = yt_ts_chain({"question": question}, return_only_outputs=True)