# LangChain Chat with Your Data - Building Chatbot using LLM and RAG
Credit: deeplearning.ai

In [None]:
%load_ext autoreload
%autoreload 2

import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it. The result is kept in
# env_loader, which no later visible cell reads.
env_loader = load_dotenv(find_dotenv()) # read local .env file

# Direct indexing raises KeyError right here when OPENAI_API_KEY is not set.
openai.api_key  = os.environ['OPENAI_API_KEY']

# Loading Document

In [None]:
from langchain.document_loaders import PyPDFLoader
# Load the AlphaFold paper from the local docs folder.
loader = PyPDFLoader("../docs/alphafold_nature.pdf")
pages = loader.load()
# Sanity check on the first loaded entry: a 200-character text preview and
# its metadata. pages[0] raises IndexError if nothing was loaded.
page = pages[0]
print(page.page_content[:200])
print(page.metadata)

In [None]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# url="https://www.youtube.com/watch?v=7q8Uw3rmXyE"  # "what is alphafold" tutorial video
# save_dir="../docs/youtube/"
# youtube_loader = YoutubeAudioLoader([url],save_dir)  # download failed; it may require a premium subscription (unconfirmed)

# whisper_loader = OpenAIWhisperParser()
# docs = whisper_loader.load()

In [None]:
from langchain.document_loaders import WebBaseLoader

# Fetch the AlphaFold database landing page as a document.
loader = WebBaseLoader("https://alphafold.ebi.ac.uk/")
# NOTE(review): 'verify': False presumably turns off TLS certificate
# verification for the HTTP request — confirm this is an intentional
# workaround and avoid it outside of local experiments.
loader.requests_kwargs = {'verify':False}
docs = loader.load()
# Preview the first 500 characters of the first loaded document.
print(docs[0].page_content[:500])

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
# Load the Notion export stored under ../docs/shawn_notion.
loader = NotionDirectoryLoader("../docs/shawn_notion")
docs = loader.load()
# Inspect the first document: 500-character preview plus metadata.
# docs[0] raises IndexError if the directory yielded no documents.
doc=docs[0]
print(doc.page_content[:500])
print(doc.metadata)

# Splitting Document

In [None]:
from langchain.document_loaders import NotionDirectoryLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
loader = NotionDirectoryLoader("../docs/shawn_notion")
docs = loader.load()

# Concatenate all pages into one markdown string. Pages are separated by a
# blank line rather than a single space: the header splitter recognises a
# heading only at the start of a line, so a space-joined "…last line # Title"
# would hide the first heading of every page after the first.
markdown_document = "\n\n".join(d.page_content for d in docs)

# Markdown splitter: split on the first three heading levels and record the
# heading text under the given metadata keys.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)
# Number of header-delimited chunks produced.
print(len(md_header_splits))

# # recursive character text splitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size = 1500,
#     chunk_overlap = 150
# )
# text_splits = text_splitter.split_text(markdown_document)
# print(len(text_splits))

# Vectorstores and Embedding

In [None]:
import numpy as np
import tiktoken
MAX_TOKEN_EMBEDDING = 150000  # TPM

# Initialize the tokenizer for the 'text-embedding-ada-002' model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Calculate the total number of tokens across all header splits
total_tokens = sum(len(tokenizer.encode(doc.page_content)) for doc in md_header_splits)

# np.ceil returns a float; convert to int so the values can be used as counts
# (e.g. as a range() step). Clamp to at least one batch so an empty corpus
# does not cause a division by zero on the next line.
num_batch = max(1, int(np.ceil(total_tokens / MAX_TOKEN_EMBEDDING)))
# Average number of tokens per batch (a token count, not a document count).
batch_size = int(np.ceil(total_tokens / num_batch))

print(f"Total number of tokens: {total_tokens}, split into {num_batch} batches of size {batch_size} tokens")


In [None]:
import os
import shutil

# Directory used for the persisted Chroma vector store; created if missing,
# and left untouched if it already exists (exist_ok=True).
persist_directory = '../docs/chroma/'
os.makedirs(persist_directory, exist_ok=True)

# # Check if the directory exists
# if os.path.exists(persist_directory):
#     # Remove all files and subdirectories
#     shutil.rmtree(persist_directory)
#     print(f"All files in {persist_directory} have been removed.")
# else:
#     print(f"The directory {persist_directory} does not exist.")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding = OpenAIEmbeddings()  # model used: text-embedding-ada-002, limit: 150,000 TPM

# Group the splits into batches by token count. The batch_size computed in
# the token-count cell is measured in tokens (and was a float), so it cannot
# be used as a document stride; instead, documents are accumulated greedily
# until adding the next one would exceed MAX_TOKEN_EMBEDDING.
doc_batches = []
current_batch, current_tokens = [], 0
for doc in md_header_splits:
    doc_tokens = len(tokenizer.encode(doc.page_content))
    if current_batch and current_tokens + doc_tokens > MAX_TOKEN_EMBEDDING:
        doc_batches.append(current_batch)
        current_batch, current_tokens = [], 0
    current_batch.append(doc)
    current_tokens += doc_tokens
if current_batch:
    doc_batches.append(current_batch)

# NOTE(review): the limit is per minute; if more than one batch is produced,
# a pause between batches may still be needed — confirm against the account's
# rate limits.
for batch_number, batch in enumerate(doc_batches, start=1):
    # Every call targets the same persist_directory.
    vectordb = Chroma.from_documents(
        documents=batch,
        embedding=embedding,
        persist_directory=persist_directory
    )
    print(f"Batch #{batch_number}")
    # Collection size after this batch was written.
    print(vectordb._collection.count())

# vectordb = Chroma.from_texts(
#     texts=text_splits,
#     embedding=embedding,
#     persist_directory=persist_directory
# )

# Retrieval

In [None]:
import re
def contains_chinese(text):
    """Report whether *text* holds at least one character in U+4E00..U+9FFF.

    Returns True as soon as any such character is present, otherwise False
    (including for the empty string).
    """
    found = re.search(r'[\u4e00-\u9fff]', text)
    return found is not None

def pretty_print_docs(docs):
    """Print each document's page_content under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.
    Each item in *docs* must expose a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, document in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + document.page_content)
    print(divider.join(sections))


In [None]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Re-open the Chroma store persisted under ../docs/chroma/ instead of
# re-embedding; the same embedding class is supplied for query embedding.
persist_directory = '../docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)
# Number of entries currently in the underlying collection.
print(vectordb._collection.count())

In [None]:
# Plain similarity search: request the 3 closest chunks for the question.
question = "What is my career path? Where have I been working?"
docs = vectordb.similarity_search(question, k=3)
# Alternative kept for reference: MMR search (k=5 results out of fetch_k=10 candidates).
# docs = vectordb.max_marginal_relevance_search(question, k=5, fetch_k=10)
pretty_print_docs(docs)


In [None]:
# Same question, restricted to chunks whose "Header 1" metadata equals
# "Agenda / Topics". The result is stored in docs; nothing is printed here.
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"Header 1":"Agenda / Topics"}
)

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Descriptions of the metadata fields passed to the self-query retriever.
# The names match the "Header 1"/"Header 2" keys used when splitting the
# markdown by headers.
metadata_field_info = [
    AttributeInfo(
        name="Header 1",
        description="Top category of a page",
        type="string",
    ),
    AttributeInfo(
        name="Header 2",
        description="Subcategory of a page",
        type="string",
    ),
]

In [None]:
# Build a self-query retriever over the vector store from the content
# description and the metadata field descriptions.
document_content_description = "Personal Notion Database"
llm = OpenAI(model='gpt-3.5-turbo-instruct', temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)
# NOTE(review): docs is not produced by the retriever built above; it still
# holds whatever an earlier cell assigned (the metadata-filtered search, if
# cells ran in order). Confirm this print is intended to live in this cell.
pretty_print_docs(docs)

In [None]:
# Run the self-query retriever on the current question and show only the
# metadata of each returned document.
docs = retriever.get_relevant_documents(question)
for d in docs:
    print(d.metadata)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Wrap our vectorstore: an LLM-based extractor is used as the compressor on
# top of the store's default retriever.
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [None]:
# Query through the compression retriever and print the compressed results.
question = "What is my career path? Where have I been working?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

In [None]:
# Rebuild the compression retriever with an MMR base retriever (reusing the
# compressor created earlier) and rerun the same question.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")  # max marginal relevance search
)

compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

# Answer Questions with Context

In [None]:
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
# Re-open the persisted vector store so this section can run on its own.
persist_directory = '../docs/chroma/'
embedding = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

# Chat model used for question answering; llm_name is also read by load_db.
from langchain_openai import ChatOpenAI
llm_name = "gpt-3.5-turbo-0125"
llm = ChatOpenAI(model_name=llm_name, temperature=0)

In [None]:
from langchain.chains import RetrievalQA

# Minimal RetrievalQA chain over the store's default retriever.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)
# question is whatever an earlier cell last assigned.
result = qa_chain({"query": question})
# Last expression of the cell: the notebook displays the answer text.
result["result"]

In [None]:
from langchain.prompts import PromptTemplate

# Build prompt: the template has two placeholders, {context} and {question},
# and instructs the model to be concise and to admit when it does not know.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
    {context}

    Question: {question}
    Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [None]:
# Run chain: MMR base retriever limited to k=3, wrapped with the compressor
# created in the retrieval section.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr", search_kwargs={"k":3})  # max marginal relevance search
)

# QA chain that uses the custom prompt and also returns the source documents.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",  # "stuff", 'refine', 'map_reduce'
    retriever=compression_retriever, # vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},  # only work with "stuff" chain type
)


In [None]:
# Ask a question through the QA chain; the whole result is displayed as the
# cell's last expression.
question = "Who is Shawn"
result = qa_chain({"query": question})
result

# Memory

In [None]:
from langchain.memory import ConversationBufferMemory
# Conversation memory stored under the "chat_history" key; it is passed to
# the ConversationalRetrievalChain built in a later cell.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [None]:
from langchain.chains import ConversationalRetrievalChain

# Run chain: same compressed MMR retriever (k=3) as in the QA section.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr", search_kwargs={"k": 3})  # max marginal relevance search
)

# Conversational chain; chat history is handled by the memory object, so
# callers only pass the question.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=compression_retriever,
    memory=memory
)

In [None]:
# Ask through the conversational chain; the result is stored but not
# displayed in this cell.
question = "Who wrote these notes?"
result = qa({"question": question})

# Chatbot Demo

In [None]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA,  ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

In [None]:
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF.

    Args:
        file: Path of the PDF handed to ``PyPDFLoader``.
        chain_type: Forwarded as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: Value passed as the ``"k"`` search kwarg of the similarity
            retriever.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``,
        configured to return source documents and the generated question.
        No memory object is attached; chat history is managed by the caller.
    """
    # Read the PDF and cut it into overlapping chunks.
    pdf_pages = PyPDFLoader(file).load()
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = chunker.split_documents(pdf_pages)

    # Embed the chunks into an in-memory index and expose it as a retriever.
    index = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())
    top_k_retriever = index.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # llm_name is read from module scope at call time.
    chat_model = ChatOpenAI(model_name=llm_name, temperature=0)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type=chain_type,
        retriever=top_k_retriever,
        return_source_documents=True,
        return_generated_question=True,
    )


In [None]:
import panel as pn
import param

class cbfs(param.Parameterized):
    """Chatbot state and Panel callbacks for the chat-with-your-data dashboard.

    Parameters managed by ``param``:
        chat_history: list of ``(query, answer)`` tuples sent back to the
            chain on every turn.
        answer: text of the most recent answer.
        db_query: the chain's ``generated_question`` from the last turn.
        db_response: the chain's ``source_documents`` from the last turn.

    Several methods read the module-level widgets ``file_input``,
    ``button_load`` and ``inp``; these must exist before the callbacks fire.
    """

    chat_history = param.List([])
    answer = param.String("")
    db_query = param.String("")
    db_response = param.List([])

    def __init__(self, **params):
        """Initialise state and build the chain for the default PDF."""
        super().__init__(**params)
        # Rendered conversation rows, appended to on every turn.
        self.panels = []
        self.loaded_file = "../docs/alphafold_nature.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Rebuild the chain from the uploaded PDF and report the loaded file.

        Args:
            count: click count of the load button; 0 means initial render.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

        file_input.save("temp.pdf")  # local copy
        button_load.button_style = "outline"
        try:
            self.qa = load_db("temp.pdf", "stuff", 4)
        finally:
            # Restore the button even if loading raised, so it is not left
            # in the "outline" state.
            button_load.button_style = "solid"
        # Record the new file name only after the chain was rebuilt.
        self.loaded_file = file_input.filename
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Run one chat turn and return the updated conversation widget.

        Args:
            query: user text; a falsy value yields an empty placeholder row.

        Returns:
            A scrollable ``pn.WidgetBox`` with all conversation rows so far.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.extend([(query, result["answer"])])
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            # `styles` (not `style`) to match every other pane in this class.
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    # The dependency name must match the declared parameter exactly; the
    # previous 'db_query ' (trailing space) named no parameter of this class.
    @param.depends('db_query')
    def get_lquest(self):
        """Return a column showing the last question sent to the database."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Return a widget listing the last source documents, or None if empty."""
        if not self.db_response:
            return
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Return a widget listing every ``(query, answer)`` pair in the history."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset the chat history; ``count`` is accepted for button callbacks and ignored."""
        self.chat_history = []
        return


In [None]:
# create chatbot
# llm_name is read by load_db, which cbfs() calls during construction.
llm_name = "gpt-3.5-turbo-0125"

cb = cbfs()

# Widgets referenced by name from inside the cbfs methods
# (file_input, button_load, inp).
pn.extension()
file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

# Bind callbacks: loading is driven by the load button's click count, the
# conversation by the text input.
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp) 

# Tab 1: text input and the running conversation.
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
# Tab 2: last generated DB question and the source documents returned.
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
# Tab 3: the chat history list.
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
# Tab 4: file upload / reload and the clear-history button.
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
)
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
# Last expression of the cell: the notebook renders the dashboard.
dashboard