#### 1. Load the dependencies


In [1]:
pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


#### 2. Load the PDF


In [2]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("document.pdf")
pages = loader.load()

In [3]:
len(pages)

11

In [4]:
import pandas as pd
pd.DataFrame(pages)

Unnamed: 0,0,1,2
0,"(page_content, Attention Is All You Need\nAshi...","(metadata, {'source': 'document.pdf', 'page': 0})","(type, Document)"
1,"(page_content, Recurrent models typically fact...","(metadata, {'source': 'document.pdf', 'page': 1})","(type, Document)"
2,"(page_content, Figure 1: The Transformer - mod...","(metadata, {'source': 'document.pdf', 'page': 2})","(type, Document)"
3,"(page_content, Scaled Dot-Product Attention\n ...","(metadata, {'source': 'document.pdf', 'page': 3})","(type, Document)"
4,"(page_content, MultiHead( Q,K,V ) = Concat(hea...","(metadata, {'source': 'document.pdf', 'page': 4})","(type, Document)"
5,"(page_content, Table 1: Maximum path lengths, ...","(metadata, {'source': 'document.pdf', 'page': 5})","(type, Document)"
6,"(page_content, the input sequence centered aro...","(metadata, {'source': 'document.pdf', 'page': 6})","(type, Document)"
7,"(page_content, Table 2: The Transformer achiev...","(metadata, {'source': 'document.pdf', 'page': 7})","(type, Document)"
8,"(page_content, Table 3: Variations on the Tran...","(metadata, {'source': 'document.pdf', 'page': 8})","(type, Document)"
9,"(page_content, References\n[1]Jimmy Lei Ba, Ja...","(metadata, {'source': 'document.pdf', 'page': 9})","(type, Document)"


In [5]:
page = pages[0]
print(page)

page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brain\nnoam@google.comNiki Parmar∗\nGoogle Research\nnikip@google.comJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.comAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring signiﬁcan

In [6]:
print(page.page_content[0:500])

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.comNoam Shazeer∗
Google Brain
noam@google.comNiki Parmar∗
Google Research
nikip@google.comJakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.comAidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.eduŁukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
conv


In [7]:
page.metadata

{'source': 'document.pdf', 'page': 0}

In [8]:
page.metadata['source']

'document.pdf'

In [9]:
page.type

'Document'

#### 3. Load environment variables


In [10]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)


token = os.environ['HUGGINGFACEHUB_API_TOKEN']
openai.api_key = os.environ['OPENAI_API_KEY']

##### 4. Document Splitting to Chunks


In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [12]:
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=150,
    length_function=len
)

In [13]:
chunks = text_splitter.split_documents(pages)

In [14]:
len(pages)

11

In [15]:
len(chunks)

22

#### 5. Create Embeddings for the chunks


In [16]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [17]:
embedding = HuggingFaceEmbeddings()

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


#### 6. VectorStores


In [18]:
from langchain.vectorstores import Chroma

In [19]:
persist_dir = 'docs/chroma_document_pdf'

In [20]:
!rm -rf 'docs/chroma_document_pdf'

In [21]:
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory=persist_dir
)

In [22]:
vectordb.persist()

  warn_deprecated(


In [23]:
print(vectordb._collection.count())

22


#### Similarity Search on the documents


In [24]:
question = "What does this document explain about?"

In [25]:
vectordb.similarity_search(question, k=3)

[Document(page_content='sequence (y1,...,y m)of symbols one element at a time. At each step the model is auto-regressive\n[9], consuming the previously generated symbols as additional input when generating the next.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N= 6 identical layers. Each layer has two\nsub-layers. The ﬁrst is a multi-head self-attention mechanism, and the second is a simple, position-\n2', metadata={'page': 1, 'source': 'document.pdf'}),
 Document(page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprin

#### Retrieval


In [26]:
from langchain.llms import Ollama

question = "Who are the primary authors of this document?"
docs_ss = vectordb.similarity_search(question, k=3)
print(docs_ss)

[Document(page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprint\narXiv:1609.08144 , 2016.\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with\nfast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.\n11', metadata={'page': 10, 'source': 'document.pdf'}), Document(page_content='sequence (y1,...,y m)of symbols one element at a time. At each step the model is auto-regressive\n[9], consuming the previously generated symbols as additional input when generating the next.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Dec

In [27]:
docs_mmr = vectordb.max_marginal_relevance_search(question, k=3)
print(docs_mmr)

[Document(page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprint\narXiv:1609.08144 , 2016.\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with\nfast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.\n11', metadata={'page': 10, 'source': 'document.pdf'}), Document(page_content='sequence (y1,...,y m)of symbols one element at a time. At each step the model is auto-regressive\n[9], consuming the previously generated symbols as additional input when generating the next.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Dec

In [28]:
docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source": "document.pdf"}
)

In [29]:
for d in docs:
    print(d.metadata)

{'page': 10, 'source': 'document.pdf'}
{'page': 1, 'source': 'document.pdf'}
{'page': 9, 'source': 'document.pdf'}


In [30]:
from langchain.llms import Ollama
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [31]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The document is a chunk and is from, should be one of `document.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page is one of the pages from a research paper.",
        type="integer",
    ),
]

In [32]:
document_content_description = "The document is a research paper."
llm = Ollama(model='llama3', temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [33]:
question = "Who are the authors of this document?"

In [34]:
docs = retriever.get_relevant_documents(question)
print(docs)

  warn_deprecated(


[Document(page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprint\narXiv:1609.08144 , 2016.\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with\nfast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.\n11', metadata={'page': 10, 'source': 'document.pdf'}), Document(page_content='[21] Minh-Thang Luong, Hieu Pham, and Christopher D Manning. Effective approaches to attention-\nbased neural machine translation. arXiv preprint arXiv:1508.04025 , 2015.\n[22] Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention\nmodel. In Empirical Methods in Natural Language Processing , 2016.\n[23] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstracti

In [35]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [36]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" +
          d.page_content for i, d in enumerate(docs)]))

In [37]:
# Wrap our vectorstore
llm = Ollama(temperature=0, model="llama3")
compressor = LLMChainExtractor.from_llm(llm)

In [38]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [39]:
from langchain.chains import RetrievalQA

In [40]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [41]:
result = qa_chain({"query": question})

  warn_deprecated(


In [42]:
result["result"]

"I don't know the answer to that question. The provided text appears to be a list of references with citations and does not contain information about the authors of the document."

#### Prompt


In [43]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

In [44]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [45]:
result = qa_chain({"query": question})

In [46]:
result["source_documents"][0]

Document(page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprint\narXiv:1609.08144 , 2016.\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with\nfast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.\n11', metadata={'page': 10, 'source': 'document.pdf'})

In [47]:
import pprint
pp = pprint.PrettyPrinter(width=50)

In [48]:
pp.pprint(result["result"])

('The authors of this document are listed at the '
 'end. They include:\n'
 '\n'
 '* Yonghui Wu\n'
 '* Mike Schuster\n'
 '* Zhifeng Chen\n'
 '* Quoc V Le\n'
 '* Mohammad Norouzi\n'
 '* Wolfgang Macherey\n'
 '* Maxim Krikun\n'
 '* Yuan Cao\n'
 '* Qin Gao\n'
 '* Klaus Macherey\n'
 '\n'
 'And many others, as listed in the references.\n'
 '\n'
 'Thanks for asking!')


In [49]:
print(result["source_documents"][0])

page_content='[31] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang\nMacherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine\ntranslation system: Bridging the gap between human and machine translation. arXiv preprint\narXiv:1609.08144 , 2016.\n[32] Jie Zhou, Ying Cao, Xuguang Wang, Peng Li, and Wei Xu. Deep recurrent models with\nfast-forward connections for neural machine translation. CoRR , abs/1606.04199, 2016.\n11' metadata={'page': 10, 'source': 'document.pdf'}


In [50]:
import pprint

# Get the output
output = result["result"]

# If the output is a string, replace newline characters with spaces


if isinstance(output, str):
    output = output.replace('\n', ' ')

# Create a pretty printer
pp = pprint.PrettyPrinter(width=100)

# Use the pretty printer to print the output
pp.pprint(output)

('The authors of this document are listed at the end. They include:  * Yonghui Wu * Mike Schuster '
 '* Zhifeng Chen * Quoc V Le * Mohammad Norouzi * Wolfgang Macherey * Maxim Krikun * Yuan Cao * '
 'Qin Gao * Klaus Macherey  And many others, as listed in the references.  Thanks for asking!')


In [51]:
def chat_bishop(question, history):
    from langchain.prompts import PromptTemplate

    # Build prompt
    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
    {context}
    Question: {question}
    Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    result = qa_chain({"query": question})

    import pprint
    pp = pprint.PrettyPrinter(width=50)

    # Use the pretty printer to print the long string
    pp.pprint(result["result"])

    # response = llm.invoke(message)
    return result["result"]

In [52]:
import gradio as gr
gradio_interface = gr.ChatInterface(
    chat_bishop,
    chatbot=gr.Chatbot(),
    textbox=gr.Textbox(placeholder="Example: What is your Question?",
                       container=False, scale=7),
    title="Hey, welcome. Please ask your Question",
    description=f"Ask the chatbot a question!",
    theme='gradio/base',
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",

)

gradio_interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




#### 2. Load the Excel


In [53]:
import pandas as pd
excel_data = pd.read_excel("data.xlsx")
excel_data.head()

##### 4. Excel Data Splitting to Chunks


In [54]:
excel_text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=2000,
    chunk_overlap=150,
    length_function=len
)

In [55]:
excel_chunks = excel_text_splitter.split_text(excel_data.to_string())

In [56]:
len(excel_chunks)

5

#### 5. Create Embeddings for the Excel chunks


In [57]:
excel_embeddings = [embedding.embed_query(chunk) for chunk in excel_chunks]

#### 6. VectorStores for Excel


In [58]:
excel_vectordb = Chroma.from_texts(
    texts=excel_chunks,
    embedding=embedding,
    persist_directory='docs/chroma_excel_data'
)

In [59]:
excel_vectordb.persist()

In [60]:
print(excel_vectordb._collection.count())

5


#### Similarity Search on the Excel data


In [61]:
excel_question = "What are the sales figures for Q1?"

In [62]:
excel_vectordb.similarity_search(excel_question, k=1)

[Document(page_content='   Product  Sales_Q1  Sales_Q2  Sales_Q3  Sales_Q4\n0  Product A      1000      1500      2000      2500\n1  Product B      2000      2500      3000      3500\n2  Product C      3000      3500      4000      4500\n3  Product D      4000      4500      5000      5500\n4  Product E      5000      5500      6000      6500', metadata={})]

#### Retrieval for Excel data


In [63]:
excel_docs_ss = excel_vectordb.similarity_search(excel_question, k=1)
print(excel_docs_ss)

[Document(page_content='   Product  Sales_Q1  Sales_Q2  Sales_Q3  Sales_Q4\n0  Product A      1000      1500      2000      2500\n1  Product B      2000      2500      3000      3500\n2  Product C      3000      3500      4000      4500\n3  Product D      4000      4500      5000      5500\n4  Product E      5000      5500      6000      6500', metadata={})]


In [64]:
excel_docs_mmr = excel_vectordb.max_marginal_relevance_search(excel_question, k=1)
print(excel_docs_mmr)

[Document(page_content='   Product  Sales_Q1  Sales_Q2  Sales_Q3  Sales_Q4\n0  Product A      1000      1500      2000      2500\n1  Product B      2000      2500      3000      3500\n2  Product C      3000      3500      4000      4500\n3  Product D      4000      4500      5000      5500\n4  Product E      5000      5500      6000      6500', metadata={})]


In [65]:
excel_docs = excel_vectordb.similarity_search(
    excel_question,
    k=1,
    filter={}
)

In [66]:
for d in excel_docs:
    print(d.metadata)

{}


In [67]:
excel_retriever = SelfQueryRetriever.from_llm(
    llm,
    excel_vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [68]:
excel_docs = excel_retriever.get_relevant_documents(excel_question)
print(excel_docs)

In [69]:
excel_compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=excel_vectordb.as_retriever()
)

In [70]:
excel_qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=excel_vectordb.as_retriever()
)

In [71]:
excel_result = excel_qa_chain({"query": excel_question})

In [72]:
excel_result["result"]

'The sales figures for Q1 are as follows: Product A: 1000, Product B: 2000, Product C: 3000, Product D: 4000, Product E: 5000. Thanks for asking!'

In [73]:
excel_qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=excel_vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [74]:
excel_result = excel_qa_chain({"query": excel_question})

In [75]:
excel_result["source_documents"][0]

Document(page_content='   Product  Sales_Q1  Sales_Q2  Sales_Q3  Sales_Q4\n0  Product A      1000      1500      2000      2500\n1  Product B      2000      2500      3000      3500\n2  Product C      3000      3500      4000      4500\n3  Product D      4000      4500      5000      5500\n4  Product E      5000      5500      6000      6500', metadata={})

In [76]:
pp.pprint(excel_result["result"])

('The sales figures for Q1 are as follows: Product A: 1000, Product B: 2000, Product C: 3000, '
 'Product D: 4000, Product E: 5000. Thanks for asking!')


In [77]:
print(excel_result["source_documents"][0])

page_content='   Product  Sales_Q1  Sales_Q2  Sales_Q3  Sales_Q4\n0  Product A      1000      1500      2000      2500\n1  Product B      2000      2500      3000      3500\n2  Product C      3000      3500      4000      4500\n3  Product D      4000      4500      5000      5500\n4  Product E      5000      5500      6000      6500' metadata={}


In [78]:
output = excel_result["result"]
if isinstance(output, str):
    output = output.replace('\n', ' ')
pp.pprint(output)

('The sales figures for Q1 are as follows: Product A: 1000, Product B: 2000, Product C: 3000, '
 'Product D: 4000, Product E: 5000. Thanks for asking!')


In [79]:
def chat_bishop_excel(question, history):
    result = excel_qa_chain({"query": question})
    pp.pprint(result["result"])
    return result["result"]

In [80]:
gradio_interface_excel = gr.ChatInterface(
    chat_bishop_excel,
    chatbot=gr.Chatbot(),
    textbox=gr.Textbox(placeholder="Example: What is your Question?",
                       container=False, scale=7),
    title="Hey, welcome. Please ask your Question",
    description=f"Ask the chatbot a question!",
    theme='gradio/base',
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",

)

gradio_interface_excel.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


