In [57]:
import sys
import os
from dotenv import find_dotenv, load_dotenv

In [58]:
# Env variable
sys.path.append("../")
load_dotenv(find_dotenv())

True

# 1. Confluence Loader 

In [59]:
username = os.getenv("CONFLUENCE_USERNAME")
api_token = os.getenv("CONFLUENCE_API_KEY")
base_url = os.getenv("CONFLUENCE_BASE_URL")
space_key = os.getenv("CONFLUENCE_SPACE_KEY")


In [60]:
from langchain_community.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
    url=base_url,
    username=username,
    api_key=api_token,
    space_key=space_key,
    limit=10,
    # include_attachments=True, # uncomment to include png, jpeg, ..
    max_pages=50,
    keep_markdown_format=True
)

In [61]:
docs = loader.load()

In [62]:
# Look at one page content and its metadata
print("Content: \n ------- \n" + docs[-1].page_content)
print("Metadatas: \n ------- \n" + str(docs[-1].metadata))

Content: 
 ------- 


The Basingstoke Maplewood office has a number of options for team meals but these may not be immediately obvious.

1. **Pizza from Pizzeria Gali**  
   <https://pizzeriagali.co.uk/>  
   [info@pizzeriagali.co.uk](mailto:info@pizzeriagali.co.uk)  
   Open: 12 - 2pm. Closed Mondays.  
   This is freshly cooked Pizza with quality ingredients. We’ve ordered food for 30 people from here and he’s more than happy to accommodate. Give him some advance notice but normally payment is on the day via credit card. He’s located next to Fitness First on the business park. He will deliver but it’s so close pickup might be easier. Large order was around £400. For small teams there’s also the option to drop into their restaurant which is about a ten minute walk.
2. **Mannicitas Food**  
   <https://mannicitas.com/our-menu/>   
   This is a local lady who caters for businesses. Minimum order size would be about 10 people. We’ve sampled her food and it was excellent. Website doesn’t 

In [63]:
def pretty_print(chunks):
    print(
        str("\n" + "=" * 50 + "\n").join(
            [chunk.page_content + "\n" + "-" * 50 + "\n" + str(chunk.metadata) for chunk in chunks]
        )
    )

## 2. Document Splitter 

### Document Example

In [64]:
from langchain_core.documents import Document

text = """
# I am a title
## I am a subtitle

I am a block of text. However, my size is quite long. I would first like the MarkdownHeaderTextSplitter
to identify my title and subtitle in its metadata.

I then want RecursiveCharacterTextSplitter to identify the two parts that compose me
because my size would be too large to feed a language model.

Finally, I would like the metadata corresponding to my origins, namely the url, to be merged with my title and subtitle
information.
"""

metadata = {"url": "https://my_origin.com"}

sample = Document(page_content=text, metadata=metadata)

### MarkdownHeaderTextSplitter example

In [65]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Markdown
headers_to_split_on = [
    ("#", "Heading 1"),
    ("##", "Heading 2"),
    ("###", "Heading 3"),
]

# Markdown splitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
chunks = markdown_splitter.split_text(sample.page_content)

print(chunks)

[Document(metadata={'Heading 1': 'I am a title', 'Heading 2': 'I am a subtitle'}, page_content='I am a block of text. However, my size is quite long. I would first like the MarkdownHeaderTextSplitter\nto identify my title and subtitle in its metadata.  \nI then want RecursiveCharacterTextSplitter to identify the two parts that compose me\nbecause my size would be too large to feed a language model.  \nFinally, I would like the metadata corresponding to my origins, namely the url, to be merged with my title and subtitle\ninformation.')]


### RecursiveCharacterTextSplitter example

In [66]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=20,
    separators=["#", r"\n\n", r"\n", r"(?<=\. )", " ", ""],
)

splitted_chunks = splitter.split_documents(chunks)

pretty_print(splitted_chunks)

I am a block of text. However, my size is quite long. I would first like the MarkdownHeaderTextSplitter
to identify my title and subtitle in its metadata.  
I then want RecursiveCharacterTextSplitter to identify the two parts that compose me
because my
--------------------------------------------------
{'Heading 1': 'I am a title', 'Heading 2': 'I am a subtitle'}
me
because my size would be too large to feed a language model.  
Finally, I would like the metadata corresponding to my origins, namely the url, to be merged with my title and subtitle
information.
--------------------------------------------------
{'Heading 1': 'I am a title', 'Heading 2': 'I am a subtitle'}


### MarkdownHeaderTextSplitter & RecursiveCharacterTextSplitter

In [67]:
# Markdown
headers_to_split_on = [
    ("#", "Heading 1"),
    ("##", "Heading 2"),
    ("###", "Heading 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split based on markdown and add original metadata
md_docs = []
for doc in [sample]:
    md_doc = markdown_splitter.split_text(doc.page_content)
    for i in range(len(md_doc)):
        md_doc[i].metadata = md_doc[i].metadata | doc.metadata
    md_docs.extend(md_doc)

# RecursiveTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk size big enough
splitter = RecursiveCharacterTextSplitter(
    chunk_size=256, chunk_overlap=20, separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
)

splitted_docs = splitter.split_documents(md_docs)

pretty_print(splitted_docs)

I am a block of text. However, my size is quite long. I would first like the MarkdownHeaderTextSplitter
to identify my title and subtitle in its metadata.  
I then want RecursiveCharacterTextSplitter to identify the two parts that compose me
--------------------------------------------------
{'Heading 1': 'I am a title', 'Heading 2': 'I am a subtitle', 'url': 'https://my_origin.com'}
because my size would be too large to feed a language model.  
Finally, I would like the metadata corresponding to my origins, namely the url, to be merged with my title and subtitle
information.
--------------------------------------------------
{'Heading 1': 'I am a title', 'Heading 2': 'I am a subtitle', 'url': 'https://my_origin.com'}


In [68]:
def my_custom_splitter(docs):
    # Markdown
    headers_to_split_on = [
        ("#", "Heading 1"),
        ("##", "Heading 2"),
        ("###", "Heading 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # Split based on markdown and add original metadata
    md_docs = []
    for doc in docs:
        md_doc = markdown_splitter.split_text(doc.page_content)
        for i in range(len(md_doc)):
            md_doc[i].metadata = md_doc[i].metadata | doc.metadata
        md_docs.extend(md_doc)

    # RecursiveTextSplitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Chunk size big enough
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500, chunk_overlap=20, separators=["\n\n", "\n", r"(?<=\. )", " ", ""]
    )

    splitted_docs = splitter.split_documents(md_docs)
    return splitted_docs

In [69]:
chunks = my_custom_splitter(docs)

# 3. Embeddings & Vector DB 

In [70]:
persist_directory = "./db/chroma"

In [71]:
# Embeddings
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [72]:
import shutil

try:
    shutil.rmtree(persist_directory)
except FileNotFoundError:
    pass

In [73]:
# Save db
from langchain.vectorstores import Chroma

db = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
db.persist()

OperationalError: attempt to write a readonly database

In [46]:
# Count the number of chunks in the vector store
db._collection.count()

389

In [47]:
# db.get()
retriever = db.as_retriever()
# retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.3})

In [48]:
from langchain.prompts import PromptTemplate

template = """Given this text extracts:
    -----
    {context}
    -----
    Please answer with to the following question:
    Question: {question}
    Answer: 
    """

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [50]:
# LLM
from langchain_openai import OpenAI

llm = OpenAI(streaming=True)

In [51]:
from langchain.chains import RetrievalQA

chain_type_kwargs = {"prompt": prompt}

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # or
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [52]:
question = "Best Pizza place near the Basingstoke office?"

query = {"query": question}
answer = qa(query)

  answer = qa(query)




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [53]:
from IPython.display import display_markdown

display_markdown(answer["result"], raw=True)

 Pizzeria Gali, located next to Fitness First on the business park, offers freshly cooked Pizza with quality ingredients. They can accommodate large orders with advance notice and accept credit card payment on the day. For smaller teams, there is also the option to dine in at their restaurant, which is about a ten minute walk from the office. 

### Display documents used by the LLM for answering

In [56]:
retriever.get_relevant_documents("Best Pizza place near the Basingstoke office?")

[Document(metadata={'id': '4714168344', 'source': 'https://easypark.jira.com/wiki/spaces/EP/pages/4714168344/Places+for+team+lunch+around+Maplewood', 'title': 'Places for team lunch around Maplewood', 'when': '2024-08-19T15:54:43.947Z'}, page_content='The Basingstoke Maplewood office has a number of options for team meals but these may not be immediately obvious.  \n1. **Pizza from Pizzeria Gali**\n<https://pizzeriagali.co.uk/>\n[info@pizzeriagali.co.uk](mailto:info@pizzeriagali.co.uk)\nOpen: 12 - 2pm. Closed Mondays.'),
 Document(metadata={'id': '4714168344', 'source': 'https://easypark.jira.com/wiki/spaces/EP/pages/4714168344/Places+for+team+lunch+around+Maplewood', 'title': 'Places for team lunch around Maplewood', 'when': '2024-08-19T15:54:43.947Z'}, page_content='This is freshly cooked Pizza with quality ingredients. We’ve ordered food for 30 people from here and he’s more than happy to accommodate. Give him some advance notice but normally payment is on the day via credit card. H