## Example of RAG using Unstructured

Retrieval-augmented generation (RAG) over documents in different formats (HTML, PPTX, and PDF), using an OpenAI LLM, OpenAI embeddings, and ChromaDB as the vector store.

In [9]:
import json
import os

from IPython.display import JSON

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError
from unstructured.partition.html import partition_html
from unstructured.partition.pptx import partition_pptx
from unstructured.staging.base import dict_to_elements, elements_to_json
from unstructured.partition.auto import partition
from unstructured.chunking.title import chunk_by_title
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.prompts.prompt import PromptTemplate
from langchain_openai import OpenAI
from langchain.chains import ConversationalRetrievalChain, LLMChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# OpenAIEmbeddings / OpenAI clients further down — confirm.
load_dotenv()

True

In [4]:
# Partition each source document into Unstructured elements: format-specific
# partitioners for the HTML and PPTX files, the generic `partition` entry
# point for the PDF.
elements_html = partition_html(filename="documents/medium_blog.html")
elements_ppt = partition_pptx(filename="documents/msft_openai.pptx")

# `filename` stays bound to the PDF path afterwards, as before.
filename = "documents/CoT.pdf"
elements_pdf = partition(filename)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [5]:
# Pool the elements of all three documents (HTML, then PPTX, then PDF) and
# group them into chunks with `chunk_by_title`.
all_elements = elements_html + elements_ppt + elements_pdf
elements = chunk_by_title(all_elements)

In [68]:
def _element_to_document(element):
    """Wrap one chunked element as a LangChain ``Document``.

    The element's metadata dict is copied over and its ``filename`` value is
    mirrored under ``source``.
    NOTE(review): ``source`` is presumably the key the QA-with-sources chain
    reports in its ``SOURCES:`` line — confirm.
    """
    metadata = element.metadata.to_dict()
    metadata["source"] = metadata["filename"]
    return Document(page_content=element.text, metadata=metadata)


# One Document per chunk, in chunk order.
documents = [_element_to_document(element) for element in elements]

In [80]:
# Strip metadata values the vector store cannot persist before indexing.
# `filter_complex_metadata` comes from `langchain_community.vectorstores.utils`
# (imported in the top import cell), the same package that provides `Chroma`,
# instead of the deprecated `langchain.vectorstores` path.
documents = filter_complex_metadata(documents)

# Embed every document with OpenAI embeddings and index them in Chroma.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents, embeddings)

# Similarity retriever configured to return 3 chunks per query (k=3).
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [76]:
# Sanity-check retrieval directly against the vector store and show the top hit.
query = "What is the Unstructured API about?"
docs = vectorstore.similarity_search(query)
docs[0]

Document(page_content='Why Use Unstructured API?\n\nUse cases for the Unstructured open-source library and API are vast; the possibilities are endless, from analyzing customer feedback in real time to extracting insights from large document repositories. While the Unstructured open-source library offers robust functionality, using the Unstructured API brings several compelling advantages:\n\nEase of Use: Managing dependencies is unnecessary with the API. All you need is an API key, and you’re ready to go.', metadata={'file_directory': 'documents', 'filename': 'medium_blog.html', 'filetype': 'text/html', 'last_modified': '2024-04-13T20:07:55', 'orig_elements': 'eJzNlE1v2zAMhv8K4XM+bMdO6uww9LBDgX0dWvSwFYYs0bZQWTL0kS0t+t9HOQ0QpAW2Ajv0aJEvSb166B+PCSocUPtaimQLScPTLCtXLYq1qLBiqwyL8oI1ZVW0bYlVMoNkQM8E84zyHxPOPHbG7muBo+/pKKWMViqshbTIPYViXWF4iG1c8hzWbMAYGFDIMNSNMt2i94M6xv1+nOIef/vlMaCY7gLr0FHkR4K6S+6mU+frwQjZSpwukad5MU+Leba6ztNtutmWZVSPpKx1GBq0lJU9zQ7Vo+K238ONQ7jRztvAfbAo4PL71ceoO45yLb3ChGTnpvF

In [62]:
# Answering prompt: the model receives a question plus extracted document
# context, and is told to reply in Markdown or say "I don't know" rather than
# invent an answer. It declares two input variables: `question` and `context`.
template = """You are an AI assistant for answering questions.
You are given the following extracted parts of a document and a question. Provide a conversational answer.
If you don't know the answer, just say "I don't know." Don't try to make up an answer.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""
prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=template,
)

In [82]:
# Deterministic completions (temperature 0) for both chains below.
llm = OpenAI(temperature=0)

# The question generator rewrites a follow-up question into a standalone one.
# ConversationalRetrievalChain calls it with `question` and `chat_history`
# (and only when the history is non-empty), so its prompt must declare exactly
# those two variables. The QA-style `prompt` defined earlier requires
# `context`, which is never supplied at this step, so using it here would
# raise a missing-input error on the first follow-up turn.
condense_question_prompt = PromptTemplate(
    template=(
        "Given the following conversation and a follow up question, rephrase "
        "the follow up question to be a standalone question, in its original "
        "language.\n\n"
        "Chat History:\n"
        "{chat_history}\n"
        "Follow Up Input: {question}\n"
        "Standalone question:"
    ),
    input_variables=["chat_history", "question"],
)

# Answers over the retrieved chunks with a map-reduce QA-with-sources chain.
doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")
question_generator_chain = LLMChain(llm=llm, prompt=condense_question_prompt)

# Wire retrieval, question condensing, and answering into one conversational chain.
qa_chain = ConversationalRetrievalChain(
    retriever=retriever,
    question_generator=question_generator_chain,
    combine_docs_chain=doc_chain,
)

In [83]:
# Ask a first question with an empty chat history and display only the answer text.
response = qa_chain.invoke(
    {
        "question": "What is the Unstructured API about?",
        "chat_history": [],
    }
)
response["answer"]

' The Unstructured API allows for real-time analysis of customer feedback and extraction of insights from large document repositories.\nSOURCES: medium_blog.html'