### Mini RAG 

General imports

In [1]:
import os
import pdb
import dotenv
import csv

In [2]:
from pathlib import Path
from uuid import uuid4

In [3]:
# Langchain controls
from langchain.globals import set_debug

Langchain imports

In [4]:
# llm model
from langchain_openai import ChatOpenAI

# Vector database
from langchain_core.vectorstores import InMemoryVectorStore

# Embeddings
from langchain_openai import OpenAIEmbeddings

# Prompts and messages
from langchain.prompts import (
  PromptTemplate, ChatPromptTemplate
)

# Documents processing
import fitz
from langchain.schema import Document

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Parsers and runnables
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# Chains
from langchain.chains.retrieval_qa.base import RetrievalQA

In [9]:
# Load environment variables from a .env file into os.environ;
# the OpenAI clients created further down read OPENAI_API_KEY from there
dotenv.load_dotenv()

True

In [11]:
# LangChain global debug mode: disabled here; pass True instead to enable it
set_debug(False)

In [17]:
# Chat model settings.
# NOTE: "api_version" is stored in the dict but is not forwarded to ChatOpenAI below.
model_params = dict(
  model_name="gpt-4o-mini",
  api_version="2023-05-15",
  temperature=0.05,
  max_tokens=4000,
  top_p=0.95,
)

# Instantiate the LLM as a chat model; the sampling settings are taken
# from model_params and the key from the environment
llm = ChatOpenAI(
  api_key=os.environ.get("OPENAI_API_KEY"),
  model=model_params["model_name"],
  **{name: model_params[name] for name in ("temperature", "max_tokens", "top_p")},
)

In [18]:
# Embeddings client: model name kept in its own variable, API key read from the environment
embeddings_model: str = "text-embedding-ada-002"
embeddings_service = OpenAIEmbeddings(
  api_key=os.environ.get("OPENAI_API_KEY"),
  model=embeddings_model,
)

In [19]:
# Create the vector database: an in-memory store built on top of embeddings_service.
# Both the CSV rows and the PDF chunks are added to this same store later on.
#* There are more robust alternatives to this, such as Pinecone, Chroma
vdb = InMemoryVectorStore(embeddings_service)

#### Document source: CSV file

In [20]:
# Load a .csv file and turn every data row into one LangChain Document
file_folder = "documents"
file_name = "mi-tax-table.csv"
file_path = os.path.join(file_folder, file_name)

# newline="" is what the csv module requires so that line breaks inside
# quoted fields are parsed correctly
with open(file_path, "r", encoding="utf-8", newline="") as f:
  reader = csv.reader(f)
  # The first row is the header; the None default keeps an empty file
  # from raising StopIteration
  header = next(reader, None)
  # csv.reader yields [] for blank lines; skip those (and all-blank rows)
  # so no empty document is sent to the embeddings model
  rows = [row for row in reader if any(cell.strip() for cell in row)]

# Build the documents to insert into the vdb: one per row, cells joined
# with single spaces, tagged with the file they came from
documents: list = [
  Document(
    id=str(uuid4()),
    page_content=" ".join(row),
    metadata={"source": file_name},
  )
  for row in rows
]

# Preview the first document (raises IndexError if the file had no data rows)
documents[0]

Document(id='8d7fdedc-0ef3-4471-8b9d-3875f3d73808', metadata={'source': 'mi-tax-table.csv'}, page_content='ST LOUIS COUNTY TAX 1% 66.600-66.630')

In [21]:
# Insert the CSV documents into the vector database; the cell output is the list of stored ids
vdb.add_documents(documents=documents)

['8d7fdedc-0ef3-4471-8b9d-3875f3d73808',
 'd9f04c45-b125-49df-83fd-53d0bb0cebf2',
 '9ee4410e-5a67-45d2-9ba2-ce3a7437db64',
 '18615f54-624a-47c7-8804-dd86ee9243f2',
 '395de47e-fd71-41cd-8133-7c79ebeca69e',
 '8ac025c0-2623-4fa6-9dfb-4f1ddfd94708',
 'f7916ec9-50fb-44b3-be2f-54a5931afb0e',
 'f5a1d386-4afc-478e-9b2d-05fcf628f610',
 'dedc4bcb-13d4-4d28-a4b8-d4325a78ecf1',
 '9c94abb5-f09c-4b07-98e5-be79311afae6',
 'c4f918f5-8ead-4557-adb3-f2d00995495e',
 '7864d0ab-f756-4141-97da-ed4ff4efd2a4',
 '07490781-c87b-455d-8467-f78d3cc42073',
 'd0431fc1-1990-4185-b1b6-91163f8423fb',
 '5cd76ad4-3aed-40cd-98e5-6dedbab06a1e',
 'b622090a-544b-44fc-8298-d3c27a454146',
 '9879d50b-e6aa-419a-bd91-a58e0d9faaa7',
 '14bcf9e5-9894-4fed-b68c-fe2d7457422f',
 '18d8eb21-9a61-4f28-a4e4-566beec34d41',
 '64fcc2ea-7a5c-4a4e-a4cc-01bea88b97df',
 '819cad7b-c610-4acd-86e3-e4a33b5430c2',
 '01491731-0f13-443c-8155-486773e178f0',
 '57654da5-27c3-4e43-b754-d4ccfd07db4a',
 '77dc1943-8264-4d5e-9ff5-ecabd4f8516d',
 '2df752a4-e008-

In [22]:
# Similarity search on the vector database: two best matches, each with its score
query = "¿What is st louis county tax rate?"
results = vdb.similarity_search_with_score(query=query, k=2)

# Every result is a (document, score) pair
for matched_doc, match_score in results:
  print("Answer:::", matched_doc.page_content)
  print("Score:::", match_score)
  print("*" * 20)

Answer::: ST LOUIS COUNTY TAX 1% 66.600-66.630
Score::: 0.9023943986225309
********************
Answer::: ST LOUIS COUNTY ADDITIONAL SALES TAX 275/1000% 67.581
Score::: 0.877963188931247
********************


In [None]:
# QA chain = retriever + prompt + LLM

# Retriever: similarity search over the vector database, 3 documents per query
retriever = vdb.as_retriever(search_type="similarity", search_kwargs={"k": 3})

# Prompt for the qa chain; {context} and {question} are the template variables
prompt_text = """
You are an assistant specialized in answering questions about documents.
Your task is to use the information provided in the context to answer
the question.
Instructions: 
  1. Answer the following question based on the information
  2. Do not include unsolicited information, do not make up data, do not 
  include recommendations outside of the provided context.

Context:
{context}
Question:
{question}
"""
qa_prompt = PromptTemplate.from_template(prompt_text)

# "stuff" chain type with the custom prompt; the retrieved source documents
# are returned together with each answer
qa_chain = RetrievalQA.from_chain_type(
  llm=llm,
  chain_type="stuff",
  retriever=retriever,
  return_source_documents=True,
  chain_type_kwargs={"prompt": qa_prompt},
)

In [24]:
# QA test: send a question through the retrieval chain, then show the
# answer followed by the documents it was built from
query = "¿What is county tourism sales tax rate in Missouri?"
answer = qa_chain.invoke({"query": query})

print("Query:", answer["query"])
print("Answer:", answer["result"])

print("Source documents:")
for doc in answer["source_documents"]:
  print(doc.page_content)

Query: ¿What is county tourism sales tax rate in Missouri?
Answer: The county tourism sales tax rate in Missouri can be up to 7/8%.
Source documents:
COUNTY TOURISM SALES TAX UP TO 7/8% 67.671-67.685
ST LOUIS COUNTY ZOOGICAL SALES TAX 1/8, 1/4, 3/8, 1/2% (Cannot exceed 1/8% beginning 8/28/17) (Cannot be in excess of 1% combined) 67.547
ST LOUIS COUNTY ADDITIONAL SALES TAX 275/1000% 67.581


In [25]:
# Baseline for comparison: same question sent straight to the LLM, without the retrieval chain
llm_response = llm.invoke(input=query)
print(llm_response.content)

In Missouri, the county tourism sales tax rate can vary by county, as each county has the authority to impose its own sales tax for tourism-related purposes. Generally, the rates can range from 1% to 5%, depending on the specific county's regulations and decisions. 

To find the exact tourism sales tax rate for a specific county in Missouri, it's best to check with the local county government or their official website, as they will provide the most accurate and up-to-date information.


#### Document source: PDF document

In [26]:
# Load PDF document
file_folder = "documents"
file_name = "wa-taxes.pdf"
file_path = os.path.join(file_folder, file_name)

with open(file_path, "rb") as f:
  document_bytes = f.read()

# Splitter used to cut the PDF pages into chunks - to insert into the vdb
splitting_method = "recursive"  # informational only: not read anywhere in this cell
splitter_params = {
  "chunk_size": 800,
  "chunk_overlap": 50
}
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=splitter_params["chunk_size"],
  chunk_overlap=splitter_params["chunk_overlap"]
)

# Create the langchain list of documents: one Document per PDF page.
# The PDF is opened inside a `with` block so PyMuPDF closes it once the
# text has been extracted (it was previously left open).
document_with_metadata: list = []
with fitz.open(stream=document_bytes, filetype="pdf") as document_pdf:
  for page_number in range(document_pdf.page_count):
    page = document_pdf.load_page(page_number)
    page_text = page.get_text("text")
    document_with_metadata.append(
      Document(
        id=str(uuid4()),
        page_content=page_text,
        metadata={
          "titulo": file_name,
          # zero-based page index (first page is 0)
          "pagina": page_number
        }
      )
    )

# Split the per-page documents into chunks
document_splitted = text_splitter.split_documents(
  documents=document_with_metadata
)

# Ingest the chunks into the vector database; the cell output is the list of stored ids
vdb.add_documents(documents=document_splitted)

['7f4037ad-3ae1-4ad0-9832-bbfd0fd32332',
 'e8a821d2-a043-48d4-9dd0-f5b025110610',
 '06cd087a-88d8-447c-9c36-5bc40c3b89fc',
 '63355de8-ed73-42fd-830f-f2376727e297',
 '3e289554-320f-4acc-8393-3e18368cd1f6',
 '67f63f24-5ec6-452d-b077-00896875f7c7',
 '0cb9bc8e-09ee-48ee-bcd5-c59554dd6e3c',
 '6619ceb1-4e11-42fc-95d6-886572a08c21',
 '136a90ff-0df3-440b-bfd2-3284f0fd49d7',
 '2afee36a-a981-4956-b30b-f418443fe765',
 '4e219bfb-7188-49f2-bda6-c7e3eb050ba0',
 '63550fd2-50a0-40fa-8a5b-57758032e604',
 '9fb18b9d-7eb0-425b-89e6-6c9021100c0a',
 '6effe138-0fad-4783-ad22-5d018c67e652',
 '67057511-942e-4c4f-bd51-754aa4b92c46',
 '9c5435be-dae9-4f7f-9bba-2249f812f9b7',
 '515a46e2-52ad-41b7-bbb3-a53372d91287',
 '41a119ce-846c-4886-94a8-9d14d2ca9802',
 'e20d8223-04db-40d9-8a31-21108f81341b',
 '909982e9-1bb5-4e29-adbc-9c24b425928a',
 'c4c04cf6-ca7b-4913-a3e9-43a7637c2bb8',
 '3c934bc7-2203-49b4-be8a-ec79000c11fb',
 '4b1efa6a-ce17-46dc-9e74-02609e5a622b',
 'd16edd83-2b17-4141-ac32-b72593ae7e70',
 '91bc66f0-a26c-

In [27]:
# Semantic search: two best matches from the vector database, each with its score
query = "¿What are Tax rates for Zoo & Aquarium in WA?"
results = vdb.similarity_search_with_score(query=query, k=2)

# Every result is a (document, score) pair
for matched_doc, match_score in results:
  print("Answer:::", matched_doc.page_content)
  print("Score:::", match_score)
  print("*" * 20)

Answer::: Local Sales & Use Tax Components 
 
33 
 
Component Name: 
 
Zoo & Aquarium 
 
Revised Code of WA (RCW): 82.14.400 
Date Legislation Enacted: 
1999 
Date Last Modified: 
 
2020 
Statutory Expiration Date: 
None 
Imposed by: 
 
 
Voter Approval 
 
Description 
• 
Counties may levy a sales and use tax to fund costs associated with a zoo, aquarium, or wild life preservation, 
and parks 
 
Taxing Authorities  
• 
A county with a national park and a population of more than five hundred thousand and less than 1.5 million 
by joint request with metropolitan park district and a city with a population of more than one hundred fifty 
thousand 
 
Tax Base 
• 
This tax applies to retail sales taxable activity under chapter 82.08 and 82.12 RCW 
 
Tax Rate 
 
• 
Up to 0.001 
 
Fiscal Year Cap 
•
Score::: 0.8862839727968871
********************
Answer::: Local Sales & Use Tax Components 
 
34 
 
Line Code(s) 
 
• 
Sales Tax = 95  
• 
Use Tax = 96 
 
Use of funds 
• 
Costs associated with fi

In [28]:
# QA chain on the current `query`: show the answer followed by the
# documents it was built from
answer = qa_chain.invoke({"query": query})

print("Query:", answer["query"])
print("Answer:", answer["result"])

print("Source documents:")
for doc in answer["source_documents"]:
  print(doc.page_content)

Query: ¿What are Tax rates for Zoo & Aquarium in WA?
Answer: The tax rate for Zoo & Aquarium in Washington is up to 0.001.
Source documents:
Local Sales & Use Tax Components 
 
33 
 
Component Name: 
 
Zoo & Aquarium 
 
Revised Code of WA (RCW): 82.14.400 
Date Legislation Enacted: 
1999 
Date Last Modified: 
 
2020 
Statutory Expiration Date: 
None 
Imposed by: 
 
 
Voter Approval 
 
Description 
• 
Counties may levy a sales and use tax to fund costs associated with a zoo, aquarium, or wild life preservation, 
and parks 
 
Taxing Authorities  
• 
A county with a national park and a population of more than five hundred thousand and less than 1.5 million 
by joint request with metropolitan park district and a city with a population of more than one hundred fifty 
thousand 
 
Tax Base 
• 
This tax applies to retail sales taxable activity under chapter 82.08 and 82.12 RCW 
 
Tax Rate 
 
• 
Up to 0.001 
 
Fiscal Year Cap 
•
Local Sales & Use Tax Components 
 
34 
 
Line Code(s) 
 
• 
Sales

In [29]:
# Baseline for comparison: same question sent straight to the LLM, without the retrieval chain
llm_response = llm.invoke(input=query)
print(llm_response.content)

In Washington State, zoos and aquariums are generally subject to the state's sales tax, which is currently set at a base rate of 6.5%. However, local jurisdictions may impose additional sales taxes, which can vary by location. This means that the total sales tax rate for admissions and related sales at zoos and aquariums can be higher depending on the specific city or county.

Additionally, Washington State has specific tax exemptions for certain nonprofit organizations, which may apply to some zoos and aquariums if they meet the criteria.

For the most accurate and up-to-date information, it's best to consult the Washington State Department of Revenue or the specific zoo or aquarium in question.
