# First RAG pipeline

This is a minimal baseline pipeline, built to get something working end to end. Experiments and improvements will be added as development continues.

In [15]:
# Imports

from lxml import etree

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Annoy

from langchain.llms import GPT4All
from langchain.chains import RetrievalQA

# Namespace prefix -> URI map, passed as `namespaces=` to lxml find()/findall().
# The 'uslm' URI is also used to build fully qualified "{uri}tag" element names.
NS = {'uslm': 'http://xml.house.gov/schemas/uslm/1.0',
      'xhtml': 'http://www.w3.org/1999/xhtml'}

Read in data

In [10]:
def get_ancestor_heading_text(section, tag, ns):
    """Return the heading text of the nearest enclosing ``uslm:<tag>`` element.

    Walks up from ``section`` via ``getparent()`` until it reaches an ancestor
    whose qualified tag equals ``{ns['uslm']}tag``.

    Args:
        section: Element to start from (its own tag is not checked).
        tag: Local tag name of the ancestor to look for, e.g. ``'chapter'``.
        ns: Prefix-to-URI mapping; must contain a ``'uslm'`` entry.

    Returns:
        The stripped heading text of the first matching ancestor, or ``""``
        when there is no such ancestor or it has no direct ``uslm:heading``
        child.
    """
    qualified_tag = f"{{{ns['uslm']}}}{tag}"
    ancestor = section.getparent()
    while ancestor is not None:
        if ancestor.tag == qualified_tag:
            heading = ancestor.find('uslm:heading', namespaces=ns)
            if heading is None:
                return ""
            # `heading.text` is None when the heading is empty or starts with a
            # child element, and it only covers the text before the first
            # child, so collect the text of the whole heading subtree instead.
            return "".join(heading.itertext()).strip()
        ancestor = ancestor.getparent()
    return ""

def parse_sections_with_metadata(file_path):
    """Parse a USLM XML file into one record per ``uslm:section`` element.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A list of dicts, one per section found, each with:
          * ``"metadata"``: dict with keys ``section_head``, ``subtitle``,
            ``chapter`` and ``part`` (heading strings, ``""`` when absent).
          * ``"content"``: the section's non-empty ``uslm:p`` texts joined
            with newlines.
    """
    with open(file_path, 'rb') as f:
        tree = etree.parse(f)

    sections = tree.findall('.//uslm:section', namespaces=NS)
    parsed = []

    for section in sections:
        heading = section.find('uslm:heading', namespaces=NS)
        # `heading.text` is None for an empty heading or one that starts with a
        # child element, so gather the text of the whole heading subtree.
        heading_text = "".join(heading.itertext()).strip() if heading is not None else ""

        # Get all paragraphs (and any nested elements)
        content_texts = []
        for p in section.findall('.//uslm:p', namespaces=NS):
            text = ' '.join(p.itertext()).strip()
            if text:
                content_texts.append(text)

        # Get ancestors: subtitle, chapter, part
        subtitle = get_ancestor_heading_text(section, 'subtitle', NS)
        chapter = get_ancestor_heading_text(section, 'chapter', NS)
        part = get_ancestor_heading_text(section, 'part', NS)

        parsed.append({
            "metadata": {
                "section_head": heading_text,
                "subtitle": subtitle,
                "chapter": chapter,
                "part": part
                },
            "content": "\n".join(content_texts)
        })

    return parsed

In [11]:
# Parse ./usc26.xml. Despite the `_dict` name, the result is a list holding one
# {"metadata": ..., "content": ...} dict per section.
data_dict = parse_sections_with_metadata("./usc26.xml")

Ingest & Chunk Data

In [12]:
# Turn every parsed section into a LangChain Document (text + metadata)
documents = list(
    Document(page_content=entry["content"], metadata=entry["metadata"])
    for entry in data_dict
)

# Break the documents into smaller, overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
chunked_docs = text_splitter.split_documents(documents)

Embed and Store

In [13]:
# Embedding model for the chunks -- subject to change... may need GPUs
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed the chunked docs and build the Annoy vector store from them
vector_store = Annoy.from_documents(
    chunked_docs,
    embedding_model,
)

# Persist the index to disk (optional)
vector_store.save_local("annoy_tax_code_index")

Build RAG

In [19]:
# Use a local GPT4All model (orca-mini 3B, GGUF file) as the LLM.
# NOTE: the model path is an absolute, machine-specific path; adjust it locally.
llm = GPT4All(model="/home/chandlernick/.local/share/nomic.ai/GPT4All/orca-mini-3b-gguf2-q4_0.gguf", n_threads=4, backend="gptj")

# Reload vector store (if needed)
# vector_store = Annoy.load_local("annoy_tax_code_index", embedding_model)

# The number of results must be passed through `search_kwargs`; a bare `k=5`
# given to as_retriever() does not reach the similarity search.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"  # Or map_reduce for long documents
)

Ask Tax Law Question

In [20]:
query = "Is the income from renting out my garage taxable under federal law?"
# invoke() returns a dict (the inputs plus the chain output), not a bare string.
response = qa_chain.invoke(query)

print("Answer:")
# RetrievalQA stores the generated answer under the "result" key.
print(response["result"])

  response = qa_chain.run(query)
Exception ignored on calling ctypes callback function: <function LLModel._callback_decoder.<locals>._raw_callback at 0x7c0de02992d0>
Traceback (most recent call last):
  File "/home/chandlernick/anaconda3/envs/general/lib/python3.10/site-packages/gpt4all/_pyllmodel.py", line 573, in _raw_callback
    def _raw_callback(token_id: int, response: bytes) -> bool:
KeyboardInterrupt: 


Answer:
 The rental of a garage is generally considered a nontaxable activity, as it is considered an accessory use of the property and not a separate trade or business. However, if you have a mortgage on your property and are paying
