RAG Pipeline

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

Ingest & Chunk Data

High potential for expansion:

- German tax law (highest priority)

- More of the USC (easiest once we have a USC parser) -> no longer just a tax bot (a law bot)

- More specific US tax law

In [None]:
# Placeholder corpus: every entry pairs a section's raw text with its metadata.
plain_text_docs = [
    {"text": "Section 1. Gross income defined...", "metadata": {"section": "26 USC §1"}},
    {"text": "Section 61. General definition of gross income...", "metadata": {"section": "26 USC §61"}},
    # ... add more
]


def _as_document(record):
    """Build a LangChain Document from one {"text", "metadata"} record."""
    return Document(page_content=record["text"], metadata=record["metadata"])


# Splitter configured for 800-character chunks with a 100-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

# Convert the raw records into Document objects, then cut them into chunks
documents = [_as_document(record) for record in plain_text_docs]
chunked_docs = text_splitter.split_documents(documents)

Embed and Store

This has potential for expansion:

- Look at different embedding models: BERT classic, legal BERT, science BERT, tax BERT

- Fine-tune our own: give it sections and have it predict their headers (a 2257-class classification problem)

Remember: negative results (as long as there's something interesting in them) are also results in academia.


Look at the techniques for querying the vector DB well.

Evaluate the precision and recall of retrieval






In [None]:
# Embedding model used to vectorize the chunks
# (the OPENAI_API_KEY environment variable must be set)
embedding_model = OpenAIEmbeddings()

# Embed every chunk and index the vectors in a FAISS store
vector_store = FAISS.from_documents(
    documents=chunked_docs,
    embedding=embedding_model,
)

# Persist the index to a local folder so it can be reloaded later (optional)
vector_store.save_local(folder_path="faiss_tax_code_index")

Build RAG Chain

In [None]:
# Use ChatGPT as the LLM (temperature=0 keeps sampling as non-random as possible)
llm = ChatOpenAI(temperature=0)

# Reload vector store (if needed)
# vector_store = FAISS.load_local("faiss_tax_code_index", embedding_model)
# NOTE(review): newer LangChain releases appear to also require
# allow_dangerous_deserialization=True on load_local -- confirm against the
# installed version before uncommenting.

# Set up retriever and QA chain.
# The number of chunks to return has to be passed through `search_kwargs`;
# a bare `k=5` keyword on as_retriever() is not forwarded to the similarity
# search, so the retriever would not return the intended top 5 chunks.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",  # Or use "map_reduce" for long documents
)

Ask Tax Law Question

In [None]:
# Example natural-language question to send through the QA chain
query = "Is the income from renting out my garage taxable under federal law?"

# Run the chain on the question and keep its answer
response = qa_chain.run(query)

# Show a header line followed by the answer on the next line
print("Answer:", response, sep="\n")