**Overview**
- Create an AI chatbot that can answer questions about 2025 tax filing using embedded tax docs as knowledge base

**Steps**
1. Document embedding
2. Pinecone for vector storage
3. GPT for answer generation
4. Streamlit for chatbot UI

In [31]:
import fitz  # PyMuPDF
import re
import os
from dotenv import load_dotenv

# load api keys for openai and pinecone
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Chunk & preprocess documents
* convert pdfs to text
* break text into manageable chunks (e.g. 500-1000 tokens) for embedding

In [18]:
def extract_text_pymupdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

def clean_pdf_text(text):
    # Replace multiple newlines with single spaces
    text = re.sub(r'\n+', ' ', text)
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

pdf_text = extract_text_pymupdf("./i1040gi.pdf")

# Clean the extracted text
pdf_text = clean_pdf_text(pdf_text)

# print(pdf_text[:500])  # Print first 500 characters
print(pdf_text)


Instructions for Form 1040 (2024) Catalog Number 24811V Dec 16, 2024 Department of the Treasury Internal Revenue Service www.irs.gov Future Developments 2024 Changes R INSTRUCTIONS See What’s New in these instructions. See IRS.gov and IRS.gov/Forms, and for the latest information about developments related to Forms 1040 and 1040-SR and their instructions, such as legislation enacted after they were published, go to IRS.gov/Form1040. Free File is the fast, safe, and free way to prepare and e-ﬁle your taxes. See IRS.gov/FreeFile. Direct File is a permanent option to ﬁle individual federal tax returns online—for free—directly and securely with the IRS starting in 2025. See DirectFile.IRS.gov. Pay Online. It’s fast, simple, and secure. Go to IRS.gov/Payments. Including the instructions for Schedules 1 through 3 2024 TAX YEAR R 1040 (and 1040-SR) Table of Contents Contents Page Contents Page What's New . . . . . . . . . . . . . . . . . . . . . . . . 6 Filing Requirements . . . . . . . . . .

In [19]:
len(pdf_text)

547065

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
"""
Notice the difference - pdf_text vs [pdf_text]. The create_documents() method expects a list of texts, not a single string. When you pass a single string, it treats each character as a separate document.
"""
chunks = text_splitter.create_documents([pdf_text])


In [21]:
chunks

[Document(metadata={}, page_content="Instructions for Form 1040 (2024) Catalog Number 24811V Dec 16, 2024 Department of the Treasury Internal Revenue Service www.irs.gov Future Developments 2024 Changes R INSTRUCTIONS See What’s New in these instructions. See IRS.gov and IRS.gov/Forms, and for the latest information about developments related to Forms 1040 and 1040-SR and their instructions, such as legislation enacted after they were published, go to IRS.gov/Form1040. Free File is the fast, safe, and free way to prepare and e-ﬁle your taxes. See IRS.gov/FreeFile. Direct File is a permanent option to ﬁle individual federal tax returns online—for free—directly and securely with the IRS starting in 2025. See DirectFile.IRS.gov. Pay Online. It’s fast, simple, and secure. Go to IRS.gov/Payments. Including the instructions for Schedules 1 through 3 2024 TAX YEAR R 1040 (and 1040-SR) Table of Contents Contents Page Contents Page What's New . . . . . . . . . . . . . . . . . . . . . . . . 6 Fi

In [22]:
len(chunks)

608

In [23]:
chunks[0].page_content

"Instructions for Form 1040 (2024) Catalog Number 24811V Dec 16, 2024 Department of the Treasury Internal Revenue Service www.irs.gov Future Developments 2024 Changes R INSTRUCTIONS See What’s New in these instructions. See IRS.gov and IRS.gov/Forms, and for the latest information about developments related to Forms 1040 and 1040-SR and their instructions, such as legislation enacted after they were published, go to IRS.gov/Form1040. Free File is the fast, safe, and free way to prepare and e-ﬁle your taxes. See IRS.gov/FreeFile. Direct File is a permanent option to ﬁle individual federal tax returns online—for free—directly and securely with the IRS starting in 2025. See DirectFile.IRS.gov. Pay Online. It’s fast, simple, and secure. Go to IRS.gov/Payments. Including the instructions for Schedules 1 through 3 2024 TAX YEAR R 1040 (and 1040-SR) Table of Contents Contents Page Contents Page What's New . . . . . . . . . . . . . . . . . . . . . . . . 6 Filing Requirements . . . . . . . . . 

# Generate embeddings
* use OpenAI, HuggingFace, or Cohere to covert text chunks into embeddings

In [24]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)
vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks])


In [25]:
len(vectors)

608

In [13]:
# # using huggingface
# from langchain.embeddings import HuggingFaceEmbeddings  
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks]) 

# using sentence_transformers
# from langchain.embeddings import SentenceTransformerEmbeddings
# embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks]) 

# # using OpenAI
# from langchain.embeddings import OpenAIEmbeddings
# embedding_model = OpenAIEmbeddings(openai_api_key='')
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks])     

# Store embeddings in Pinecone
* use Pinecone to store the embeddings for efficient retrieval
* ensure you have a Pinecone index created and configured

In [None]:
from pinecone import Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
pc.list_indexes()

# # Delete multiple indexes
# pc.delete_index('tax-rag2')
# pc.delete_index('tax-rag3')

[
    {
        "name": "tax-rag",
        "metric": "cosine",
        "host": "tax-rag-n6gatrn.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "vector_type": "dense",
        "dimension": 1536,
        "deletion_protection": "disabled",
        "tags": null
    }
]

In [27]:
from pinecone import Pinecone, ServerlessSpec
import time
import tqdm

pc = Pinecone(api_key=PINECONE_API_KEY)

# Step 1: Create index if not exists
index_name = "tax-rag"
if index_name not in [idx.name for idx in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1536,  # text-embedding-3-small
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # free tier region
        )
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

# Step 2: Get the index host
index_info = pc.describe_index(index_name)
index_host = index_info.host

# Step 3: Connect using host
index = pc.Index(host=index_host)

# Step 4: Upsert embeddings
for i in tqdm.tqdm(range(len(vectors))):
    index.upsert([
        (f"id-{i}", vectors[i], {"text": chunks[i].page_content})
    ])


100%|██████████| 608/608 [01:23<00:00,  7.31it/s]


In [30]:
# Verify upload
stats = index.describe_index_stats()
print(f"Index now contains {stats.total_vector_count} vectors")

Index now contains 608 vectors


# Finally, RUN app.py 

Run the following in the terminal:

streamlit run app.py