**Overview**
- Create an AI chatbot that can answer questions about 2025 tax filing using embedded tax docs as knowledge base

**Steps**
1. Document embedding
2. Pinecone for vector storage
3. GPT for answer generation
4. Streamlit for chatbot UI

In [1]:
from dotenv import load_dotenv
load_dotenv()

import os
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Chunk & preprocess documents
* convert pdfs to text
* break text into manageable chunks (e.g. 500-1000 tokens) for embedding

In [2]:
import fitz  # PyMuPDF

def extract_text_pymupdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text += page.get_text()
    return text

pdf_text = extract_text_pymupdf("./i1040gi.pdf")
print(pdf_text[:500])  # Print first 500 characters


Line 
Instructions
for
Forms 1040
and 1040-SR
Also see the instructions for Schedule 1 through Schedule 3 that follow the 
Form 1040 and 1040-SR instructions.
What form to file. Everyone can file Form 1040. Form 1040-SR is available to you if 
you were born before January 2, 1960.
Fiscal year filers. If you are a fiscal year filer using a tax year other than January 1 
through December 31, 2024, enter the beginning and ending months of your fiscal 
year in the entry space provided at the top of 


In [3]:
len(pdf_text)

5128

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.create_documents(pdf_text)


In [5]:
chunks

[Document(metadata={}, page_content='L'),
 Document(metadata={}, page_content='i'),
 Document(metadata={}, page_content='n'),
 Document(metadata={}, page_content='e'),
 Document(metadata={}, page_content='I'),
 Document(metadata={}, page_content='n'),
 Document(metadata={}, page_content='s'),
 Document(metadata={}, page_content='t'),
 Document(metadata={}, page_content='r'),
 Document(metadata={}, page_content='u'),
 Document(metadata={}, page_content='c'),
 Document(metadata={}, page_content='t'),
 Document(metadata={}, page_content='i'),
 Document(metadata={}, page_content='o'),
 Document(metadata={}, page_content='n'),
 Document(metadata={}, page_content='s'),
 Document(metadata={}, page_content='f'),
 Document(metadata={}, page_content='o'),
 Document(metadata={}, page_content='r'),
 Document(metadata={}, page_content='F'),
 Document(metadata={}, page_content='o'),
 Document(metadata={}, page_content='r'),
 Document(metadata={}, page_content='m'),
 Document(metadata={}, page_conten

In [6]:
chunks[0].page_content

'L'

# Generate embeddings
* use OpenAI, HuggingFace, or Cohere to covert text chunks into embeddings

In [7]:
from langchain.embeddings import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)
vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks])


  embedding_model = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)


In [9]:
len(vectors)

4143

In [None]:
# # using huggingface
# from langchain.embeddings import HuggingFaceEmbeddings  
# embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks]) 

# using sentence_transformers
# from langchain.embeddings import SentenceTransformerEmbeddings
# embedding_model = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks]) 

# # using OpenAI
# from langchain.embeddings import OpenAIEmbeddings
# embedding_model = OpenAIEmbeddings(openai_api_key='')
# vectors = embedding_model.embed_documents([chunk.page_content for chunk in chunks])     

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Store embeddings in Pinecone
* use Pinecone to store the embeddings for efficient retrieval
* ensure you have a Pinecone index created and configured

In [10]:
from pinecone import Pinecone, ServerlessSpec
import time
import tqdm

pc = Pinecone(api_key=PINECONE_API_KEY)

# Step 1: Create index if not exists
if "tax-rag" not in [idx.name for idx in pc.list_indexes()]:
    pc.create_index(
        name="tax-rag",
        dimension=1536,  # text-embedding-3-small
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"  # free tier region
        )
    )
    while not pc.describe_index("tax-rag").status["ready"]:
        time.sleep(1)

# Step 2: Get the index host
index_info = pc.describe_index("tax-rag")
index_host = index_info.host

# Step 3: Connect using host
index = pc.Index(host=index_host)

# Step 4: Upsert embeddings
for i in tqdm.tqdm(range(len(vectors))):
    index.upsert([
        (f"id-{i}", vectors[i], {"text": chunks[i].page_content})
    ])


100%|██████████| 4143/4143 [09:52<00:00,  6.99it/s]


In [14]:
index

<pinecone.db_data.index.Index at 0x134559190>

#### RUN app.py INSTED OF THE REST OF THE NOTEBOOK 

# Build the Retrieval-Augmented Generation (RAG) pipeline

In [11]:
def retrieve_context(query):
    embedded_query = embedding_model.embed_query(query)
    results = index.query(vector=embedded_query, top_k=5, include_metadata=True)
    return [match['metadata']['text'] for match in results['matches']]

def generate_answer(query):
    context = "\n\n".join(retrieve_context(query))
    prompt = f"""You are a tax assistant. Answer based only on the following documents:\n\n{context}\n\nQ: {query}\nA:"""
    response = openai.ChatCompletion.create(
        model="gpt-5-nano",
        messages=[{"role": "user", "content": prompt}]
    )
    return response['choices'][0]['message']['content']


# Create a Chat UI with Streamlit

In [12]:
import streamlit as st

st.title("Chatbot")

query = st.text_input("Ask me a tax question:")
if query:
    with st.spinner("Searching..."):
        answer = generate_answer(query)
        st.write("**Answer:**", answer)


2025-08-15 17:23:06.423 
  command:

    streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2025-08-15 17:23:06.424 Session state does not function when running a script without `streamlit run`


In [13]:
! streamlit run /opt/anaconda3/lib/python3.12/site-packages/ipykernel_launcher.py

[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.88.17:8501[0m
[0m
NOTE: When using the `ipython kernel` entry point, Ctrl-C will not work.

To exit, you will have to explicitly quit this process, by either sending
"quit" from a client, or using Ctrl-\ in UNIX-like environments.

To read more about this, see https://github.com/ipython/ipython/issues/2049


To connect another client to this kernel, use:
    --existing kernel-3796.json
[IPKernelApp] ERROR | Unable to initialize signal:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in initialize
    self.init_signal()
  File "/opt/anaconda3/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 545, in init_signal
    signal.signal(signal.SIGINT, signal.SIG_IGN)
  File "/opt/anaconda3/lib/python3.12/signal.py", line 58, in signal
    ha