In [12]:
import os
from dotenv import load_dotenv

# Run python-dotenv's loader before inspecting the environment.
load_dotenv()

# Read both settings first, then report them. Only the *presence* of the
# API key is printed, never its value.
python_path = os.getenv("PYTHONPATH")
has_openai_key = bool(os.getenv("OPENAI_API_KEY"))

print("PYTHONPATH is:", python_path)
print("OPENAI_API_KEY is set:", has_openai_key)

PYTHONPATH is: .
OPENAI_API_KEY is set: True


In [1]:
# Report which interpreter this kernel is running and which langchain
# packages are installed in *that* interpreter's environment.
#
# Shell escapes (`!which python`, `!pip list | grep ...`) resolve through the
# shell PATH, which can point at a different Python than the kernel, and they
# depend on `which`/`grep` being available. Asking the running interpreter
# directly avoids both problems.
import sys
from importlib import metadata

# Path of the interpreter executing this cell.
print(sys.executable)

# Collect installed distributions whose name contains "langchain".
# setdefault keeps the first occurrence if a distribution is discovered twice.
langchain_versions = {}
for dist in metadata.distributions():
    name = dist.metadata.get("Name") or ""  # skip entries with no usable name
    if "langchain" in name.lower():
        langchain_versions.setdefault(name, dist.version)

# Print an aligned "name  version" table; print nothing if no match was found
# (same as grep producing no lines).
if langchain_versions:
    width = max(map(len, langchain_versions))
    for name in sorted(langchain_versions, key=str.lower):
        print(f"{name:<{width}}  {langchain_versions[name]}")

/Users/ywchen/project/fleet_insights_rag/.venv/bin/python
langchain                 0.3.27
langchain-community       0.3.30
langchain-core            0.3.76
langchain-huggingface     0.3.1
langchain-openai          0.3.33
langchain-text-splitters  0.3.11


In [4]:
from src.embed_store import load_faiss
from src.splitters import load_config

# Read the project config and open the FAISS index stored at its "index_path".
cfg = load_config()
db = load_faiss(cfg["index_path"])

# Size of the index, measured by its index-to-docstore id mapping
docstore_ids = list(db.index_to_docstore_id.values())
print("Number of chunks:", len(docstore_ids))

# Show metadata and a 200-character preview for chunks 0 through 6
for position, chunk_id in enumerate(docstore_ids[:7]):
    chunk = db.docstore.search(chunk_id)
    meta = chunk.metadata
    print(f"--- Chunk {position} ---")
    print("Source:", meta.get("source"))
    print("Page:", meta.get("page"))
    print("Content:", chunk.page_content[:200], "...\n")

Number of chunks: 41
--- Chunk 0 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
Content: Fleet  Classification  Model:  Technical  
Report
 
&
 
Methodology
 
Date:  September  25,  2025   
Author(s):  Olivia  Chen  
Executive  Summary  
This  report  details  the  methodology,  developme ...

--- Chunk 1 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
Content: foundational
 
proxy.
 
By
 
training
 
this
 
model
 
on
 
our
 
rich
 
internal
 
customer
 
data—where
 
fleet
 
size
 
is
 
known
 
or
 
can
 
be
 
reliably
 
inferred—we
 
can
 
predict
 
a
 
pro ...

--- Chunk 2 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
Content: challenges
 
encountered
 
during
 
modeling
 
(such
 
as
 
severe
 
class
 
imbalance),
 
and
 
the
 
techniques
 
employed
 
to
 
build
 
a
 
high-performing
 
and
 
interpretable
 
model.
 
1.  Pro ...

--- Chunk 3 ---
So

In [5]:
import tiktoken

# Tokenizer used for counting; the author chose cl100k_base to match OpenAI embeddings.
enc = tiktoken.get_encoding("cl100k_base")

# Print the token count of chunks 0 through 6 (relies on `db` from an earlier cell).
for position, chunk_id in enumerate(db.index_to_docstore_id.values()):
    if position > 6:
        break
    chunk_text = db.docstore.search(chunk_id).page_content
    tokens = len(enc.encode(chunk_text))
    print(f"Chunk {position} â†’ {tokens} tokens")

Chunk 0 â†’ 218 tokens
Chunk 1 â†’ 214 tokens
Chunk 2 â†’ 217 tokens
Chunk 3 â†’ 47 tokens
Chunk 4 â†’ 229 tokens
Chunk 5 â†’ 223 tokens
Chunk 6 â†’ 212 tokens


In [6]:
# Retrieval smoke test: ask the index (`db` from an earlier cell) for three
# results on a placeholder query and show where each came from.
results = db.similarity_search("test query", k=3)
for hit in results:
    origin = hit.metadata.get("source")
    preview = hit.page_content[:200]
    print("Source:", origin)
    print("Content:", preview)

Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Content: registration
 
data
 
linked
 
to
 
businesses,
 
offering
 
insights
 
into
 
existing
 
fleet
 
size,
 
vehicle
 
composition
 
(e.g.,
 
light-duty
 
trucks
 
vs.
 
sedans),
 
makes,
 
models,
 
and
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Content: that
 
can
 
be
 
used
 
to
 
qualify
 
and
 
prioritize
 
prospective
 
leads.
 
The
 
final
 
RandomForest
 
model
 
is
 
not
 
only
 
statistically
 
robust
 
but
 
also
 
transparent
 
and
 
align
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Content: challenges
 
encountered
 
during
 
modeling
 
(such
 
as
 
severe
 
class
 
imbalance),
 
and
 
the
 
techniques
 
employed
 
to
 
build
 
a
 
high-performing
 
and
 
interpretable
 
model.
 
1.  Pro


In [7]:
# Same retrieval check with a domain question; prints source file and page,
# followed by a 200-character preview of each of the three results.
results = db.similarity_search("how local services impact fleet size", k=3)
for hit in results:
    meta = hit.metadata
    print(meta.get("source"), meta.get("page"))
    print(hit.page_content[:200], "...\n")

data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf 1
A  direct  indicator  of  a  company's  potential  is  its  fleet  size.  However,  this  crucial  piece  of  
information
 
is
 
an
 
internal
 
metric
 
that
 
is
 
rarely
 
available
 
for
 
prospe ...

data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf 0
identify
 
and
 
prioritize
 
companies
 
most
 
likely
 
to
 
become
 
valuable,
 
high-volume
 
customers.
 
1.2.  The  Core  Problem:  Missing  fleet_size Data  
1 ...

data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf 0
Fleet  Classification  Model:  Technical  
Report
 
&
 
Methodology
 
Date:  September  25,  2025   
Author(s):  Olivia  Chen  
Executive  Summary  
This  report  details  the  methodology,  developme ...



In [8]:
from src.embed_store import load_faiss
from src.splitters import load_config

# Reload the config and FAISS index inside this cell so it does not depend on
# `db` left over from earlier cells.
cfg = load_config()
db = load_faiss(cfg["index_path"])

# Fetch three results for the question and print a 400-character preview of each.
query = "how do local services impact fleet size"
hits = db.similarity_search(query, k=3)
for rank, hit in enumerate(hits, start=1):
    meta = hit.metadata
    print(f"--- Hit {rank} ---")
    print("Source:", meta.get("source"))
    print("Page:", meta.get("page"))
    print(hit.page_content[:400], "...\n")

--- Hit 1 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 1
A  direct  indicator  of  a  company's  potential  is  its  fleet  size.  However,  this  crucial  piece  of  
information
 
is
 
an
 
internal
 
metric
 
that
 
is
 
rarely
 
available
 
for
 
prospective
 
clients.
 
Relying
 
solely
 
on
 
data
 
where
 fleet_size is  present  would  severely  limit  our  ability  to  score  and  rank  the  broad  
universe
 
of
 
potential
 
leads.
 
1.3.  Our ...

--- Hit 2 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
identify
 
and
 
prioritize
 
companies
 
most
 
likely
 
to
 
become
 
valuable,
 
high-volume
 
customers.
 
1.2.  The  Core  Problem:  Missing  fleet_size Data  
1 ...

--- Hit 3 ---
Source: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
Fleet  Classification  Model:  Technical  
Report
 
&
 
Methodology
 
Date:  September  25,  2025   
Author(s):  Oli

In [10]:
## End-to-end check: retrieval from FAISS followed by LLM answer generation
from src.embed_store import load_faiss
from src.splitters import load_config
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI  # author's note: HuggingFaceHub is a free alternative

# Step 1 - configuration and vector index
cfg = load_config()
db = load_faiss(cfg["index_path"])

# Step 2 - retriever view of the index, limited to 3 documents per query
retriever = db.as_retriever(search_kwargs={"k": 3})

# Step 3 - chat model; temperature is pinned to 0
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Step 4 - question-answering chain that also hands back the retrieved documents
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, return_source_documents=True)

# Step 5 - run one question through the chain
query = "How naics code impact fleet size?"
result = qa.invoke({"query": query})

# Answer first, then each supporting document with a 200-character preview
print("ðŸ”Ž Answer:")
print(result["result"])

print("\nðŸ“„ Sources:")
for rank, source_doc in enumerate(result["source_documents"], start=1):
    meta = source_doc.metadata
    print(f"--- Source {rank} ---")
    print("File:", meta.get("source"))
    print("Page:", meta.get("page"))
    print(source_doc.page_content[:200], "...\n")

ðŸ”Ž Answer:
I don't know.

ðŸ“„ Sources:
--- Source 1 ---
File: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 1
A  direct  indicator  of  a  company's  potential  is  its  fleet  size.  However,  this  crucial  piece  of  
information
 
is
 
an
 
internal
 
metric
 
that
 
is
 
rarely
 
available
 
for
 
prospe ...

--- Source 2 ---
File: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 0
identify
 
and
 
prioritize
 
companies
 
most
 
likely
 
to
 
become
 
valuable,
 
high-volume
 
customers.
 
1.2.  The  Core  Problem:  Missing  fleet_size Data  
1 ...

--- Source 3 ---
File: data/raw/Fleet Classification Model_ Technical Report & Methodology.pdf
Page: 1
report
 
to
 
predict
 
a
 fleet_size category  (<=  10,  11-50,  >  50)  for  any  company,  using  
a
 
wide
 
array
 
of
 
firmographic
 
and
 
vehicular
 
data.
 
This
 
model
 
is
 
trained
 
on
 ...

