# Read the document (pdf) and convert to text file

In [1]:
import os
from sentence_transformers import SentenceTransformer
import chromadb
import chromadb.config


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read text file 
output_text_file = 'extracted_text.txt'
with open(output_text_file, 'r', encoding='utf-8') as f:
    extracted_text = f.read()

In [3]:
print(extracted_text)

Yellow Products
Annual Financial Statements
For the Year Ended December 31, 2024
Income Statement
Description
USD
Revenue
5,000,000
Cost of Goods Sold
(3,000,000)
Gross Profit
2,000,000
Operating Expenses
(1,300,000)
Operating Income
700,000
Interest Expense
(80,000)
Earnings Before Tax
620,000
Income Tax Expense
(155,000)
Net Income
465,000
Balance Sheet
Assets
USD
Total Assets
3,800,000
Liabilities & Equity
USD
Total Liabilities
2,050,000
Total Equity
1,750,000
Cash Flow Summary
Net Cash from Operating Activities: 615,000 USD
Net Cash Used in Investing Activities: (500,000) USD
Net Cash Used in Financing Activities: (50,000) USD



In [None]:
# split_text

def split_text(text, chunk_size=100, overlap=20):
    chunks = []
    for i in range(0, len(text), chunk_size - overlap):
        chunk = text[i:i + chunk_size]
        if chunk:
            chunks.append(chunk)
    return chunks






In [8]:
# random id for document
import uuid

chunk_documents = []
chunks = split_text(extracted_text)

for i, chunk in enumerate(chunks):
    chunk_documents.append({
        "id": str(uuid.uuid4()),
        "text": chunk
    })
print(f"Total chunks created: {len(chunk_documents)}")


Total chunks created: 7


In [9]:
print(chunk_documents)

[{'id': '8108872c-e3aa-49c9-9abd-e5939e20fc72', 'text': 'Yellow Products\nAnnual Financial Statements\nFor the Year Ended December 31, 2024\nIncome Statement\nDe'}, {'id': '65f83ccc-db8e-45b4-95f5-ad535f675468', 'text': 'nt\nDescription\nUSD\nRevenue\n5,000,000\nCost of Goods Sold\n(3,000,000)\nGross Profit\n2,000,000\nOperating'}, {'id': 'bad9d60e-9554-4cbc-a0ee-c1f05fc05fd8', 'text': 'ating Expenses\n(1,300,000)\nOperating Income\n700,000\nInterest Expense\n(80,000)\nEarnings Before Tax\n62'}, {'id': '6a1192d6-c903-4221-b17e-fd3e4f689579', 'text': 'ax\n620,000\nIncome Tax Expense\n(155,000)\nNet Income\n465,000\nBalance Sheet\nAssets\nUSD\nTotal Assets\n3,8'}, {'id': '4e30b00f-cd17-4271-8574-c7eaf34b1fa6', 'text': 's\n3,800,000\nLiabilities & Equity\nUSD\nTotal Liabilities\n2,050,000\nTotal Equity\n1,750,000\nCash Flow Su'}, {'id': '309f5bb0-d7bd-4b5c-8e4d-03125bd095eb', 'text': 'ow Summary\nNet Cash from Operating Activities: 615,000 USD\nNet Cash Used in Investing Activities: (5'},

In [None]:
#%pip install --upgrade sentence-transformers transformers huggingface-hub

In [10]:
# embedded chunks using model
eb_model = "all-MiniLM-L6-v2"
model = SentenceTransformer(eb_model)

In [11]:
texts = [chunk["text"] for chunk in chunk_documents]
ids = [chunk["id"] for chunk in chunk_documents]
embedding = model.encode(texts)

In [12]:
print(embedding)

[[-0.03126398  0.030136    0.05605014 ... -0.06703352  0.00293587
   0.0340689 ]
 [ 0.00518523 -0.00785316 -0.02014083 ... -0.04249072  0.02607788
  -0.06376328]
 [ 0.04007132  0.04000975 -0.01493552 ... -0.07254153 -0.02254399
  -0.03363599]
 ...
 [ 0.01150688 -0.05737597 -0.13078241 ... -0.07006931 -0.040231
  -0.06439541]
 [ 0.01307057 -0.01654872 -0.03494998 ... -0.10322753 -0.01550146
  -0.00877674]
 [ 0.01816797  0.00168506 -0.08731654 ... -0.07320128  0.00902623
  -0.03926052]]


In [13]:
chroma_client = chromadb.PersistentClient(path="chormadb3")

In [14]:
collection = chroma_client.get_or_create_collection(name="document_embeddings")
collection.add(
    ids=ids,
    documents=texts,
    embeddings=embedding.tolist(),
)
