## Basic RAG flow

In [None]:
import pdfplumber
import pandas as pd


def process_pdf(pdf_path: str, n_pages: int = 10) -> list  :
    """
    Extracts both text and tables from a PDF. 
    Tables are converted to Markdown to preserve structure for the LLM.
    """
    chunks = []
    
    # open pdf and iterate through pages up to n_pages
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            if page_num >= n_pages:
                break
            # check for tables
            tables = page.extract_tables()
            for table in tables:
                # Filter out tiny/empty tables
                if not table or len(table) < 2: 
                    continue
                
                # Convert to Markdown (LLMs understand this best)
                # We assume first row is header. 
                try:
                    df = pd.DataFrame(table[1:], columns=table[0])
                    df = df.fillna("") 
                    markdown_table = df.to_markdown(index=False)
                    
                    chunks.append({
                        "content": f"Table on Page {page_num + 1}:\n{markdown_table}",
                        "metadata": {"page": page_num + 1, "type": "table", "source": pdf_path}
                    })
                except Exception as e:
                    print(f"Skipping malformed table on page {page_num}: {e}")

            # 2. Extract Text (simple extraction for now)
            # In a real prod system, you might mask the table areas to avoid duplication
            text = page.extract_text()
            if text:
                paragraphs = text.split('\n\n')
                for p in paragraphs:
                    # filter headers/footers
                    if len(p.strip()) > 50:  
                        chunks.append({
                            "content": p.strip(),
                            "metadata": {"page": page_num + 1, "type": "text", "source": pdf_path}
                        })
                        
    return chunks

In [1]:
from src.document_processor import chunk_pdf
res = chunk_pdf('./assets/2023_Annual_Report.pdf', n_pages=10)

Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables
Found: 0 tables
Found: [] tables


In [6]:
res

[{'content': 'Dear shareholders, colleagues, customers, and partners,\nWe are living through a time of historic challenge and opportunity. As I write this, the world faces ongoing economic, social,\nand geopolitical volatility. At the same time, we have entered a new age of AI that will fundamentally transform productivity\nfor every individual, organization, and industry on earth, and help us address some of our most pressing challenges.\nThis next generation of AI will reshape every software category and every business, including our own. Forty-eight years\nafter its founding, Microsoft remains a consequential company because time and time again—from PC/Server, to\nWeb/Internet, to Cloud/Mobile—we have adapted to technological paradigm shifts. Today, we are doing so once again, as\nwe lead this new era.\nAmid this transformation, our mission to empower every person and every organization on the planet to achieve',
  'metadata': {'chunk_id': 0,
   'page': 2,
   'type': 'text',
   'sou

In [4]:
pages = process_pdf('./assets/2023_Annual_Report.pdf')

In [10]:
print(pages[8]["content"])

STOCK PERFORMANCE
COMPARISON OF 5 YEAR CUMULATIVE TOTAL RETURN*
Among Microsoft Corporation, the S&P 500 Index
and the NASDAQ Computer Index
6/18 6/1 9 6/20 6/21 6/22 6/23
Microsoft Corporation $ 100.00 $ 138.07 $ 212.34 $ 285.40 $ 272.82 $ 365.24
S&P 500 100.00 110.42 118.70 167.13 149.39 178.66
NASDAQ Computer 100.00 106.10 156.93 236.08 184.53 242.82
* $100 invested on 6/30/18 in stock or index, including reinvestment of dividends. Fiscal year ending June 30.
9


## Chroma (with principles from quadrant)

In [2]:
import chromadb
path: str = "./chroma_db"

client = chromadb.PersistentClient(path=path)
collection = client.get_or_create_collection("test-collection")

In [10]:
print(collection.peek()["documents"][-2])

ISSUER PURCHASES OF EQUITY SECURITIES, DIVIDENDS, AND STOCK PERFORMANCE
MARKET AND STOCKHOLDERS
Our common stock is traded on the NASDAQ Stock Market under the symbol MSFT. On July 24, 2023, there were 83,883
registered holders of record of our common stock.
SHARE REPURCHASES AND DIVIDENDS
Share Repurchases
On September 18, 2019, our Board of Directors approved a share repurchase program authorizing up to $40.0 billion in
share repurchases. This share repurchase program commenced in February 2020 and was completed in November 2021.
On September 14, 2021, our Board of Directors approved a share repurchase program authorizing up to $60.0 billion in
share repurchases. This share repurchase program commenced in November 2021, following completion of the program
approved on September 18, 2019, has no expiration date, and may be terminated at any time. As of June 30, 2023,
$22.3 billion remained of this $60.0 billion share repurchase program.
We repurchased the following shares of common sto

In [25]:
ids = [f"doc_{i}_{hash(c['content'])}" for i, c in enumerate(pages)]
documents = [c['content'] for c in pages]
metadatas = [c['metadata'] for c in pages]

# chroma uses default embedding algo.
collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadatas
)

print(f"Added {len(documents)} chunks to database.")

Added 9 chunks to database.


In [20]:
collection.peek()

{'ids': ['doc_0_-5237426319515611554',
  'doc_1_7264514014932624805',
  'doc_2_-7559787089712588251',
  'doc_3_-7289827538358605559',
  'doc_4_-7023813475969023496',
  'doc_5_-373843603013203614',
  'doc_6_7027437471590386763',
  'doc_7_-3786015999162581173',
  'doc_8_6899469883038277305'],
 'embeddings': array([[-0.10622654,  0.01766581, -0.00561255, ..., -0.10036568,
         -0.0003026 , -0.02665709],
        [-0.05247169, -0.05846299,  0.00964065, ..., -0.06195053,
          0.01822844,  0.01889522],
        [-0.0974873 ,  0.03497915, -0.04942901, ..., -0.06371985,
         -0.00145037, -0.008322  ],
        ...,
        [ 0.01827986, -0.00288715,  0.01883518, ..., -0.06720383,
         -0.0598309 , -0.03341601],
        [-0.03591074, -0.1291655 ,  0.03465951, ..., -0.14265415,
         -0.04085117, -0.01238485],
        [ 0.0329219 , -0.05624714,  0.02792938, ..., -0.14079614,
          0.01611173,  0.00973737]], shape=(9, 384)),
 'documents': ['Dear shareholders, colleagues, cust

In [23]:
# query
query_text = "How many holders of MSFT were there in July 2023?"
n_results = 3

res = collection.query(
    query_texts=[query_text],
    n_results=n_results
)

In [24]:
res

{'ids': [['doc_7_-3786015999162581173',
   'doc_8_6899469883038277305',
   'doc_6_7027437471590386763']],
 'embeddings': None,
 'documents': [['ISSUER PURCHASES OF EQUITY SECURITIES, DIVIDENDS, AND STOCK PERFORMANCE\nMARKET AND STOCKHOLDERS\nOur common stock is traded on the NASDAQ Stock Market under the symbol MSFT. On July 24, 2023, there were 83,883\nregistered holders of record of our common stock.\nSHARE REPURCHASES AND DIVIDENDS\nShare Repurchases\nOn September 18, 2019, our Board of Directors approved a share repurchase program authorizing up to $40.0 billion in\nshare repurchases. This share repurchase program commenced in February 2020 and was completed in November 2021.\nOn September 14, 2021, our Board of Directors approved a share repurchase program authorizing up to $60.0 billion in\nshare repurchases. This share repurchase program commenced in November 2021, following completion of the program\napproved on September 18, 2019, has no expiration date, and may be terminated 

In [1]:
for page in pages:
    print(page["document"])

NameError: name 'pages' is not defined