In [1]:
!pip install PyPDF2 chromadb

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp313-cp313-win_amd64.whl.metadata (9.1 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp313-cp313-win_amd64.whl.metadata (5.3 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.39.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.0-py3-none-any.whl.metadata (2.5 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sd


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from PyPDF2 import PdfReader
import chromadb
import re

In [4]:
def chunk_sentences(text, pdf_name, max_chunk_size = 500):
    """Split ``text`` into sentence-aligned chunks of bounded length.

    Sentences are found by splitting on whitespace that follows ``.``, ``!``
    or ``?``. Consecutive sentences are joined with a single space for as
    long as the joined chunk stays within ``max_chunk_size`` characters.
    A single sentence longer than ``max_chunk_size`` is never cut; it becomes
    an oversized chunk of its own.

    Args:
        text: Raw text to chunk. ``None``, an empty string, or whitespace-only
            text yields no chunks.
        pdf_name: Label stored under the ``"Source"`` key of every chunk's
            metadata.
        max_chunk_size: Maximum chunk length in characters, including the
            spaces used to join sentences.

    Returns:
        dict with two parallel lists:
            ``"text"``: the chunk strings, stripped of surrounding whitespace.
            ``"metadata"``: one dict per chunk with keys ``"Source"``,
            ``"chunk_size"`` (number of whitespace-separated words) and
            ``"character count"`` (length of the stored chunk string).
    """
    chunks = {"text": [], "metadata": []}
    if not text:
        return chunks

    def _flush(raw_chunk):
        # Record one chunk. Metadata is measured on the stripped text so it
        # always describes exactly the string that is stored.
        cleaned = raw_chunk.strip()
        if not cleaned:
            # Whitespace-only input must not produce an empty chunk.
            return
        chunks["text"].append(cleaned)
        chunks["metadata"].append({"Source": pdf_name,
                                   "chunk_size": len(cleaned.split()),
                                   "character count": len(cleaned)})

    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            # re.split yields "" when the text ends with punctuation followed
            # by whitespace; there is nothing to add.
            continue
        # The "+ 1" accounts for the space inserted when joining, so a joined
        # chunk can never exceed max_chunk_size.
        if current_chunk and len(current_chunk) + 1 + len(sentence) > max_chunk_size:
            _flush(current_chunk)
            current_chunk = sentence
        elif current_chunk:
            current_chunk += " " + sentence
        else:
            current_chunk = sentence

    # Emit whatever is left in the buffer after the last sentence.
    _flush(current_chunk)
    return chunks





def retrieve_documents(pdf_path, pdf_name, max_chunk_size=500):
    """Read a PDF page by page and collect sentence chunks with metadata.

    Each page's extracted text is chunked on its own with
    ``chunk_sentences``, so no chunk spans a page boundary.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.
        pdf_name: Label forwarded to ``chunk_sentences`` for the chunk
            metadata.
        max_chunk_size: Per-chunk size limit forwarded to
            ``chunk_sentences``. Defaults to 500, the same default
            ``chunk_sentences`` uses.

    Returns:
        dict with two parallel lists, in page order:
            ``"Documents"``: the chunk strings.
            ``"metadata"``: the metadata dict produced for each chunk.
    """
    reader = PdfReader(pdf_path)
    chunk_with_metadata = {"Documents": [], "metadata": []}
    for page in reader.pages:
        # Defensive: a page with no extractable text is treated as empty
        # rather than passing a falsy non-string value to the chunker.
        text = page.extract_text() or ""

        # Chunk this page and append its chunks and metadata in lockstep.
        chunks = chunk_sentences(text, pdf_name, max_chunk_size=max_chunk_size)
        chunk_with_metadata["Documents"].extend(chunks["text"])
        chunk_with_metadata["metadata"].extend(chunks["metadata"])
    return chunk_with_metadata
        





In [5]:
# Run the pipeline on the local PDF with a 400-character chunk limit; the
# returned dict is this cell's displayed output.
retrieve_documents(
    pdf_path="Python For Dummies.pdf",
    pdf_name="Python for Dummies",
    max_chunk_size=400,
)

{'Documents': ['Beginning \nProgramming \nwith Python®',
  'Beginning \nProgramming \nwith Python®\nby John Paul Mueller',
  'Beginning Programming with Python® For Dummies®\nPublished by: John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030-5774, www.wiley.com\nCopyright © 2014 by John Wiley & Sons, Inc., Hoboken, New Jersey\nMedia and software compilation copyright © 2014 by John Wiley & Sons, Inc. All rights reserved.',
  'Published simultaneously in Canada\nNo part of this publication may be reproduced, stored in a retrieval system or transmitted in any form or \nby any means, electronic, mechanical, photocopying, recording, scanning or otherwise, except as permit-\nted under Sections 107 or 108 of the 1976 United States Copyright Act, without the prior written permis -\nsion of the Publisher.',
  'Requests to the Publisher for permission should be addressed to the Permissions \nDepartment, John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030, (201) 748-6011, fax (2