In [14]:
def chunk_by_sentences(text, max_chunk_size=500):
    """
    Split text into chunks by sentences, keeping sentences intact.
    
    Args:
        text: The text to chunk
        max_chunk_size: Maximum characters per chunk
    
    Returns:
        List of text chunks
    """
    # Simple sentence splitting (split on . ! ?)
    import re
    import os
    sentences = re.split(r'(?<=[.!?])\s+', text)
    file_path  = r'C:\Users\USER\Desktop\Weekly_Task\Week_20\Day_2\Python For Dummies.pdf'
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    
    # Get file metadata
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        # Check if adding this sentence would exceed max size
        if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
            # Save current chunk and start new one
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            # Add sentence to current chunk
            current_chunk += " " + sentence if current_chunk else sentence
    
    # Don't forget the last chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Test it
sample_document = file_path = r'C:\Users\USER\Desktop\Weekly_Task\Week_20\Day_2\Python For Dummies.pdf'
chunks = chunk_by_sentences(sample_document, max_chunk_size=400)

print(f"Number of chunks: {len(chunks)}\n")
for i, chunk in enumerate(chunks, 1):
    print(f"Chunk {i} ({len(chunk)} chars):")
    print(chunk)
    print("-" * 80)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 11: invalid start byte

In [10]:
def load_and_chunk_text_file(file_path, chunk_size=500, overlap=50):
    """
    Load a text file and chunk it.
    
    Args:
        file_path: Path to the text file
        chunk_size: Characters per chunk
        overlap: Character overlap between chunks
    
    Returns:
        List of chunks with metadata
    """
    import os
    file_path = r'Week_20\Day_2\Python For Dummies.pdf'
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    
    # Get file metadata
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    
    # Chunk the text
    chunks = chunk_by_sentences(text, max_chunk_size=chunk_size)
    
    # Add metadata to each chunk
    # chunks_with_metadata = []
    # for i, chunk in enumerate(chunks):
    #     chunks_with_metadata.append({
    #         'text': chunk,
    #         'metadata': {
    #             'source': file_name,
    #             'file_path': file_path,
    #             'file_size': file_size,
    #             'chunk_index': i,
    #             'total_chunks': len(chunks)
    #         }
    #     })
    
    #return chunks_with_metadata
    return chunks

# Example usage (create a sample file first)
# sample_file_path = 'sample_document.txt'
# with open(sample_file_path, 'w', encoding='utf-8') as f:
#     f.write(sample_document)

# Load and chunk
# chunks = load_and_chunk_text_file(sample_file_path, chunk_size=400)

# print(f"Loaded and chunked: {chunks[0]['metadata']['source']}")
# print(f"Total chunks: {len(chunks)}")
# print(f"\nChunk 1:")
# print(chunks[0]['text'])

In [1]:
# Install PyPDF2 if needed
!pip install -q PyPDF2

In [32]:
def load_and_chunk_pdf(file_path, chunk_size=500):
    """
    Load a PDF file and chunk it.
    
    Args:
        file_path: Path to the PDF file
        chunk_size: Characters per chunk
    
    Returns:
        List of chunks with metadata (including page numbers)
    """
    import PyPDF2
    import os
    file_path = r'C:\Users\USER\Desktop\Weekly_Task\Week_20\Day_2\Python For Dummies.pdf' 
    chunks_with_metadata = []
    file_name = os.path.basename(file_path)
    total_chunks = []
    
    # Open PDF
    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        num_pages = len(pdf_reader.pages)
        
        # Process each page
        for page_num in range(num_pages):
            # Extract text from page
            page = pdf_reader.pages[page_num]
            text = page.extract_text()
            
            # Chunk the page text
            page_chunks = chunk_by_sentences(text, max_chunk_size=chunk_size)
            # # Add metadata to each chunk
            for chunk_idx, chunk in enumerate(page_chunks):
                total_chunks.append({
                    'text': chunk,
                    'metadata': {
                        'source': file_name,
                        'page': page_num + 1,  # 1-indexed
                        'total_pages': num_pages,
                        'chunk_on_page': chunk_idx,
                    }
                })
    
    # return chunks_with_metadata
    return total_chunks
# Example (you would use this with a real PDF file)
print("PDF loading function ready!")
print("\nUsage:")
print("chunks = load_and_chunk_pdf('your_document.pdf', chunk_size=500)")

PDF loading function ready!

Usage:
chunks = load_and_chunk_pdf('your_document.pdf', chunk_size=500)


In [34]:
#You can test the load_and_chunk function with the code below

chunks = load_and_chunk_pdf("Python For Dummies.pdf", chunk_size=500)

print(f"Total chunks: {len(chunks)}\n")

for i, c in enumerate(chunks[:5]):  # show first 5 chunks
    print(f"CHUNK {i+1}")
    print("TEXT:", c["text"][:2000])  # print first 200 chars
    print("META:", c["metadata"])
    print("-"*40)


Total chunks: 1593

CHUNK 1
TEXT: Beginning 
Programming 
with Python®
META: {'source': 'Python For Dummies.pdf', 'page': 3, 'total_pages': 411, 'chunk_on_page': 0}
----------------------------------------
CHUNK 2
TEXT: Beginning 
Programming 
with Python®
by John Paul Mueller
META: {'source': 'Python For Dummies.pdf', 'page': 5, 'total_pages': 411, 'chunk_on_page': 0}
----------------------------------------
CHUNK 3
TEXT: Beginning Programming with Python® For Dummies®
Published by: John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030-5774, www.wiley.com
Copyright © 2014 by John Wiley & Sons, Inc., Hoboken, New Jersey
Media and software compilation copyright © 2014 by John Wiley & Sons, Inc. All rights reserved.
META: {'source': 'Python For Dummies.pdf', 'page': 6, 'total_pages': 411, 'chunk_on_page': 0}
----------------------------------------
CHUNK 4
TEXT: Published simultaneously in Canada
No part of this publication may be reproduced, stored in a retrieval system or transm

In [29]:
chunks

[[],
 [],
 ['Beginning \nProgramming \nwith Python®'],
 [],
 ['Beginning \nProgramming \nwith Python®\nby John Paul Mueller'],
 ['Beginning Programming with Python® For Dummies®\nPublished by: John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030-5774, www.wiley.com\nCopyright © 2014 by John Wiley & Sons, Inc., Hoboken, New Jersey\nMedia and software compilation copyright © 2014 by John Wiley & Sons, Inc. All rights reserved.',
  'Published simultaneously in Canada\nNo part of this publication may be reproduced, stored in a retrieval system or transmitted in any form or \nby any means, electronic, mechanical, photocopying, recording, scanning or otherwise, except as permit-\nted under Sections 107 or 108 of the 1976 United States Copyright Act, without the prior written permis -\nsion of the Publisher.',
  'Requests to the Publisher for permission should be addressed to the Permissions \nDepartment, John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030, (201) 748-6011, fa

In [35]:
!pip install chromadb --quiet



In [38]:
import chromadb
client = chromadb.PersistentClient(path="./chroma_db")

In [39]:
collection = client.get_or_create_collection(
    name = "my_documents",
    metadata = {"description": "Sample document collection"} 
)

print(f"✅ Collection created: {collection.name}")
print(f"Current count: {collection.count()} documents")
print(f"📁 Data persisted to: ./chroma_db/")

✅ Collection created: my_documents
Current count: 0 documents
📁 Data persisted to: ./chroma_db/


In [48]:
documents = []
metadata = []
ids = []
for i, chunk in enumerate(chunks):
    documents.append(chunk["text"])
    metadata.append(chunk["metadata"])
    ids.append(str(i))
documents

['Beginning \nProgramming \nwith Python®',
 'Beginning \nProgramming \nwith Python®\nby John Paul Mueller',
 'Beginning Programming with Python® For Dummies®\nPublished by: John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030-5774, www.wiley.com\nCopyright © 2014 by John Wiley & Sons, Inc., Hoboken, New Jersey\nMedia and software compilation copyright © 2014 by John Wiley & Sons, Inc. All rights reserved.',
 'Published simultaneously in Canada\nNo part of this publication may be reproduced, stored in a retrieval system or transmitted in any form or \nby any means, electronic, mechanical, photocopying, recording, scanning or otherwise, except as permit-\nted under Sections 107 or 108 of the 1976 United States Copyright Act, without the prior written permis -\nsion of the Publisher.',
 'Requests to the Publisher for permission should be addressed to the Permissions \nDepartment, John Wiley & Sons, Inc., 111 River Street, Hoboken, NJ 07030, (201) 748-6011, fax (201) 748-6008, \nor

In [49]:
collection.add(
    documents=documents,
    metadatas=metadata,
    ids = ids
)

C:\Users\USER\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [02:30<00:00, 552kiB/s] 


In [47]:
ids

['<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 '<built-in function id>',
 