### Text Splitting using langchain_text_splitters
https://docs.langchain.com/oss/python/integrations/splitters

In [1]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the sample PDF used throughout this notebook.
PDF_PATH = "samplePdf.pdf"

# Load the PDF into LangChain Document objects; the displayed result shows each
# Document carrying page-level metadata ('page', 'total_pages', ...) alongside
# the extracted page text.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()
docs

[Document(metadata={'producer': 'Skia/PDF m118', 'creator': 'Chromium', 'creationdate': '2025-12-04T07:07:18+00:00', 'moddate': '2025-12-04T07:07:18+00:00', 'source': 'samplePdf.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content="Billable152\x0000hNon Billable80\x0000hTotal232\x0000h\nYasref SCH\nImplementation\n04\x0000 DJDennis Joshua - 5000059 Billable\nApproved\nconfiguring and fixing bugs in pages,\nbv's, and ruleset's required for\nvendor registration and modification\nprocess.f\nDJDennis Joshua\nYasref SCH\nImplementation\n04\x0000 DJDennis Joshua - 5000059 Billable\nApproved\nconfiguring and fixing bugs in pages,\nbv's, and ruleset's required for\nvendor registration and modification\nprocess.\nDJDennis Joshua\nHolidays and\nLeave [Only]\n08\x0000 DJDennis Joshua - 5000059 Non\nBillable Approved satuday. DJDennis Joshua\nHolidays and\nLeave [Only]\n08\x0000 DJDennis Joshua - 5000059 Non\nBillable Approved Friday. DJDennis Joshua\nYasref SCH\nImplementation\n08\

##### Recursively split text by characters

In [2]:
# Recursive character splitter applied to the Documents loaded from the PDF.

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# `docs` already holds Document objects, so split_documents() is the right
# entry point (create_documents() is the one that takes plain strings).
texts = text_splitter.split_documents(docs)
texts

[Document(metadata={'producer': 'Skia/PDF m118', 'creator': 'Chromium', 'creationdate': '2025-12-04T07:07:18+00:00', 'moddate': '2025-12-04T07:07:18+00:00', 'source': 'samplePdf.pdf', 'total_pages': 5, 'page': 0, 'page_label': '1'}, page_content="Billable152\x0000hNon Billable80\x0000hTotal232\x0000h\nYasref SCH\nImplementation\n04\x0000 DJDennis Joshua - 5000059 Billable\nApproved\nconfiguring and fixing bugs in pages,\nbv's, and ruleset's required for\nvendor registration and modification\nprocess.f\nDJDennis Joshua\nYasref SCH\nImplementation\n04\x0000 DJDennis Joshua - 5000059 Billable\nApproved\nconfiguring and fixing bugs in pages,\nbv's, and ruleset's required for\nvendor registration and modification\nprocess.\nDJDennis Joshua\nHolidays and\nLeave [Only]"),
 Document(metadata={'producer': 'Skia/PDF m118', 'creator': 'Chromium', 'creationdate': '2025-12-04T07:07:18+00:00', 'moddate': '2025-12-04T07:07:18+00:00', 'source': 'samplePdf.pdf', 'total_pages': 5, 'page': 0, 'page_label

In [3]:
# Read speech.txt into a single string.
# The encoding is passed explicitly so the file decodes the same way on every
# machine instead of depending on the platform/locale default encoding.
with open("speech.txt", encoding="utf-8") as f:
    content = f.read()

content

'Freedom was not gifted; it was earned through courage and sacrifice.\nCountless voices rose together to demand dignity and self-rule.\nEvery step toward independence carried the weight of hope and loss.\nThe struggle taught us unity beyond language, region, or belief.\nIndependence is not just a date, but a responsibility we carry daily.\nIt reminds us to protect justice, equality, and truth.\nThe past whispers lessons of resilience and bravery.\nThe present asks us to build with integrity and compassion.\nThe future depends on how wisely we use our freedom today.\nIndependence lives on when we choose progress over fear.\n'

In [4]:
# Deliberately tiny chunk size, purely for demonstration: it makes the chunk
# boundaries and the overlap between neighbouring chunks easy to see.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=20)

# `content` is a plain string rather than a Document, so it is wrapped in a
# list and passed to create_documents().
texts = text_splitter.create_documents([content])
texts

[Document(metadata={}, page_content='Freedom was not gifted; it was earned through'),
 Document(metadata={}, page_content='was earned through courage and sacrifice.'),
 Document(metadata={}, page_content='Countless voices rose together to demand dignity'),
 Document(metadata={}, page_content='to demand dignity and self-rule.'),
 Document(metadata={}, page_content='Every step toward independence carried the weight'),
 Document(metadata={}, page_content='carried the weight of hope and loss.'),
 Document(metadata={}, page_content='The struggle taught us unity beyond language,'),
 Document(metadata={}, page_content='beyond language, region, or belief.'),
 Document(metadata={}, page_content='Independence is not just a date, but a'),
 Document(metadata={}, page_content='just a date, but a responsibility we carry daily.'),
 Document(metadata={}, page_content='It reminds us to protect justice, equality, and'),
 Document(metadata={}, page_content='equality, and truth.'),
 Document(metadata={}, 

In [5]:
# Print the first two chunks, one per line, to compare where one chunk ends
# and the next begins.
for idx in (0, 1):
    print(texts[idx])

page_content='Freedom was not gifted; it was earned through'
page_content='was earned through courage and sacrifice.'
