# Document loaders

We will examine how to load and split popular file formats. The purpose of is to insert these documents into a vector database for semantic searches.

In [1]:
# Download the NLTK data
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /home/cmlee/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/cmlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
# Imports
import json, random
from langchain_community.document_loaders import UnstructuredEPubLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, JSONLoader, UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredEmailLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter


# Create a text splitter

The purpose of splitting the text is to enable more precise search results by ensuring that each smaller segment captures relevant context within a larger document. When these text fragrments are used as context in prompts, they will not exceed the context window.

In [3]:
# TODO: Create a text splitter
chunk_size = 300
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


## Loading and splitting documents

Documents can be loaded and split either as a single or as elements eg pages, individual slides, spreadsheet, etc. Set the `mode` arguemnt to
- `single` - load the entire document as a single object default for most loaders
- `multi`, `elements` - split the document into their respective elements eg pages, slides, etc.

In [4]:
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [5]:
# TODO: Word doucment
# load the document
word_loader = Docx2txtLoader('./docs/SST RL Python Setup.docx')

# chunk it with the text splitter
chunks = word_loader.load_and_split(text_splitter)

In [9]:
print(len(chunks))

print_chunk_info(chunks)

12
No of chunks: 12
Chunk index: 3
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/SST RL Python Setup.docx'})
	2 = ('page_content', 'On your Mac open a terminal.\n\n\n\nCreate a new environment\n\nconda create --name myenv \n\n\n\nwhere myenv is the name of your environment. Feel free to use any name. You can create multiple environment\n\n\n\nNote: You can confirm the newly created environment, by listing all envs\n\nconda env list')
	3 = ('type', 'Document')


In [None]:
# TODO: Excel doucment

# Load as a single document
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode="single")
chunks = xls_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)




322
No of chunks: 322
Chunk index: 277
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/Financial Sample.xlsx'})
	2 = ('page_content', '2014 Small Business Mexico Velo High 2605 120 300 781500 101595 679905 651250 28655 2013-11-01 00:00:00 11 November 2013 Channel Partners Germany Velo High 1013 120 12 12156 1580.28 10575.72 3039 7536.72 2014-12-01 00:00:00 12 December 2014 Enterprise Canada VTT High 1583 250 125 197875 25723.75')
	3 = ('type', 'Document')


In [11]:
# "text_as_html" key in the document metadata.
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode="elements")
chunks = xls_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)

322
No of chunks: 322
Chunk index: 224
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/Financial Sample.xlsx', 'file_directory': './docs', 'filename': 'Financial Sample.xlsx', 'last_modified': '2025-02-28T11:53:21', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table><tr><td>Segment</td><td>Country</td><td>Product</td><td>Discount Band</td><td>Units Sold</td><td>Manufacturing Price</td><td>Sale Price</td><td>Gross Sales</td><td>Discounts</td><td>Sales</td><td>COGS</td><td>Profit</td><td>Date</td><td>Month Number</td><td>Month Name</td><td>Year</td></tr><tr><td>Government</td><td>Canada</td><td>Carretera</td><td/><td>1618.5</td><td>3</td><td>20</td><td>32370</td><td>0</td><td>32370</td><td>16185</td><td>16185</td><td>2014-01-01 00:00:00</td><td>1</td><td>January</td><td>2014</td></tr><tr><td>Government</td><td>Germany</td><td>Carretera</td><td/><td>1321</td><td>3</td><td>20</td><td>26420</td><td>0</td><td>26420</td><td>13210</td><td>13210</td><td>2014-0

In [None]:
# TODO: Load Powerpoint



In [None]:
# TODO: Load Email



In [12]:
# TODO: Load PDF
pdf_loader = PyPDFLoader(file_path='./docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf')
chunks = pdf_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)

410
No of chunks: 410
Chunk index: 273
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2022-07-15T19:22:48-05:00', 'moddate': '2022-07-17T10:32:28-04:00', 'title': 'The Path to GitOps', 'source': './docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf', 'total_pages': 45, 'page': 29, 'page_label': '30'})
	2 = ('page_content', 'organization, so it is not a hard and fast rule, but it’s something to keep in mind and \ncommunicate about.')
	3 = ('type', 'Document')


In [None]:
# TODO: Load EPUB



## Processing structured document/JSON

In [15]:
# TODO: Create a JSON splitter
json_splitter = RecursiveJsonSplitter(max_chunk_size=200)


In [19]:
# TODO: Process a single JSON object 
# REad in the JSON document with standard python
with open('./docs/single.json') as f:
   json_data = json.load(f)

chunks = json_splitter.split_json(json_data)
print(len(chunks))
print_chunk_info(chunks)
print(chunks)

3
No of chunks: 3
Chunk index: 0
Chunk details
	0 = web-app
[{'web-app': {'servlet': [{'servlet-name': 'cofaxCDS', 'servlet-class': 'org.cofax.cds.CDSServlet', 'init-param': {'configGlossary:installationAt': 'Philadelphia, PA', 'configGlossary:adminEmail': 'ksm@pobox.com', 'configGlossary:poweredBy': 'Cofax', 'configGlossary:poweredByIcon': '/images/cofax.gif', 'configGlossary:staticPath': '/content/static', 'templateProcessorClass': 'org.cofax.WysiwygTemplate', 'templateLoaderClass': 'org.cofax.FilesTemplateLoader', 'templatePath': 'templates', 'templateOverridePath': '', 'defaultListTemplate': 'listTemplate.htm', 'defaultFileTemplate': 'articleTemplate.htm', 'useJSP': False, 'jspListTemplate': 'listTemplate.jsp', 'jspFileTemplate': 'articleTemplate.jsp', 'cachePackageTagsTrack': 200, 'cachePackageTagsStore': 200, 'cachePackageTagsRefresh': 60, 'cacheTemplatesTrack': 100, 'cacheTemplatesStore': 50, 'cacheTemplatesRefresh': 15, 'cachePagesTrack': 200, 'cachePagesStore': 100, 'cachePage

In [22]:
# TODO: Process an array of JSON objects, return chunks as Dictionary

with open('./docs/tv-shows.json') as f:
   json_data = json.load(f)

chunks = json_splitter.create_documents(texts=json_data)
print(chunks)
print(len(chunks))

1726


In [23]:
# TODO: Process an array of JSON objects, return chunks as Document

json_loader = JSONLoader(
   file_path = "./docs/tv-shows.json",
   jq_schema=".[].summary",
   text_content=True
)

chunks = json_loader.load()

print(len(chunks))
print_chunk_info(chunks)



240
No of chunks: 240
Chunk index: 25
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/opt/tmp/ellm_2025_oct13/day03/docs/tv-shows.json', 'seq_num': 26})
	2 = ('page_content', '<p><b>Berserk </b>is a 25-part anime set in a dark fantasy/horror environment whereby the series focuses on the main character guts; a lone swordman who later meets up with a group of mercenaries called the band of the hawk. The leader of this band holds a strange necklace called a behelit that will only lead to evil.</p>')
	3 = ('type', 'Document')


In [None]:
# TODO: Extract specific attributes from the JSON document, use JSON path to define which element

