# Document loaders

We will examine how to load and split popular file formats. The purpose of is to insert these documents into a vector database for semantic searches.

In [1]:
# Download the NLTK data
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /home/cmlee/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/cmlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
# Imports
import json, random
from langchain_community.document_loaders import UnstructuredEPubLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, JSONLoader, UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredEmailLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter


# Create a text splitter

The purpose of splitting the text is to enable more precise search results by ensuring that each smaller segment captures relevant context within a larger document. When these text fragrments are used as context in prompts, they will not exceed the context window.

In [3]:
# TODO: Create a text splitter
chunk_size = 300
chunk_overlap = 30

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)


## Loading and splitting documents

Documents can be loaded and split either as a single or as elements eg pages, individual slides, spreadsheet, etc. Set the `mode` arguemnt to
- `single` - load the entire document as a single object default for most loaders
- `multi`, `elements` - split the document into their respective elements eg pages, slides, etc.

In [4]:
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [6]:
# TODO: Word doucment
word_loader = Docx2txtLoader('./docs/SST RL Python Setup.docx')
chunks = word_loader.load_and_split(text_splitter)

In [11]:
print(len(chunks))

print_chunk_info(chunks)

12
No of chunks: 12
Chunk index: 4
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/SST RL Python Setup.docx'})
	2 = ('page_content', 'Activate your environment\n\nOnce you have created your environment, you will need to activate it\n\n\tconda activate myenv\n\n\n\nYour command prompt will now include the environment’s name.\n\n\n\nExit from your environment\n\nTo exit from your Conda environment type the following command\n\n\tconda deactivate')
	3 = ('type', 'Document')


In [16]:
# TODO: Excel doucment

# Load as a single document
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode='single')
chunks = xls_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)

# "text_as_html" key in the document metadata.
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode='elements')
chunks = xls_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)


298
No of chunks: 298
Chunk index: 41
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/Financial Sample.xlsx'})
	2 = ('page_content', 'Carretera Low 2145 3 7 15015 300.3 14714.7 10725 3989.7 2013-11-01 00:00:00 11 November 2013 Government Canada Carretera Low 2852 3 350 998200 19964 978236 741520 236716 2014-12-01 00:00:00 12 December 2014 Channel Partners United States of America Montana Low 1142 5 12 13704 274.08 13429.92 3426')
	3 = ('type', 'Document')
298
No of chunks: 298
Chunk index: 220
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/Financial Sample.xlsx', 'file_directory': './docs', 'filename': 'Financial Sample.xlsx', 'last_modified': '2025-02-28T11:53:21', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table><tr><td>Segment</td><td>Country</td><td>Product</td><td>Discount Band</td><td>Units Sold</td><td>Manufacturing Price</td><td>Sale Price</td><td>Gross Sales</td><td>Discounts</td><td>Sales</td><td>COGS</td><td>Profit</

In [None]:
# TODO: Load Powerpoint



In [None]:
# TODO: Load Email



In [17]:
# TODO: Load PDF
pdf_loader = PyPDFLoader(file_path='./docs/the-happy-prince-lesson-plans.pdf', extract_images=False)
chunks = pdf_loader.load_and_split(text_splitter)
print(len(chunks))
print_chunk_info(chunks)


9
No of chunks: 9
Chunk index: 3
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'Mac OS X 10.5.7 Quartz PDFContext', 'creator': 'Microsoft Word', 'creationdate': "D:20090817144440Z00'00'", 'title': 'Microsoft Word - Speakaboos_Happy Prince_Story Guide.doc', 'author': 'Yannai Segal', 'moddate': "D:20090817144440Z00'00'", 'source': './docs/the-happy-prince-lesson-plans.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'})
	2 = ('page_content', 'discussion questions •   What does the word “compassion” mean?  How did the Happy Prince show compassion? •   Why did the swallow decide not to go south with the other birds?  Do you think this was a wise decision? •  What made the Happy Prince and the swallow such good friends? •   Why do you')
	3 = ('type', 'Document')


In [None]:
# TODO: Load EPUB



## Processing structured document/JSON

In [18]:
# TODO: Create a JSON splitter
json_splitter = RecursiveJsonSplitter(max_chunk_size=128)



In [19]:
# TODO: Process a single JSON object 
with open('./docs/single.json') as f:
   json_data = json.load(f)

print(json_data)


{'web-app': {'servlet': [{'servlet-name': 'cofaxCDS', 'servlet-class': 'org.cofax.cds.CDSServlet', 'init-param': {'configGlossary:installationAt': 'Philadelphia, PA', 'configGlossary:adminEmail': 'ksm@pobox.com', 'configGlossary:poweredBy': 'Cofax', 'configGlossary:poweredByIcon': '/images/cofax.gif', 'configGlossary:staticPath': '/content/static', 'templateProcessorClass': 'org.cofax.WysiwygTemplate', 'templateLoaderClass': 'org.cofax.FilesTemplateLoader', 'templatePath': 'templates', 'templateOverridePath': '', 'defaultListTemplate': 'listTemplate.htm', 'defaultFileTemplate': 'articleTemplate.htm', 'useJSP': False, 'jspListTemplate': 'listTemplate.jsp', 'jspFileTemplate': 'articleTemplate.jsp', 'cachePackageTagsTrack': 200, 'cachePackageTagsStore': 200, 'cachePackageTagsRefresh': 60, 'cacheTemplatesTrack': 100, 'cacheTemplatesStore': 50, 'cacheTemplatesRefresh': 15, 'cachePagesTrack': 200, 'cachePagesStore': 100, 'cachePagesRefresh': 10, 'cachePagesDirtyRead': 10, 'searchEngineListTe

In [23]:
chunks = json_splitter.split_json(json_data)
print(len(chunks))
print_chunk_info(chunks)
print(chunks)

4
No of chunks: 4
Chunk index: 2
Chunk details
	0 = web-app
[{'web-app': {'servlet': [{'servlet-name': 'cofaxCDS', 'servlet-class': 'org.cofax.cds.CDSServlet', 'init-param': {'configGlossary:installationAt': 'Philadelphia, PA', 'configGlossary:adminEmail': 'ksm@pobox.com', 'configGlossary:poweredBy': 'Cofax', 'configGlossary:poweredByIcon': '/images/cofax.gif', 'configGlossary:staticPath': '/content/static', 'templateProcessorClass': 'org.cofax.WysiwygTemplate', 'templateLoaderClass': 'org.cofax.FilesTemplateLoader', 'templatePath': 'templates', 'templateOverridePath': '', 'defaultListTemplate': 'listTemplate.htm', 'defaultFileTemplate': 'articleTemplate.htm', 'useJSP': False, 'jspListTemplate': 'listTemplate.jsp', 'jspFileTemplate': 'articleTemplate.jsp', 'cachePackageTagsTrack': 200, 'cachePackageTagsStore': 200, 'cachePackageTagsRefresh': 60, 'cacheTemplatesTrack': 100, 'cacheTemplatesStore': 50, 'cacheTemplatesRefresh': 15, 'cachePagesTrack': 200, 'cachePagesStore': 100, 'cachePage

In [26]:
# TODO: Process an array of JSON objects, return chunks as Dictionary

with open('./docs/tv-shows.json') as f:
   json_data = json.load(f)

chunks = json_splitter.split_json(json_data=json_data, convert_lists=True)
print(chunks)



In [27]:
# TODO: Process an array of JSON objects, return chunks as Document

json_loader = JSONLoader(
   file_path="./docs/tv-shows.json",
   # jq 
   jq_schema='.[].summary',
   text_content=True
)

chunks = json_loader.load()

print(chunks)
print_chunk_info(chunks)




No of chunks: 240
Chunk index: 211
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/opt/tmp/ellm-2025-jul14/day03/docs/tv-shows.json', 'seq_num': 212})
	2 = ('page_content', "<p>The series taps into a fictional unit mandated by the International Criminal Court (ICC) to investigate cross-border crimes and ultimately bring global criminals to justice. <b>Crossing Lines</b> is set in the world's most exotic locales, where an elite team of eager cops work to solve the most notorious international crimes.</p>")
	3 = ('type', 'Document')


In [None]:
# TODO: Extract specific attributes from the JSON document, use JSON path to define which element

