# Document loaders

We will examine how to load and split popular file formats. The purpose of is to insert these documents into a vector database for semantic searches.

In [1]:
# Download the NLTK data
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /home/cmlee/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/cmlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
# Imports
import json, random
from langchain_community.document_loaders import UnstructuredEPubLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, JSONLoader, UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredEmailLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter


# Create a text splitter

The purpose of splitting the text is to enable more precise search results by ensuring that each smaller segment captures relevant context within a larger document. When these text fragrments are used as context in prompts, they will not exceed the context window.

In [None]:
# TODO: Create a text splitter
chunk_size = 1024
chunk_overlap = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

## Loading and splitting documents

Documents can be loaded and split either as a single or as elements eg pages, individual slides, spreadsheet, etc. Set the `mode` arguemnt to
- `single` - load the entire document as a single object default for most loaders
- `multi`, `elements` - split the document into their respective elements eg pages, slides, etc.

In [4]:
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [5]:
# TODO: Word doucment
# create a loader
word_loader = Docx2txtLoader(file_path="./docs/SST RL Python Setup.docx")

# split the document
chunks = word_loader.load_and_split(text_splitter)

# display chunk info
print_chunk_info(chunks)

No of chunks: 4
Chunk index: 1
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/SST RL Python Setup.docx'})
	2 = ('page_content', 'Note: You can confirm the newly created environment, by listing all envs\n\nconda env list\n\n\n\nActivate your environment\n\nOnce you have created your environment, you will need to activate it\n\n\tconda activate myenv\n\n\n\nYour command prompt will now include the environment’s name.\n\n\n\nExit from your environment\n\nTo exit from your Conda environment type the following command\n\n\tconda deactivate \n\nInstalling packages into your environment\n\nYou will now need to install Python packages into your environment. Before installation, activate your environment; this step is important!. \n\n\n\nThese are the list of packages will will be installing into the environment\n\nnumpy\n\nmatplotlib\n\ntqdm\n\ngym\n\npyglet\n\n\n\nOn your terminal, type the following command (activate your environment before installing these packages)\n\

In [11]:
!pip install msoffcrypto-tool

Collecting msoffcrypto-tool
  Downloading msoffcrypto_tool-5.4.2-py3-none-any.whl.metadata (10 kB)
Downloading msoffcrypto_tool-5.4.2-py3-none-any.whl (48 kB)
Installing collected packages: msoffcrypto-tool
Successfully installed msoffcrypto-tool-5.4.2


In [13]:
# TODO: Excel doucment
# create a loader
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode="single")

# Load as a single document
chunks = xls_loader.load_and_split(text_splitter)
print_chunk_info(chunks)

# "text_as_html" key in the document metadata.
xls_loader = UnstructuredExcelLoader('./docs/Financial Sample.xlsx', mode="elements")

# Load as a single document
chunks = xls_loader.load_and_split(text_splitter)
print_chunk_info(chunks)


No of chunks: 83
Chunk index: 77
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': './docs/Financial Sample.xlsx'})
	2 = ('page_content', 'VTT High 2541 250 300 762300 106722 655578 635250 20328 2014-08-01 00:00:00 8 August 2014 Small Business Canada VTT High 269 250 300 80700 11298 69402 67250 2152 2013-10-01 00:00:00 10 October 2013 Small Business Canada VTT High 1496 250 300 448800 62832 385968 374000 11968 2014-10-01 00:00:00 10 October 2014 Small Business United States of America VTT High 1010 250 300 303000 42420 260580 252500 8080 2014-10-01 00:00:00 10 October 2014 Government France VTT High 1281 250 350 448350 62769 385581 333060 52521 2013-12-01 00:00:00 12 December 2013 Small Business Canada Amarilla High 888 260 300 266400 37296 229104 222000 7104 2014-03-01 00:00:00 3 March 2014 Enterprise United States of America Amarilla High 2844 260 125 355500 49770 305730 341280 -35550 2014-05-01 00:00:00 5 May 2014 Channel Partners France Amarilla High 2475 260 12 29700 41

In [None]:
# TODO: Load Powerpoint



In [None]:
# TODO: Load Email



In [15]:
# TODO: Load PDF
pdf_loader = PyPDFLoader(file_path="./docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf", extract_images=True)
chunks = pdf_loader.load_and_split(text_splitter)
print(chunks)


[Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2022-07-15T19:22:48-05:00', 'moddate': '2022-07-17T10:32:28-04:00', 'title': 'The Path to GitOps', 'source': './docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf', 'total_pages': 45, 'page': 1, 'page_label': '2'}, page_content='The Path to GitOps | 2\nContents\n05 Foreword\n07 Introduction\n08 Chapter 1–What is GitOps?\n  Origins in DevOps \n   Kubernetes and containers \n   Cloud-Native DevOps \n   A DevOps Operating Model\n  GitOps Principles \n   Declarative \n   Versioned and Immutable \n   Pulled Automatically \n   Continuously Reconciled\n  GitOps and CI/CD \n\t \t \t Traditional\tCI/CD\tWorkflows \n\t \t \t Where\tdoes\tGitOps\tfit\tin? \n   Operations via Pull Request\n  Summary\n14 Chapter 2–T ools of the Trade\n  Infrastructure as Code \n   History of Infrastructure as Code \n    Challenges of Infrastructure as Code \n   Containers Change the Game\n  Argo C

In [None]:
# TODO: Load EPUB



## Processing structured document/JSON

In [17]:
# TODO: Create a JSON splitter
# load the JSON document
with open('./docs/single.json') as f:
   json_data = json.load(f)

# create the JSON splitter
json_splitter = RecursiveJsonSplitter(max_chunk_size=64)




In [19]:
# chunk it
chunks =  json_splitter.split_json(json_data=json_data, convert_lists=True)

print_chunk_info(chunks)

No of chunks: 74
Chunk index: 45
Chunk details
	0 = web-app


In [None]:
# TODO: Process a single JSON object 
# load the JSON document
with open('./docs/tv-shows.json') as f:
   json_data = json.load(f)


In [21]:
# TODO: Process an array of JSON objects, return chunks as Dictionary

with open('./docs/tv-shows.json') as f:
   json_data = json.load(f)

# create the JSON splitter
json_splitter = RecursiveJsonSplitter(max_chunk_size=64)

In [None]:
# use create_documents instead of split_json
chunks = json_splitter.create_documents(texts=json_data)
print_chunk_info(chunks)

No of chunks: 3690
Chunk index: 1764
Chunk details
	0 = ('id', None)
	1 = ('metadata', {})
	2 = ('page_content', '{"network": {"name": "FOX", "country": {"name": "United States"}}}')
	3 = ('type', 'Document')


In [None]:
# TODO: Extract specific attributes from the JSON document, use JSON path to define which element
json_loader = JSONLoader(
   file_path="./docs/tv-shows.json",
   jq_schema=".[].summary",
   text_content=True      
)

chunks = json_loader.load()

print_chunk_info(chunks)

No of chunks: 240
Chunk index: 164
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/opt/tmp/ellm-2025-aug11/day03/docs/tv-shows.json', 'seq_num': 165})
	2 = ('page_content', "<p><b>Suits</b> delves into the fast-paced, high-stakes world of a top Manhattan corporate law firm where hotshot associate Harvey Specter makes a risky move by hiring Mike Ross a brilliant but unmotivated college dropout, as his associate. As he becomes enmeshed in this unfamiliar world, Mike relies heavily on the firm's best paralegal Rachel Zane and Harvey's no-nonsense assistant Donna Paulsen to help him serve justice. With a photographic memory and the street smarts of a hustler, Mike proves to be a legal prodigy despite the absence of bonafide legal credentials.</p>")
	3 = ('type', 'Document')
