# Document loaders

We will examine how to load and split popular file formats. The purpose of is to insert these documents into a vector database for semantic searches.

In [2]:
# Download the NLTK data
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
# Imports
import json, random
from langchain_community.document_loaders import UnstructuredEPubLoader, UnstructuredExcelLoader, UnstructuredPowerPointLoader
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, JSONLoader, UnstructuredXMLLoader
from langchain_community.document_loaders import UnstructuredEmailLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter, RecursiveJsonSplitter


# Create a text splitter

The purpose of splitting the text is to enable more precise search results by ensuring that each smaller segment captures relevant context within a larger document. When these text fragrments are used as context in prompts, they will not exceed the context window.

In [4]:
# TODO: Create a text splitter
chunk_size = 300
chunk_overlap = 20

text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

## Loading and splitting documents

Documents can be loaded and split either as a single or as elements eg pages, individual slides, spreadsheet, etc. Set the `mode` arguemnt to
- `single` - load the entire document as a single object default for most loaders
- `multi`, `elements` - split the document into their respective elements eg pages, slides, etc.

In [5]:
def print_chunk_info(chunks):
   print(f'No of chunks: {len(chunks)}')
   idx = random.randrange(0, len(chunks))
   print(f'Chunk index: {idx}')
   print('Chunk details')
   for k, v in enumerate(chunks[idx]):
      print(f'\t{k} = {v}')

In [6]:
# TODO: Word doucment
# Load the document
word_loader = Docx2txtLoader(file_path="/content/docs/SST RL Python Setup.docx")

# chunk the doc
chunks = word_loader.load_and_split(text_splitter)

print_chunk_info(chunks)

No of chunks: 12
Chunk index: 8
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/content/docs/SST RL Python Setup.docx'})
	2 = ('page_content', 'After you have installed VSC, launch it. Next, we need to install the Python Extension for VSC. Open the extension panel by clicking on this icon  on the menu along VSCâ€™s left window edge. Type Python on the extension window and select the Microsoft Python Extension. See the following image')
	3 = ('type', 'Document')


In [9]:
# TODO: Excel doucment

# Load as a single document
xls_loader = UnstructuredExcelLoader(file_path="/content/docs/Financial Sample.xlsx", mode="single")
chunks = xls_loader.load_and_split(text_splitter)

print_chunk_info(chunks)


# "text_as_html" key in the document metadata.
xls_loader = UnstructuredExcelLoader(file_path="/content/docs/Financial Sample.xlsx", mode="elements")
chunks = xls_loader.load_and_split(text_splitter)

print('============ element')
print_chunk_info(chunks)


No of chunks: 287
Chunk index: 161
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/content/docs/Financial Sample.xlsx'})
	2 = ('page_content', '2014 Government United States of America Amarilla Medium 2907 260 7 20349 1627.92 18721.08 14535 4186.08 2014-06-01 00:00:00 6 June 2014 Government Germany Amarilla Medium 1366 260 20 27320 2185.6 25134.4 13660 11474.4 2014-06-01 00:00:00 6 June 2014 Small Business Mexico Amarilla Medium 2460 260')
	3 = ('type', 'Document')
No of chunks: 287
Chunk index: 62
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/content/docs/Financial Sample.xlsx', 'file_directory': '/content/docs', 'filename': 'Financial Sample.xlsx', 'last_modified': '2025-02-28T03:53:21', 'page_name': 'Sheet1', 'page_number': 1, 'text_as_html': '<table><tr><td>Segment</td><td>Country</td><td>Product</td><td>Discount Band</td><td>Units Sold</td><td>Manufacturing Price</td><td>Sale Price</td><td>Gross Sales</td><td>Discounts</td><td>Sales</td><td>COGS</td

In [None]:
# TODO: Load Powerpoint



In [None]:
# TODO: Load Email



In [10]:
# TODO: Load PDF
pdf_loader = PyPDFLoader(file_path="/content/docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf")

chunks = pdf_loader.load_and_split(text_splitter)

print_chunk_info(chunks)

No of chunks: 404
Chunk index: 398
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2022-07-15T19:22:48-05:00', 'moddate': '2022-07-17T10:32:28-04:00', 'title': 'The Path to GitOps', 'source': '/content/docs/Path-to-GitOps-Red-Hat-Developer-e-book.pdf', 'total_pages': 45, 'page': 43, 'page_label': '44'})
	2 = ('page_content', 'References\n[8.1]  https:/ /www.redhat.com/ en/technologies/management/advanced-cluster-\nmanagement\n[8.2] https:/ / open-cluster-management.io/\n[8.3] https:/ / developers.redhat.com/products/ansible/ overview\n[8.4] https:/ /www.redhat.com/ en/about/videos/acm-ansible-integration-overview')
	3 = ('type', 'Document')


In [None]:
# TODO: Load EPUB



## Processing structured document/JSON

In [14]:
# TODO: Create a JSON splitter
json_splitter = RecursiveJsonSplitter(max_chunk_size=64)


In [19]:
# TODO: Process a single JSON object
with open('/content/docs/single.json') as f:
  json_data = json.load(f)

chunks = json_splitter.split_json(json_data=json_data)

print_chunk_info(chunks)
print(chunks[5])


No of chunks: 8
Chunk index: 7
Chunk details
	0 = web-app
{'web-app': {'servlet-mapping': {'cofaxTools': '/tools/*'}}}


In [20]:
# TODO: Process an array of JSON objects, return chunks as Dictionary

with open('/content/docs/tv-shows.json') as f:
   json_data = json.load(f)

chunks = json_splitter.split_json(json_data=json_data, convert_lists=True)
print(chunks)



In [22]:
# TODO: Process an array of JSON objects, return chunks as Document

with open('/content/docs/tv-shows.json') as f:
   json_data = json.load(f)

chunks = json_splitter.create_documents(texts=json_data)

print(chunks)



In [24]:
# TODO: Extract specific attributes from the JSON document, use JSON path to define which element
json_loader = JSONLoader(
    file_path='/content/docs/tv-shows.json',
    jq_schema='.[].summary',
    text_content=True
)

chunks = json_loader.load()
#chunks = json_loader.load_and_split(text_splitter)

print_chunk_info(chunks)


No of chunks: 240
Chunk index: 73
Chunk details
	0 = ('id', None)
	1 = ('metadata', {'source': '/content/docs/tv-shows.json', 'seq_num': 74})
	2 = ('page_content', '<p><b>Sons of Anarchy</b> is an American television drama series created by Kurt Sutter, about the lives of a close-knit outlaw motorcycle club operating in Charming, a fictional town in California\'s Central Valley. The show centers on protagonist Jackson "Jax" Teller (Charlie Hunnam), initially the vice president of the club, who begins questioning the club and himself.</p>')
	3 = ('type', 'Document')
