# Download some `dataset`

In [1]:
#!wget "https://www.cheat-sheets.org/saved-copy/The%20One%20Page%20Linux%20Manual.pdf"

# Import `langchain_community`

In [2]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "The One Page Linux Manual.pdf"

loader = PyPDFLoader(file_path)
pages = loader.load_and_split()

In [6]:
len(pages)

2

In [7]:
pages[0]

Document(page_content='THE ONE     PAGE LINUX MANUALA summary of useful Linux commands\nVersion 3.0 May 1999 squadron@powerup.com.au\nStarting & Stopping\nshutdown -h now Shutdown the system now and do not\nreboot\nhalt Stop all processes - same as above\nshutdown -r 5 Shutdown the system in 5 minutes and\nreboot\nshutdown -r now Shutdown the system now and reboot\nreboot Stop all processes and then reboot - same\nas above\nstartx Start the X system\nAccessing & mounting file systems\nmount -t iso9660 /dev/cdrom\n/mnt/cdromMount the device cdrom\nand call it cdrom under the\n/mnt directory\nmount -t msdos /dev/hdd\n/mnt/ddriveMount hard disk “d” as a\nmsdos file system and call\nit ddrive under the /mnt\ndirectory\nmount -t vfat /dev/hda1\n/mnt/cdriveMount hard disk “a” as a\nVFAT file system and call it\ncdrive under the /mnt\ndirectory\numount /mnt/cdrom Unmount the cdrom\nFinding files and text within files\nfind / -name  fname Starting with the root directory, look\nfor the file ca

## I. `CharacterTextSplitter`

In [8]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

In [14]:
print (f"You have {len(texts)} documents")

You have 2 documents


In [15]:
print(texts[0])

page_content='THE ONE     PAGE LINUX MANUALA summary of useful Linux commands\nVersion 3.0 May 1999 squadron@powerup.com.au\nStarting & Stopping\nshutdown -h now Shutdown the system now and do not\nreboot\nhalt Stop all processes - same as above\nshutdown -r 5 Shutdown the system in 5 minutes and\nreboot\nshutdown -r now Shutdown the system now and reboot\nreboot Stop all processes and then reboot - same\nas above\nstartx Start the X system\nAccessing & mounting file systems\nmount -t iso9660 /dev/cdrom\n/mnt/cdromMount the device cdrom\nand call it cdrom under the\n/mnt directory\nmount -t msdos /dev/hdd\n/mnt/ddriveMount hard disk “d” as a\nmsdos file system and call\nit ddrive under the /mnt\ndirectory\nmount -t vfat /dev/hda1\n/mnt/cdriveMount hard disk “a” as a\nVFAT file system and call it\ncdrive under the /mnt\ndirectory\numount /mnt/cdrom Unmount the cdrom\nFinding files and text within files\nfind / -name  fname Starting with the root directory, look\nfor the file called fnam

In [16]:
print ("Preview:")
print (texts[0].page_content)

Preview:
THE ONE     PAGE LINUX MANUALA summary of useful Linux commands
Version 3.0 May 1999 squadron@powerup.com.au
Starting & Stopping
shutdown -h now Shutdown the system now and do not
reboot
halt Stop all processes - same as above
shutdown -r 5 Shutdown the system in 5 minutes and
reboot
shutdown -r now Shutdown the system now and reboot
reboot Stop all processes and then reboot - same
as above
startx Start the X system
Accessing & mounting file systems
mount -t iso9660 /dev/cdrom
/mnt/cdromMount the device cdrom
and call it cdrom under the
/mnt directory
mount -t msdos /dev/hdd
/mnt/ddriveMount hard disk “d” as a
msdos file system and call
it ddrive under the /mnt
directory
mount -t vfat /dev/hda1
/mnt/cdriveMount hard disk “a” as a
VFAT file system and call it
cdrive under the /mnt
directory
umount /mnt/cdrom Unmount the cdrom
Finding files and text within files
find / -name  fname Starting with the root directory, look
for the file called fname
find / -name ”*fname* ” Starting 

## II. `RecursiveCharacterTextSplitter`

In [17]:
!echo "Helllo, my name is Suman\n Hello again\n\ntesting newline." > LLM.txt

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

In [19]:
text_splitter = RecursiveCharacterTextSplitter(
                                                    chunk_size=50,
                                                    chunk_overlap=10,
                                                    length_function=len,
                                                )

In [20]:
texts = text_splitter.create_documents([sample_text])
print(texts)

[Document(page_content='Helllo, my name is Suman\n Hello again'), Document(page_content='testing newline.')]


#======

## III. `NLTKTextSplitter`

In [23]:
import nltk
#nltk.download('punkt')
#!python -m spacy download en_core_web_sm

In [24]:
# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

from langchain_text_splitters import NLTKTextSplitter
text_splitter = NLTKTextSplitter(chunk_size=500)


texts = text_splitter.split_text(sample_text)
print(texts)

['Helllo, my name is Suman\n Hello again\n\ntesting newline.']


## IV. `SpacyTextSplitter`

In [28]:
from langchain_text_splitters import SpacyTextSplitter


# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)


# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts)


['Helllo, my name is Suman\n Hello again\n\ntesting newline.']


## V. `MarkdownTextSplitter`

In [29]:
from langchain_text_splitters import MarkdownTextSplitter
markdown_text = """
#

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""
markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])
print(docs)

[Document(page_content='#\n\n# Welcome to My Blog!'), Document(page_content='## Introduction'), Document(page_content='Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python,'), Document(page_content='Java, and JavaScript.'), Document(page_content="Here's a list of my favorite programming languages:\n\n1. Python\n2. JavaScript\n3. Java"), Document(page_content='You can check out some of my projects on [GitHub](https://github.com).'), Document(page_content='## About this Blog'), Document(page_content="In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on"), Document(page_content='the latest technology trends, and occasional book reviews.'), Document(page_content="Here's a small piece of Python code to say hello:"), Document(page_content='\\``` python\ndef say_hello(name):\n    print(f"Hello, {name}!")\n\nsay_hello("John")\n\\'), Document(page_content='```\n\nStay tuned for more updates!'), Document(pag

## VI. `TokenTextSplitter`

In [30]:
from langchain_text_splitters import TokenTextSplitter

# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])

Helllo, my name is Suman
 Hello again

testing newline.

