In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY')
os.environ['ACTIVELOOP_TOKEN'] = os.environ.get('ACTIVELOOP_TOKEN')

In [1]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("example_data/The One Page Linux Manual.pdf")
pages = loader.load_and_split()

In [2]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

page_content='THE ONE     PAGE LINUX MANUALA summary of useful Linux commands\nVersion 3.0 May 1999 squadron@powerup.com.au\nStarting & Stopping\nshutdown -h now Shutdown the system now and do not\nreboot\nhalt Stop all processes - same as above\nshutdown -r 5 Shutdown the system in 5 minutes and\nreboot\nshutdown -r now Shutdown the system now and reboot\nreboot Stop all processes and then reboot - same\nas above\nstartx Start the X system\nAccessing & mounting file systems\nmount -t iso9660 /dev/cdrom\n/mnt/cdromMount the device cdrom\nand call it cdrom under the\n/mnt directory\nmount -t msdos /dev/hdd\n/mnt/ddriveMount hard disk “d” as a\nmsdos file system and call\nit ddrive under the /mnt\ndirectory\nmount -t vfat /dev/hda1\n/mnt/cdriveMount hard disk “a” as a\nVFAT file system and call it\ncdrive under the /mnt\ndirectory\numount /mnt/cdrom Unmount the cdrom\nFinding files and text within files\nfind / -name  fname Starting with the root directory, look\nfor the file called fnam

In [3]:
print (f"You have {len(texts)} documents")

You have 2 documents


In [None]:
print ("Preview:")
print (texts[0].page_content)

Preview:
THE ONE     PAGE LINUX MANUALA summary of useful Linux commands
Version 3.0 May 1999 squadron@powerup.com.au
Starting & Stopping
shutdown -h now Shutdown the system now and do not
reboot
halt Stop all processes - same as above
shutdown -r 5 Shutdown the system in 5 minutes and
reboot
shutdown -r now Shutdown the system now and reboot
reboot Stop all processes and then reboot - same
as above
startx Start the X system
Accessing & mounting file systems
mount -t iso9660 /dev/cdrom
/mnt/cdromMount the device cdrom
and call it cdrom under the
/mnt directory
mount -t msdos /dev/hdd
/mnt/ddriveMount hard disk “d” as a
msdos file system and call
it ddrive under the /mnt
directory
mount -t vfat /dev/hda1
/mnt/cdriveMount hard disk “a” as a
VFAT file system and call it
cdrive under the /mnt
directory
umount /mnt/cdrom Unmount the cdrom
Finding files and text within files
find / -name  fname Starting with the root directory, look
for the file called fname
find / -name ”*fname* ” Starting 

#=====

## NLTK Text Splitter

In [4]:
!echo "Helllo, my name is Ala\n Hello again\n\ntesting newline." > LLM.txt

In [5]:
from langchain.text_splitter import NLTKTextSplitter

# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)

['"Helllo, my name is Ala\n Hello again\n\ntesting newline."']


## Recursive Character Text Splitter

The Recursive Character Text Splitter is a text splitter designed to split the text into chunks based on a list of characters provided. It attempts to split text using the characters from a list in order until the resulting chunks are small enough. By default, the list of characters used for splitting is ["\n\n", "\n", " ", "], which tries to keep paragraphs, sentences, and words together as long as possible, as they are generally the most semantically related pieces of text. This means that the class first tries to split the text into two new-line characters. If the resulting chunks are still larger than the desired chunk size, it will then try to split the output by a single new-line character, followed by a space character, and so on, until the desired chunk size is achieved.

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("example_data/The One Page Linux Manual.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

page_content='THE ONE     PAGE LINUX MANUALA summary of useful' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='of useful Linux commands' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='Version 3.0 May 1999 squadron@powerup.com.au' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='Starting & Stopping' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='shutdown -h now Shutdown the system now and do' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='and do not' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='reboot\nhalt Stop all processes - same as above' metadata={'source': 'example_data/The One Page Linux Manual.pdf', 'page': 0}
page_content='shutdown -r 5 Shutdown the system in 5 minutes' metadata={'source': 'example_data/The One Page Li

#======

## NLTK Text Splitter

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eddy.Tovar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

from langchain.text_splitter import NLTKTextSplitter
text_splitter = NLTKTextSplitter(chunk_size=500)


texts = text_splitter.split_text(sample_text)
print(texts)

['Helllo, my name is Ala\n Hello again\n\ntesting newline.']


#======

## SpacyTextSplitter
The SpacyTextSplitter helps split large text documents into smaller chunks based on a specified size. This is useful for better management of large text inputs. It's important to note that the SpacyTextSplitter is an alternative to NLTK-based sentence splitting. You can create a SpacyTextSplitter object by specifying the chunk_size parameter, measured by a length function passed to it, which defaults to the number of characters.

In [11]:
from langchain.text_splitter import SpacyTextSplitter


# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)


# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

#=====

## The MarkdownTextSplitter 

is designed to split text written using Markdown languages like headers, code blocks, or dividers. It is implemented as a simple subclass of RecursiveCharacterSplitter with Markdown-specific separators.

#

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""

In [12]:
from langchain.text_splitter import MarkdownTextSplitter
markdown_text = """
#

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.

"""
markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])
print(docs)

[Document(page_content='#\n\n# Welcome to My Blog!'), Document(page_content='## Introduction'), Document(page_content='Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python,'), Document(page_content='Java, and JavaScript.'), Document(page_content="Here's a list of my favorite programming languages:\n\n1. Python\n2. JavaScript\n3. Java"), Document(page_content='You can check out some of my projects on [GitHub](https://github.com).'), Document(page_content='## About this Blog'), Document(page_content="In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on"), Document(page_content='the latest technology trends, and occasional book reviews.'), Document(page_content="Here's a small piece of Python code to say hello:"), Document(page_content='\\``` python\ndef say_hello(name):\n    print(f"Hello, {name}!")\n\nsay_hello("John")\n\\'), Document(page_content='```\n\nStay tuned for more updates!'), Document(pag

#=====

## TokenTextSplitter
The main advantage of using TokenTextSplitter over other text splitters, like CharacterTextSplitter, is that it respects the token boundaries, ensuring that the chunks do not split tokens in the middle. This can be particularly helpful in maintaining the semantic integrity of the text when working with language models and embeddings.

In [13]:
from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('LLM.txt', encoding= 'unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])

"Helllo, my name is Ala
 Hello again

testing newline." 

