# Text splitters

# 1. Recursively Split Text
This approach splits text into chunks recursively until a specified chunk size is reached.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample text
text = "This is a sample text. It contains several sentences and paragraphs. Here's a new paragraph.\n\nThis is another paragraph."

# Create a text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)

# Split the text
chunks = splitter.split_text(text)

print(chunks)


['This is a sample text. It contains several', "several sentences and paragraphs. Here's a new", 'a new paragraph.', 'This is another paragraph.']


# 2. Split by HTML Headers
This splits the document based on HTML header tags.

In [2]:
# This splits the document based on HTML header tags (e.g., <h1>, <h2>)


In [18]:
pip install lxml

Collecting lxml
  Downloading lxml-5.3.0-cp310-cp310-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-5.3.0-cp310-cp310-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   -- ------------------------------------- 0.3/3.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.5/3.8 MB 699.0 kB/s eta 0:00:05
   ----- ---------------------------------- 0.5/3.8 MB 699.0 kB/s eta 0:00:05
   -------- ------------------------------- 0.8/3.8 MB 699.0 kB/s eta 0:00:05
   -------- ------------------------------- 0.8/3.8 MB 699.0 kB/s eta 0:00:05
   -------- ------------------------------- 0.8/3.8 MB 699.0 kB/s eta 0:00:05
   -------- ------------------------------- 0.8/3.8 M

In [2]:
%pip install -qU langchain-text-splitters


Note: you may need to restart the kernel to use updated packages.


In [19]:
from langchain.text_splitter import HTMLHeaderTextSplitter

# Specify headers to split on
headers_to_split_on = ["h1", "h2"]

# Initialize with headers
splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the HTML text
html_text = "<h1>Header 1</h1><p>Content under header 1.</p><h2>Header 2</h2><p>Content under header 2.</p>"
chunks = splitter.split_text(html_text)
print(chunks)


[Document(metadata={}, page_content='Content under header 1.  \nContent under header 2.')]


# 2. Split from a URL or HTML File
You can also read directly from a URL or a local HTML file.

In [20]:
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample HTML text
html_text = "<section><h1>Title</h1><p>Some content.</p></section><section><h1>Another Title</h1><p>More content.</p></section>"

# Parse HTML and extract text
soup = BeautifulSoup(html_text, "html.parser")
sections = [section.get_text() for section in soup.find_all("section")]

# Create a text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

# Split each section into chunks
chunks = []
for section in sections:
    chunks.extend(text_splitter.split_text(section))

print(chunks)


['TitleSome content.', 'Another TitleMore content.']


# 3. Constrain Chunk Sizes
To combine the HTMLHeaderTextSplitter with another splitter that constrains by character length, such as RecursiveCharacterTextSplitter, do the following:

In [21]:
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample HTML text
html_text = "<section><h1>Title</h1><p>Some content.</p></section><section><h1>Another Title</h1><p>More content.</p></section>"
soup = BeautifulSoup(html_text, "html.parser")
sections = [section.get_text() for section in soup.find_all("section")]

# Set up a text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=30)
chunks = [text_splitter.split_text(section) for section in sections]

print(chunks)


[['TitleSome content.'], ['Another TitleMore content.']]


# 4. Split by Character
This method splits text into chunks based on a specified character length.

In [1]:
from langchain.text_splitter import CharacterTextSplitter

# Sample text
text = "This is a simple text splitter example."

# Create a text splitter with a smaller chunk_overlap
splitter = CharacterTextSplitter(chunk_size=10, chunk_overlap=0)

# Split the text
chunks = splitter.split_text(text)

print(chunks)


['This is a simple text splitter example.']


# 5. Split Code
This method can be useful for splitting programming code into logical sections.

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample code
code = """def add(a, b):
    return a + b

def subtract(a, b):
    return a - b"""

# Create a text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=10)

# Split the code
chunks = splitter.split_text(code)

print(chunks)


['def add(a, b):\n    return a + b', 'def subtract(a, b):\n    return a - b']


# 6. Split Markdown by Headers
This method splits markdown text based on headers.

In [27]:
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter

# Sample markdown text
markdown_text = "# Header 1\nContent under header 1.\n## Header 2\nContent under header 2."

# Specify headers to split on
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "h1"), ("##", "h2")])

# Split the markdown text
chunks = splitter.split_text(markdown_text)
print(chunks)

[Document(metadata={'h1': 'Header 1'}, page_content='Content under header 1.'), Document(metadata={'h1': 'Header 1', 'h2': 'Header 2'}, page_content='Content under header 2.')]


# 7. Recursively Split JSON
This method handles JSON data and splits it into manageable parts.

In [26]:
import json

class RecursiveJSONTextSplitter:
    def __init__(self):
        pass

    def split_text(self, json_text):
        data = json.loads(json_text)
        chunks = []

        def recursive_split(data):
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, (dict, list)):
                        recursive_split(value)
                    else:
                        chunks.append(f"{key}: {value}")
            elif isinstance(data, list):
                for item in data:
                    recursive_split(item)

        recursive_split(data)
        return chunks

# Sample JSON
json_text = '{"key1": "value1", "key2": {"subkey1": "subvalue1", "subkey2": "subvalue2"}}'

# Create a RecursiveJSONTextSplitter instance
splitter = RecursiveJSONTextSplitter()

# Split the JSON text
chunks = splitter.split_text(json_text)

# Print the chunks
for chunk in chunks:
    print(chunk)

key1: value1
subkey1: subvalue1
subkey2: subvalue2


# 8. Split Text into Semantic Chunks
This method splits text based on semantic meaning, which can improve the contextual understanding of chunks

In [30]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 882.6 kB/s eta 0:00:02
   -------------------- ------------------- 0.8/1.5 MB 1.1 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 1.4 MB/s eta 0:00:00
Using cached click-8.1.7-py3-none-any.whl (97 kB)
Installing collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.9.1


In [37]:
import nltk
nltk.download('all')
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class SemanticChunkTextSplitter:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def split_text(self, text):
        sentences = sent_tokenize(text)
        chunks = []

        for sentence in sentences:
            words = word_tokenize(sentence)
            words = [word.lower() for word in words if word.isalpha()]
            words = [word for word in words if word not in self.stop_words]
            words = [self.lemmatizer.lemmatize(word) for word in words]

            if len(words) > 0:
                chunks.append(' '.join(words))

        return chunks

# Sample text
text = "Natural Language Processing (NLP) is a field of artificial intelligence. It deals with the interaction between computers and human language."

# Create a SemanticChunkTextSplitter instance
splitter = SemanticChunkTextSplitter()

# Split the text into semantic chunks
chunks = splitter.split_text(text)

print(chunks)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

['natural language processing nlp field artificial intelligence', 'deal interaction computer human language']


# 9. Split by Tokens
This method splits text based on token count, which is useful for NLP applications.

In [9]:
from langchain.text_splitter import TokenTextSplitter

# Sample text
text = "This is a text that we want to split into tokens."

# Create a text splitter
splitter = TokenTextSplitter(encoding_name="gpt2", model_name="gpt2")

# Split the text
chunks = splitter.split_text(text)

print(chunks)


['This is a text that we want to split into tokens.']
