In [91]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

data_dir = os.environ['DATA_DIR']

In [2]:
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [3]:
chunk_size = 26
chunk_overlap = 4

In [27]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

#By default, the separator in CharacterTextSplitter is a newline character.
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
c_space_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator=" "
)

In [28]:
text1 = "abcdefghijklmnopqrstuvwxyz"
print(r_splitter.split_text(text1))
print(c_splitter.split_text(text1))

['abcdefghijklmnopqrstuvwxyz']
['abcdefghijklmnopqrstuvwxyz']


In [29]:
text2 = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
print(r_splitter.split_text(text2))
print(c_splitter.split_text(text2))

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefghijklmnopqrstuv', 'stuvwxyz']
['abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz']


In [30]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z a b c d e f g h i j k l m n o p q r s t u v w x y z"
print(r_splitter.split_text(text3))
print(c_splitter.split_text(text3))
print(c_space_splitter.split_text(text3))

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z a b c d e f g h i', 'h i j k l m n o p q r s t', 's t u v w x y z']
['a b c d e f g h i j k l m n o p q r s t u v w x y z a b c d e f g h i j k l m n o p q r s t u v w x y z']
['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z a b c d e f g h i', 'h i j k l m n o p q r s t', 's t u v w x y z']


In [74]:
text = """The LangChain framework is designed to facilitate the development of applications \
that utilize large language models (LLMs). It provides a comprehensive set of tools and \
abstractions to streamline the process of building LLM-powered applications, \
making it easier for developers to create complex workflows and integrate various components. \n\n\
The framework includes features such as prompt management, memory handling, and \
data augmentation, allowing developers to focus on building their applications. \n\n\
LangChain supports a wide range of LLMs and can be easily extended to accommodate new models \
and use cases. It is particularly useful for tasks such as question answering, text generation, \
and conversational agents, enabling developers to leverage the power of LLMs in their applications \
with minimal effort."""

In [75]:
print(text)

The LangChain framework is designed to facilitate the development of applications that utilize large language models (LLMs). It provides a comprehensive set of tools and abstractions to streamline the process of building LLM-powered applications, making it easier for developers to create complex workflows and integrate various components. 

The framework includes features such as prompt management, memory handling, and data augmentation, allowing developers to focus on building their applications. 

LangChain supports a wide range of LLMs and can be easily extended to accommodate new models and use cases. It is particularly useful for tasks such as question answering, text generation, and conversational agents, enabling developers to leverage the power of LLMs in their applications with minimal effort.


In [76]:
len(text.split("\n\n")[0])

341

In [77]:
len(text.split("\n\n")[1])

160

In [78]:
len(text.split("\n\n")[2])

308

In [79]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=350,
    chunk_overlap=0,
    separators=["\n\n", "\n", " ", ""]
)

c_splitter = CharacterTextSplitter(
    chunk_size=350,
    chunk_overlap=0,
    separator=" "
)

In [80]:
c_splitter.split_text(text)

['The LangChain framework is designed to facilitate the development of applications that utilize large language models (LLMs). It provides a comprehensive set of tools and abstractions to streamline the process of building LLM-powered applications, making it easier for developers to create complex workflows and integrate various components. \n\nThe',
 'framework includes features such as prompt management, memory handling, and data augmentation, allowing developers to focus on building their applications. \n\nLangChain supports a wide range of LLMs and can be easily extended to accommodate new models and use cases. It is particularly useful for tasks such as question answering, text generation, and',
 'conversational agents, enabling developers to leverage the power of LLMs in their applications with minimal effort.']

In [81]:
r_splitter_chunks = r_splitter.split_text(text)
r_splitter_chunks

['The LangChain framework is designed to facilitate the development of applications that utilize large language models (LLMs). It provides a comprehensive set of tools and abstractions to streamline the process of building LLM-powered applications, making it easier for developers to create complex workflows and integrate various components.',
 'The framework includes features such as prompt management, memory handling, and data augmentation, allowing developers to focus on building their applications.',
 'LangChain supports a wide range of LLMs and can be easily extended to accommodate new models and use cases. It is particularly useful for tasks such as question answering, text generation, and conversational agents, enabling developers to leverage the power of LLMs in their applications with minimal effort.']

In [90]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "\.", " ", ""]
)
r_splitter_chunks = r_splitter.split_text(text)
r_splitter_chunks


['The LangChain framework is designed to facilitate the development of applications that utilize large language models (LLMs). It provides a',
 'comprehensive set of tools and abstractions to streamline the process of building LLM-powered applications, making it easier for developers to create',
 'complex workflows and integrate various components.',
 'The framework includes features such as prompt management, memory handling, and data augmentation, allowing developers to focus on building their',
 'applications.',
 'LangChain supports a wide range of LLMs and can be easily extended to accommodate new models and use cases. It is particularly useful for tasks such',
 'as question answering, text generation, and conversational agents, enabling developers to leverage the power of LLMs in their applications with',
 'minimal effort.']

In [92]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader(f"{data_dir}\\RAG\\PDF\\machinelearning-lecture01.pdf")
pages = loader.load()

In [95]:
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    separator="\n",
    length_function=len
)

docs = text_splitter.split_documents(pages)

In [96]:
len(docs)

78

In [97]:
len(pages)

22

In [99]:
from langchain.text_splitter import TokenTextSplitter
token_splitter = TokenTextSplitter(
    chunk_size=1,
    chunk_overlap=0,
)

In [100]:
text = "foo bar baz qux quux corge grault garply waldo fred plugh xyzzy thud"
token_splitter.split_text(text)

['foo',
 ' bar',
 ' b',
 'az',
 ' qu',
 'x',
 ' qu',
 'ux',
 ' cor',
 'ge',
 ' gra',
 'ult',
 ' gar',
 'ply',
 ' w',
 'aldo',
 ' f',
 'red',
 ' pl',
 'ugh',
 ' x',
 'y',
 'zzy',
 ' th',
 'ud']

In [103]:
token_splitter = TokenTextSplitter(
    chunk_size=10,
    chunk_overlap=0,
)
docs = token_splitter.split_documents(pages)
docs[0]

Document(metadata={'producer': 'Acrobat Distiller 8.1.0 (Windows)', 'creator': 'PScript5.dll Version 5.2.2', 'creationdate': '2008-07-11T11:25:23-07:00', 'author': '', 'moddate': '2008-07-11T11:25:23-07:00', 'title': '', 'source': 'C:\\\\Users\\\\gunit\\\\OneDrive\\\\Documents\\\\Study Material\\\\Practice Projects\\\\remote\\\\artificial-intelligence\\\\data\\RAG\\PDF\\machinelearning-lecture01.pdf', 'total_pages': 22, 'page': 0, 'page_label': '1'}, page_content='MachineLearning-Lecture01  \n')

In [104]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [106]:
markdown_document = """# Header 1\n\n\
This is some text under header 1.\n\n\
## Header 2\n\n\
This is some text under header 2.\n\n\
### Header 3\n\n\
This is some text under header 3.\n\n\
""" 

In [107]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header 1"), 
        ("##", "Header 2"),
        ("###", "Header 3")
    ]
)
markdown_splitter.split_text(markdown_document)

[Document(metadata={'Header 1': 'Header 1'}, page_content='This is some text under header 1.'),
 Document(metadata={'Header 1': 'Header 1', 'Header 2': 'Header 2'}, page_content='This is some text under header 2.'),
 Document(metadata={'Header 1': 'Header 1', 'Header 2': 'Header 2', 'Header 3': 'Header 3'}, page_content='This is some text under header 3.')]