In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader

# Load the UTF-8 text file located one directory above the notebook.
loader = TextLoader(
    "../test.txt",
    encoding="utf-8",
)
docs = loader.load()

# Recursive character splitter configured with a chunk size of 1000 and an
# overlap of 200 between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splitted_docs = splitter.split_documents(docs)

In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# Sample Python source that the language-aware splitter will be run on.
PYTHON_CODE = """
def hello_world():
    print("Hello, World!") 
    
# Call the function
hello_world()
"""

# Code already has a clear structure, so no overlap between chunks is needed.
python_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=0,
    chunk_size=50,
    language=Language.PYTHON,
)

# Wrap the single source string in a list: create_documents takes a list of texts.
python_docs = python_splitter.create_documents(texts=[PYTHON_CODE])

print(python_docs)

[Document(metadata={}, page_content='def hello_world():\n    print("Hello, World!")'), Document(metadata={}, page_content='# Call the function\nhello_world()')]


In [None]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

# Sample Markdown document (heading, fenced code block, paragraph) to split.
markdown_text = """ 
# 🦜🔗 LangChain ⚡ Building applications with LLMs through composability ⚡ 

## Quick Install
```bash
pip install langchain
```

As an open source project in a rapidly developing field, we are extremely open
    to contributions.
"""

# Markdown-aware splitter with a chunk size of 60 and no overlap.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    chunk_overlap=0,
    chunk_size=60,
    language=Language.MARKDOWN,
)

# The second argument (`metadatas`) is where the metadata goes; the single
# dict given here is attached to the documents produced from the single text.
md_docs = md_splitter.create_documents(
    texts=[markdown_text],
    metadatas=[{"source": "https://www.langchain.com"}],
)

print(md_docs)

[Document(metadata={'source': 'https://www.langchain.com'}, page_content='# 🦜🔗 LangChain ⚡ Building applications with LLMs through'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='composability ⚡'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='## Quick Install\n```bash\npip install langchain'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='```'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='As an open source project in a rapidly developing field, we'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='are extremely open'), Document(metadata={'source': 'https://www.langchain.com'}, page_content='to contributions.')]
