## Splitting Text and Document

### Importing needed packages

In [89]:
import os
import openai
import sys
from dotenv import load_dotenv, find_dotenv

#Document Loader:
from langchain_community.document_loaders import PyPDFLoader

#Text Splitters:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter

#Token Splitter:
from langchain_text_splitters import TokenTextSplitter

#Markdown Splitter with structure
from langchain_text_splitters import MarkdownHeaderTextSplitter

### API-KEY

In [90]:
sys.path.append('../..')
_ = load_dotenv(find_dotenv()) 
api_key = os.environ['OPENAI_API_KEY']

chunk_size: clarify the size of each split.


chunk_overlap: how much can 2 neighbor chunk overlap with each other

In [91]:
chunk_size = 26
chunk_overlap = 4

In [92]:
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

character_splitter = CharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)

In [93]:
text1 = "abcdefghijklmnopqrstuvwxyz"

In [94]:
recursive_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [95]:
character_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [96]:
text2 = "abcdefghijklmnopqrstuvwxyzggjsitbcmsp"

In [97]:
recursive_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzggjsitbcmsp']

In [98]:
character_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyzggjsitbcmsp']

In [99]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [100]:
recursive_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [101]:
character_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [102]:
character_splitter = CharacterTextSplitter(
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap,
    separator= ' '
)
character_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [103]:
nqs_text = """"
NQS treat a wavefunction like a neural net: \n
instead of enumerating amplitudes for every spin configuration,
 you parameterize them with network weights, 
 usually Restricted Boltzmann Machines or modern autoregressive nets.
 \n\nTraining becomes a variational Monte Carlo loop—sample configurations, estimate gradients of the energy with 
 respect to the weights, nudge the model, repeat.\n\n
 This lets you capture entanglement patterns that are awkward for product 
 states while keeping a compact parameter set.
 \n\nPeople prototype with small spin chains to benchmark
   against exact diagonalization, then scale to frustrated lattices or fermionic encodings.
   \n\nThe catch 
   is optimization; the landscape is gnarly, so tricks like natural gradient updates, symmetry constraints, 
   or hybridizing with tensor-network priors are common.
   \n\nWhen it works, you essentially have a 
   flexible ansatz that can interpolate between mean-field intuition and highly correlated phases, and
     it plays nicely with quantum-inspired algorithms—feeding its samples into quantum hardware or using it 
     to post-process noisy outputs.
 """


In [104]:
character_splitter = CharacterTextSplitter(
    chunk_size = 450,
    chunk_overlap = 0,
    separator= ' '
)

recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 450,
    chunk_overlap = 0,
    separators = ["\n\n", "\n", " ", ""]
)

In [105]:
character_splitter.split_text(nqs_text)

['"\nNQS treat a wavefunction like a neural net: \n\ninstead of enumerating amplitudes for every spin configuration,\n you parameterize them with network weights, \n usually Restricted Boltzmann Machines or modern autoregressive nets.\n \n\nTraining becomes a variational Monte Carlo loop—sample configurations, estimate gradients of the energy with \n respect to the weights, nudge the model, repeat.\n\n\n This lets you capture entanglement patterns that are',
 'awkward for product \n states while keeping a compact parameter set.\n \n\nPeople prototype with small spin chains to benchmark\n against exact diagonalization, then scale to frustrated lattices or fermionic encodings.\n \n\nThe catch \n is optimization; the landscape is gnarly, so tricks like natural gradient updates, symmetry constraints, \n or hybridizing with tensor-network priors are common.\n \n\nWhen it works, you essentially have a \n flexible ansatz',
 'that can interpolate between mean-field intuition and highly correla

In [106]:
recursive_splitter.split_text(nqs_text)

['"\nNQS treat a wavefunction like a neural net: \n\ninstead of enumerating amplitudes for every spin configuration,\n you parameterize them with network weights, \n usually Restricted Boltzmann Machines or modern autoregressive nets.\n \n\nTraining becomes a variational Monte Carlo loop—sample configurations, estimate gradients of the energy with \n respect to the weights, nudge the model, repeat.',
 'This lets you capture entanglement patterns that are awkward for product \n states while keeping a compact parameter set.\n \n\nPeople prototype with small spin chains to benchmark\n   against exact diagonalization, then scale to frustrated lattices or fermionic encodings.',
 'The catch \n   is optimization; the landscape is gnarly, so tricks like natural gradient updates, symmetry constraints, \n   or hybridizing with tensor-network priors are common.',
 'When it works, you essentially have a \n   flexible ansatz that can interpolate between mean-field intuition and highly correlated ph

In [107]:
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 450,
    chunk_overlap = 0,
    separators = ["\n\n", "\n", "(?<=\.)" ," ", ""]
)

  separators = ["\n\n", "\n", "(?<=\.)" ," ", ""]


In [108]:
recursive_splitter.split_text(nqs_text)

['"\nNQS treat a wavefunction like a neural net: \n\ninstead of enumerating amplitudes for every spin configuration,\n you parameterize them with network weights, \n usually Restricted Boltzmann Machines or modern autoregressive nets.\n \n\nTraining becomes a variational Monte Carlo loop—sample configurations, estimate gradients of the energy with \n respect to the weights, nudge the model, repeat.',
 'This lets you capture entanglement patterns that are awkward for product \n states while keeping a compact parameter set.\n \n\nPeople prototype with small spin chains to benchmark\n   against exact diagonalization, then scale to frustrated lattices or fermionic encodings.',
 'The catch \n   is optimization; the landscape is gnarly, so tricks like natural gradient updates, symmetry constraints, \n   or hybridizing with tensor-network priors are common.',
 'When it works, you essentially have a \n   flexible ansatz that can interpolate between mean-field intuition and highly correlated ph

In [109]:
loader = PyPDFLoader("./docs/pdfs/G-CNN.pdf")
pages = loader.load()

In [110]:
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size= 1000,
    chunk_overlap= 150,
    length_function = len
)

docs = text_splitter.split_documents(pages)

In [111]:
len(docs)

54

In [112]:
len(pages)

10

In [113]:
docs[0:5]

[Document(metadata={'producer': 'pdfTeX-1.40.14', 'creator': 'TeX', 'creationdate': '2016-05-27T01:32:03+02:00', 'moddate': '2016-05-27T01:32:03+02:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea version 6.1.1', 'source': './docs/pdfs/G-CNN.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='Group Equivariant Convolutional Networks\nTaco S. Cohen T.S.COHEN @UVA.NL\nUniversity of Amsterdam\nMax Welling M.WELLING @UVA.NL\nUniversity of Amsterdam\nUniversity of California Irvine\nCanadian Institute for Advanced Research\nAbstract\nWe introduce Group equivariant Convolutional\nNeural Networks (G-CNNs), a natural general-\nization of convolutional neural networks that re-\nduces sample complexity by exploiting symme-\ntries. G-CNNs use G-convolutions, a new type of\nlayer that enjoys a substantially higher degree of\nweight sharing than regular convolution layers.\nG-convolutions increase the expre

### TOKEN SPLITTER
##### Tokens are usually 4 characters.

In [114]:
token_text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [115]:
text1 = "foo bar bazzyfoo"

In [116]:
token_text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [117]:
token_text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)


In [118]:
docs[0]

Document(metadata={'producer': 'pdfTeX-1.40.14', 'creator': 'TeX', 'creationdate': '2016-05-27T01:32:03+02:00', 'moddate': '2016-05-27T01:32:03+02:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea version 6.1.1', 'source': './docs/pdfs/G-CNN.pdf', 'total_pages': 10, 'page': 0, 'page_label': '1'}, page_content='Group Equivariant Convolutional Networks\nTaco S. Cohen T.S.COHEN @UVA.NL\nUniversity of Amsterdam\nMax Welling M.WELLING @UVA.NL\nUniversity of Amsterdam\nUniversity of California Irvine\nCanadian Institute for Advanced Research\nAbstract\nWe introduce Group equivariant Convolutional\nNeural Networks (G-CNNs), a natural general-\nization of convolutional neural networks that re-\nduces sample complexity by exploiting symme-\ntries. G-CNNs use G-convolutions, a new type of\nlayer that enjoys a substantially higher degree of\nweight sharing than regular convolution layers.\nG-convolutions increase the expres

In [119]:
pages[0].metadata

{'producer': 'pdfTeX-1.40.14',
 'creator': 'TeX',
 'creationdate': '2016-05-27T01:32:03+02:00',
 'moddate': '2016-05-27T01:32:03+02:00',
 'trapped': '/False',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.1415926-2.5-1.40.14 (TeX Live 2013/Debian) kpathsea version 6.1.1',
 'source': './docs/pdfs/G-CNN.pdf',
 'total_pages': 10,
 'page': 0,
 'page_label': '1'}

### Markdown-base splits

In [120]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [121]:
headers_to_split_on = [
    ("#", 'Header 1'),
    ("##", 'Header 2'),
    ("###", 'Header 3')
]

In [122]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [123]:
md_header_splits[0]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe')

In [124]:
md_header_splits[1]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance')