In [1]:
import os
from dotenv import load_dotenv

# LangChain imports
from langchain_openai import AzureChatOpenAI

# Load environment variables
load_dotenv()

# Get Azure OpenAI configuration from environment variables
azure_openai_api_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_api_version = os.getenv("AZURE_OPENAI_VERSION")
azure_openai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [3]:
chunk_size =26
chunk_overlap = 4

In [4]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [5]:
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [7]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [None]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3) #spaces are counted as characters

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [None]:
c_splitter.split_text(text3) #by default character splitter uses \n as separator and hence no split occurs

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [None]:
#changed the separator to ' ' to see the split
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [13]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""
len(some_text)

496

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""] # this mean first try to split by \n\n, then by \n, then by space, and finally by character for better coherent results
)

In [None]:
c_splitter.split_text(some_text)
#here the results are split by space, so the sentences are not coherent with weird split in between.


['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [None]:
r_splitter.split_text(some_text)
#Here the results are better even though the first split is less than 450 characters, it is coherent and meaningful.

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", ".", " ", ""] #added ". " to the separators to split sentences. You can notice its all wrong too becuase of internal use of Regex
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related",
 '. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns',
 '. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space',
 '.and words are separated by space.']

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=. )", " ", ""] #This is a positive lookbehind assertion in regex:The pattern matches the position RIGHT AFTER ". "
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example,",
 'closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this',
 'string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [21]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/2508.00784v1.pdf")
pages = loader.load()

In [22]:
#from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [23]:
docs = text_splitter.split_documents(pages)

In [None]:
len(docs) #split length is way more than the pdf length

106

In [25]:
len(pages)

24

In [29]:
#Token Splitting
from langchain.text_splitter import TokenTextSplitter
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [30]:
#token splitting for documents
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
docs[0]

Document(metadata={'producer': 'pikepdf 8.15.1', 'creator': 'arXiv GenPDF (tex2pdf:)', 'creationdate': '', 'author': 'Tom Or; Omri Azencot', 'doi': 'https://doi.org/10.48550/arXiv.2508.00784', 'license': 'http://creativecommons.org/licenses/by/4.0/', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'title': 'Unraveling Hidden Representations: A Multi-Modal Layer Analysis for Better Synthetic Content Forensics', 'trapped': '/False', 'arxivid': 'https://arxiv.org/abs/2508.00784v1', 'source': 'docs/2508.00784v1.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='Unraveling Hidden Representations: A Multi-')

In [31]:
pages[0].metadata

{'producer': 'pikepdf 8.15.1',
 'creator': 'arXiv GenPDF (tex2pdf:)',
 'creationdate': '',
 'author': 'Tom Or; Omri Azencot',
 'doi': 'https://doi.org/10.48550/arXiv.2508.00784',
 'license': 'http://creativecommons.org/licenses/by/4.0/',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5',
 'title': 'Unraveling Hidden Representations: A Multi-Modal Layer Analysis for Better Synthetic Content Forensics',
 'trapped': '/False',
 'arxivid': 'https://arxiv.org/abs/2508.00784v1',
 'source': 'docs/2508.00784v1.pdf',
 'total_pages': 24,
 'page': 0,
 'page_label': '1'}

In [None]:
#markdown splitting
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

md_header_splits[0]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'}, page_content='Hi this is Jim  \nHi this is Joe')

In [33]:
md_header_splits[1]

Document(metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}, page_content='Hi this is Lance')

In [None]:
#split a github readme file based on headers
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://raw.githubusercontent.com/garima2510/BiblioBot/refs/heads/main/README.md")
docs = loader.load()
txt = ' '.join([d.page_content for d in docs])

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(txt)

In [41]:
md_header_splits[0]

Document(metadata={'Header 1': '📚 BiblioBot - Book Information Chatbot'}, page_content='A Python-based chatbot that helps users find book information and answer questions about books using Azure OpenAI and Google Books API.')

In [43]:
md_header_splits

[Document(metadata={'Header 1': '📚 BiblioBot - Book Information Chatbot'}, page_content='A Python-based chatbot that helps users find book information and answer questions about books using Azure OpenAI and Google Books API.'),
 Document(metadata={'Header 1': '📚 BiblioBot - Book Information Chatbot', 'Header 2': 'Features'}, page_content='- 🔍 **Book Search**: Search for books by title or author using Google Books API\n- 💬 **AI Chat**: Ask natural language questions about books, authors, genres, and ratings\n- 🎨 **User-Friendly UI**: Clean Streamlit interface\n- 🔐 **Secure**: Environment variables for API key management'),
 Document(metadata={'Header 1': '📚 BiblioBot - Book Information Chatbot', 'Header 2': 'When my kid used it'}, page_content='![BiblioBot Demo Image][logo]  \n[logo]: https://github.com/garima2510/BiblioBot/blob/main/demo.png "Bibliobot Demo Image"'),
 Document(metadata={'Header 1': '📚 BiblioBot - Book Information Chatbot', 'Header 2': 'Setup Instructions', 'Header 3': 