In [4]:
import langchain
from typing import List, Dict, Any
import pandas as pd


In [5]:
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
print("Set up Complete!!")

Set up Complete!!


In [6]:
# Hand-build a Document: the text payload plus a metadata mapping that describes it.
doc = Document(
    page_content="The sample text langchain will go through and apply various splitting techniques",
    metadata=dict(
        source="example.txt",
        page="4",
        author="Synthetic Sharma",
        date="28.09.2025",
    ),
)
# Show both parts of the Document, one per line.
print(
    "Document Structure",
    f"Content: {doc.page_content}",
    f"Metadata : {doc.metadata}",
    sep="\n",
)

Document Structure
Content: The sample text langchain will go through and apply various splitting techniques
Metadata : {'source': 'example.txt', 'page': '4', 'author': 'Synthetic Sharma', 'date': '28.09.2025'}


In [7]:
# Create the folder that the sample text file is written into by the next cell.
# exist_ok=True means re-running this cell does not fail when the folder is already there.
import os 
os.makedirs("data/textfiles/", exist_ok = True)

In [8]:
# Sample corpus: maps each output path to the text that is stored in that file.
sample_texts = {
    "data/textfiles/stochastic_calc.txt": """
Stochastic calculus is a branch of mathematics that extends traditional calculus to stochastic processes, 
which are systems that evolve randomly over time. It provides a framework for integrating and 
differentiating non-smooth random functions and is essential for modeling and analyzing complex systems in 
fields like finance, where it is used for option pricing and risk management, as well as in physics and engineering. 
The field was pioneered by Japanese mathematician Kiyosi Itô, who developed key concepts like the stochastic integral and 
stochastic differential equations.  
"""
}

# Write every entry to disk as UTF-8 text (the target directory must already exist).
for filepath in sample_texts:
    content = sample_texts[filepath]
    with open(filepath, mode="w", encoding="utf-8") as f:
        f.write(content)
print("Sample file created")

Sample file created


In [9]:
# TextLoader is imported from langchain_community only. The legacy
# `langchain.document_loaders` path was a deprecated alias that this import
# immediately shadowed, and it may be missing in newer LangChain releases.
from langchain_community.document_loaders import TextLoader

# Read the sample file as UTF-8; load() hands back a list of Document objects.
loader = TextLoader("data/textfiles/stochastic_calc.txt", encoding = "utf-8")
documents = loader.load()
print(documents)
print(type(documents))

[Document(metadata={'source': 'data/textfiles/stochastic_calc.txt'}, page_content='\nStochastic calculus is a branch of mathematics that extends traditional calculus to stochastic processes, \nwhich are systems that evolve randomly over time. It provides a framework for integrating and \ndifferentiating non-smooth random functions and is essential for modeling and analyzing complex systems in \nfields like finance, where it is used for option pricing and risk management, as well as in physics and engineering. \nThe field was pioneered by Japanese mathematician Kiyosi Itô, who developed key concepts like the stochastic integral and \nstochastic differential equations.  \n')]
<class 'list'>


In [10]:
from langchain_community.document_loaders import DirectoryLoader

# Load every .txt file under data/textfiles (subfolders included) with TextLoader.
# The glob is "**/*.txt" rather than "***.txt": a "**" embedded inside a path
# component is rejected with ValueError by pathlib on Python <= 3.12.
dir_loader = DirectoryLoader(
    "data/textfiles",
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},  # forwarded to each TextLoader instance
    show_progress=True,
)
# NOTE: this rebinds `documents`, replacing the list loaded by the single-file loader.
documents = dir_loader.load()

100%|██████████| 1/1 [00:00<00:00, 584.08it/s]


In [11]:
print(documents)
# Report the size of each loaded document. A dedicated loop name is used so the
# `doc` object built earlier in the notebook is not overwritten, and the
# enumerate() index is dropped because it was never used.
for loaded_doc in documents:
    print(f"Length : {len(loaded_doc.page_content)} characters")

[Document(metadata={'source': 'data/textfiles/stochastic_calc.txt'}, page_content='\nStochastic calculus is a branch of mathematics that extends traditional calculus to stochastic processes, \nwhich are systems that evolve randomly over time. It provides a framework for integrating and \ndifferentiating non-smooth random functions and is essential for modeling and analyzing complex systems in \nfields like finance, where it is used for option pricing and risk management, as well as in physics and engineering. \nThe field was pioneered by Japanese mathematician Kiyosi Itô, who developed key concepts like the stochastic integral and \nstochastic differential equations.  \n')]
Length : 589 characters


In [12]:
# Import the splitters from the dedicated `langchain_text_splitters` package,
# the same source used in the setup cell, instead of the legacy
# `langchain.text_splitter` path, which may be missing in newer LangChain releases.
from langchain_text_splitters import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
    TokenTextSplitter
)
print(documents)

[Document(metadata={'source': 'data/textfiles/stochastic_calc.txt'}, page_content='\nStochastic calculus is a branch of mathematics that extends traditional calculus to stochastic processes, \nwhich are systems that evolve randomly over time. It provides a framework for integrating and \ndifferentiating non-smooth random functions and is essential for modeling and analyzing complex systems in \nfields like finance, where it is used for option pricing and risk management, as well as in physics and engineering. \nThe field was pioneered by Japanese mathematician Kiyosi Itô, who developed key concepts like the stochastic integral and \nstochastic differential equations.  \n')]


In [13]:
# Raw text of the first loaded document; this is the input for the splitter demos below.
text = documents[0].page_content

In [14]:
# Character Text Splitter

# Split `text` on newline characters. Sizes are measured with len(), i.e. in
# characters: chunk_size=200 and chunk_overlap=30.
char_splitter = CharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=30,
    separator="\n",
    length_function=len,
)
char_chunk = char_splitter.split_text(text)

In [15]:
# Preview at most the first five chunks, each followed by a divider line.
# Slicing (instead of indexing 0..4) avoids an IndexError when the splitter
# produced fewer than five chunks.
for chunk in char_chunk[:5]:
    print(chunk)
    print("--"*50)
# Total number of chunks produced by the character splitter.
print(len(char_chunk))

Stochastic calculus is a branch of mathematics that extends traditional calculus to stochastic processes,
----------------------------------------------------------------------------------------------------
which are systems that evolve randomly over time. It provides a framework for integrating and
----------------------------------------------------------------------------------------------------
differentiating non-smooth random functions and is essential for modeling and analyzing complex systems in
----------------------------------------------------------------------------------------------------
fields like finance, where it is used for option pricing and risk management, as well as in physics and engineering.
----------------------------------------------------------------------------------------------------
The field was pioneered by Japanese mathematician Kiyosi Itô, who developed key concepts like the stochastic integral and 
stochastic differential equations.
--------------

In [16]:
# Recursive Character Text Splitter

# Separators are tried in order, so they are listed from coarsest to finest.
# "\n \n" must come before "\n": once text has been split on "\n" the pieces
# contain no newline, so a later "\n \n" entry could never match.
recur_splitter = RecursiveCharacterTextSplitter(
    separators=["\n \n", "\n", " ", ""],
    chunk_size=100,
    chunk_overlap=30,
    length_function=len,  # sizes are measured in characters
)
recur_chunks = recur_splitter.split_text(text)

In [17]:
# Preview at most the first ten chunks, each followed by a divider line.
# Slicing (instead of indexing 0..9) avoids an IndexError when the splitter
# produced fewer than ten chunks.
for chunk in recur_chunks[:10]:
    print(chunk)
    print("--"*50)
# Total number of chunks produced by the recursive splitter.
print(len(recur_chunks))

Stochastic calculus is a branch of mathematics that extends traditional calculus to stochastic
----------------------------------------------------------------------------------------------------
calculus to stochastic processes,
----------------------------------------------------------------------------------------------------
which are systems that evolve randomly over time. It provides a framework for integrating and
----------------------------------------------------------------------------------------------------
differentiating non-smooth random functions and is essential for modeling and analyzing complex
----------------------------------------------------------------------------------------------------
and analyzing complex systems in
----------------------------------------------------------------------------------------------------
fields like finance, where it is used for option pricing and risk management, as well as in physics
-------------------------------------------

## Loading PDFs

In [18]:
 from langchain_community.document_loaders import (
  PyPDFLoader,
  PyMuPDFLoader,
  UnstructuredPDFLoader
 )


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Method 1: PyPDFLoader
# load() returns a list of Document objects; the cell reports its length as the page count.
pypdf_loader = PyPDFLoader("data/ExerciseandMood.pdf")
pypdf_docs = pypdf_loader.load()
# NOTE(review): this prints every loaded Document in full, which makes the cell output very long.
print(pypdf_docs)
print(f"\nLoaded {len(pypdf_docs)} pages")
# Index 2 is the third entry; only its first 200 characters are shown.
# NOTE(review): raises IndexError for a PDF that loads fewer than three entries.
print(f"\nPage 3 Content : {pypdf_docs[2].page_content[:200]}...")
# Metadata of the first entry (index 0).
print(f"\nMetadata: {pypdf_docs[0].metadata}")

[Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'Acrobat 5.0 Paper Capture Plug-in for Windows', 'creationdate': '2007-03-25T12:47:18+05:30', 'moddate': '2015-02-09T11:14:54-08:00', 'source': 'data/ExerciseandMood.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1'}, page_content='This article was downloaded by: [Bowling Green SU]\nOn: 09 February 2015, At: 11:14\nPublisher: Routledge\nInforma Ltd Registered in England and Wales Registered Number: 1072954 Registered\noffice: Mortimer House, 37-41 Mortimer Street, London W1T 3JH, UK\nJournal of Applied Sport Psychology\nPublication details, including instructions for authors and\nsubscription information:\nhttp://www.tandfonline.com/loi/uasp20\nExercise and mood: A selective review\nand synthesis of research employing the\nprofile of mood states\nBonnie G. Berger a b & Robert W. Motl a d c\na Bowling Green State University ,\nb University of Georgia ,\nc Bowling Green State University, School of HMSLS ,\nd Departmen

In [20]:
# Method 2 : PyMuPDFLoader (preferred)

pymupdf_loader = PyMuPDFLoader("data/ExerciseandMood.pdf")
pymupdf_docs = pymupdf_loader.load()
# NOTE(review): this prints the loader object itself (its default repr with a memory
# address), not the loaded documents - confirm whether `pymupdf_docs` was intended.
print(pymupdf_loader)

print(f"\nLoaded {len(pymupdf_docs)} pages")
# Index 2 is the third entry; only its first 200 characters are shown.
print(f"\nPage 3 Content : {pymupdf_docs[2].page_content[:200]}...")

# Why is it better?
# NOTE(review): the string below is a bare expression, not a comment. As the last
# expression of the cell it is echoed as the cell's output value.

"""PymuPDF faster, image extraction support"""

<langchain_community.document_loaders.pdf.PyMuPDFLoader object at 0x12ab80050>

Loaded 25 pages

Page 3 Content : 70 
BERGER AND MOTL 
dition, individuals differ in the intensity of their mood fluctuations and 
in their personal predisposition to be primarily positive or negative (Mor- 
n s ,  1989). Moods have p...


'PymuPDF faster, image extraction support'

## Cleaning and parsing: techniques and issues

In [21]:
from typing import List

class SmartPDFProcessor:
    """Load a PDF page by page, normalise its whitespace and split it into chunks.

    Every chunk is a ``Document`` that carries the source page's metadata plus
    bookkeeping fields: a 1-based ``page`` number, ``total_pages``,
    ``chunk_method`` and the ``char_count`` of the cleaned page text.
    """

    def __init__(self, chunk_size = 1000, chunk_overlap = 100, min_page_chars = 50):
        """Store the chunking settings and build the text splitter.

        Args:
            chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: Chunk overlap passed to ``RecursiveCharacterTextSplitter``.
            min_page_chars: Pages whose cleaned text is shorter than this many
                characters are skipped. Defaults to 50.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size,
            chunk_overlap = chunk_overlap,
            # _clean_text leaves single spaces as the only whitespace, so split on
            # those first. "" is the last-resort fallback so that one token longer
            # than chunk_size can still be cut down instead of being emitted whole.
            separators=[" ", ""]
        )

    def ProcessPDF(self, pdf_path:str) ->List[Document]:
        """Load ``pdf_path`` with PyPDFLoader and return its pages as cleaned chunks.

        Args:
            pdf_path: Path of the PDF file to load.

        Returns:
            A flat list of chunk Documents in page order. Pages whose cleaned
            text is shorter than ``min_page_chars`` contribute no chunks.

        Raises:
            Nothing is caught here: any exception raised by the loader or the
            splitter propagates to the caller.
        """
        pages = PyPDFLoader(pdf_path).load()
        total_pages = len(pages)

        processed_chunks = []
        # Pages are numbered from 1 in the chunk metadata.
        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip pages with too little text; cleaned_text is already stripped.
            if len(cleaned_text) < self.min_page_chars:
                continue

            # The page's own metadata comes first so the keys below override it.
            page_metadata = {
                **page.metadata,
                "page": page_num,
                "total_pages": total_pages,
                "chunk_method": "smart_pdf_processor",
                "char_count": len(cleaned_text),
            }
            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[page_metadata],
                )
            )
        return processed_chunks

    def _clean_text(self, text:str) ->str:
        """Collapse every run of whitespace to one space and trim both ends."""
        return " ".join(text.split())

In [22]:
# Build a processor with the constructor defaults (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPDFProcessor()

In [23]:
# Echo the instance. The class defines no __repr__, so the default object repr is shown.
preprocessor

<__main__.SmartPDFProcessor at 0x12bf95160>

In [24]:
# testing the function

try:
    smart_chunks = preprocessor.ProcessPDF("data/ExerciseandMood.pdf")
except Exception as e:
    # Keep the notebook running, but report why processing failed.
    print(f"Processing error!!{e}")
else:
    # Only inspect the result when processing succeeded. Otherwise `smart_chunks`
    # would be undefined, or stale from an earlier run of this cell.
    print(f"Length {len(smart_chunks)} chunks.")

    # Guard against an empty result before indexing the first chunk.
    if smart_chunks:
        print("\nSample:")
        for key, value in smart_chunks[0].metadata.items():
            print(f"  {key}: {value}")

Length 93 chunks.

Sample:
  producer: iText 4.2.0 by 1T3XT
  creator: Acrobat 5.0 Paper Capture Plug-in for Windows
  creationdate: 2007-03-25T12:47:18+05:30
  moddate: 2015-02-09T11:14:54-08:00
  source: data/ExerciseandMood.pdf
  total_pages: 25
  page: 1
  page_label: 1
  chunk_method:: smart_pdf_processor
  char_count: 2357
