In [2]:
# Versions pinned to the ones pip resolved in the recorded output below, so a
# fresh kernel installs the same packages. %pip targets the running kernel's
# environment.
%pip install pymupdf==1.25.3 langchain==0.3.19

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Collecting langchain
  Using cached langchain-0.3.19-py3-none-any.whl.metadata (7.9 kB)
Collecting langchain-core<1.0.0,>=0.3.35 (from langchain)
  Downloading langchain_core-0.3.40-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.6 (from langchain)
  Using cached langchain_text_splitters-0.3.6-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Using cached langsmith-0.3.11-py3-none-any.whl.metadata (14 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Using cached SQLAlchemy-2.0.38-cp39-cp39-win_amd64.whl.metadata (9.9 kB)
Collecting PyYAML>=5.3 (from langchain)
  Using cached PyYAML-6.0.2-cp39-cp39-win_amd64.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Using cached aiohttp-3.11.13

In [3]:
# Imports (the packages themselves are installed by the pip cell above)
import os
import fitz  # PyMuPDF for PDF text extraction
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define file paths
PDF_PATH = "./Indian_Accident_Laws.pdf"  # Path to your PDF file
# Directory to save processed text. Set the LEXBOT_OUTPUT_DIR environment
# variable to redirect output; when it is unset, the original machine-specific
# location is used, so existing setups behave exactly as before.
OUTPUT_DIR = os.environ.get("LEXBOT_OUTPUT_DIR", "D:/LEXBOT/Preprocessed Data")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "Indian_Accident_Laws.txt")  # Processed text file

# Ensure the output directory exists (no-op when it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages in page order, separated by newlines, with
        leading and trailing whitespace stripped. An empty string if no page
        yields any text.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raises (for example on a
        missing or unreadable file) is propagated unchanged.
    """
    # The context manager closes the document when the block exits, even if a
    # page fails to extract; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    # A single join replaces repeated string concatenation. The final strip()
    # makes the result identical to appending "\n" after every page.
    return "\n".join(page_texts).strip()

def clean_text(text):
    """Normalise extracted text into whitespace-collapsed ASCII.

    The substitutions below run in order; leading and trailing whitespace is
    removed from the final result.
    """
    substitutions = (
        (r'\n{2,}', '\n'),         # runs of two or more newlines -> one newline
        (r'[^\x00-\x7F]+', ' '),   # each run of non-ASCII characters -> one space
        (r'\s+', ' '),             # any run of whitespace -> one space
    )

    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip()

# Steps 1-2: pull the raw text out of the PDF, then normalise it
raw_text = extract_text_from_pdf(PDF_PATH)
cleaned_text = clean_text(raw_text)

# Step 3: persist the complete cleaned document as a single file
with open(OUTPUT_FILE, "w", encoding="utf-8") as cleaned_out:
    cleaned_out.write(cleaned_text)

print(f"✅ Preprocessed text saved at: {OUTPUT_FILE}")

# Step 4: break the cleaned text into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=512)
chunks = text_splitter.split_text(cleaned_text)

# One file per chunk, numbered from 1 (chunk_1.txt, chunk_2.txt, ...)
for idx, piece in enumerate(chunks):
    chunk_path = os.path.join(OUTPUT_DIR, f"chunk_{idx+1}.txt")
    with open(chunk_path, "w", encoding="utf-8") as chunk_out:
        chunk_out.write(piece)

print(f"✅ Text successfully split into {len(chunks)} chunks for embedding!")


✅ Preprocessed text saved at: D:/LEXBOT/Preprocessed Data\Indian_Accident_Laws.txt
✅ Text successfully split into 4 chunks for embedding!
