In [None]:
! pip install pdfplumber pymupdf ebooklib python-docx



In [None]:
import os
from html.parser import HTMLParser

import ebooklib
import fitz  # PyMuPDF
import pdfplumber
from docx import Document
from ebooklib import epub


def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Pages are joined with a newline so the last word of one page is not fused
    with the first word of the next (e.g. "GuidoIntroduction"). A page whose
    ``extract_text()`` result is falsy contributes an empty string.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The text of all pages as one string, one page per newline-separated
        segment.
    """
    with pdfplumber.open(file_path) as pdf:
        # Build a list and join once instead of repeated string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)


def extract_text_from_pdf_pymupdf(file_path):
    """Extract the text of every page of a PDF using PyMuPDF (``fitz``).

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The concatenation of ``page.get_text()`` for every page, in page order.
    """
    # Open the document as a context manager so the file handle is released
    # even if get_text() raises; the original never closed it.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)


class _EpubTextExtractor(HTMLParser):
    """Collect the visible text of one XHTML document, dropping the markup."""

    # Elements whose character data is code/CSS rather than readable text.
    _SKIPPED_TAGS = frozenset({"script", "style"})
    # Elements that end a line of text; a newline is emitted around them so
    # adjacent paragraphs/headings do not run together.
    _BLOCK_TAGS = frozenset({
        "p", "div", "br", "li", "tr", "section", "blockquote", "title",
        "h1", "h2", "h3", "h4", "h5", "h6",
    })

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._parts = []
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in self._SKIPPED_TAGS:
            self._skip_depth += 1
        elif tag in self._BLOCK_TAGS:
            self._parts.append("\n")

    def handle_endtag(self, tag):
        if tag in self._SKIPPED_TAGS:
            self._skip_depth = max(0, self._skip_depth - 1)
        elif tag in self._BLOCK_TAGS:
            self._parts.append("\n")

    def handle_data(self, data):
        if not self._skip_depth:
            self._parts.append(data)

    def text(self):
        """Return everything collected so far as a single string."""
        return "".join(self._parts)


def extract_text_from_epub(file_path):
    """Extract the readable text of every document item in an EPUB.

    Each item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8
    (undecodable bytes are ignored) and its XHTML markup is stripped, so the
    result is plain text rather than raw tags. Items of other types (images,
    stylesheets, ...) are skipped.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        The text of all document items, joined with newlines.
    """
    book = epub.read_epub(file_path)
    documents = []
    for item in book.get_items():
        # The ITEM_* constants are defined on the top-level ``ebooklib``
        # package; ``ebooklib.epub`` has no ITEM_DOCUMENT attribute.
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        markup = item.get_content().decode("utf-8", errors="ignore")
        extractor = _EpubTextExtractor()
        extractor.feed(markup)
        extractor.close()
        documents.append(extractor.text())
    return "\n".join(documents)


def extract_text_from_txt(file_path):
    """Read a plain-text file as UTF-8 and return its entire contents.

    Bytes that are not valid UTF-8 are silently dropped (``errors="ignore"``).
    """
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as source:
        contents = source.read()
    return contents


def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file, one per line.

    Args:
        file_path: Path of the Word document to open.

    Returns:
        The paragraph texts joined with ``"\\n"``.
    """
    # Imported locally under an alias: a later cell of this notebook runs
    # ``from langchain_core.documents import Document``, which rebinds the
    # global name ``Document`` and would make this function call the wrong
    # class on any subsequent use.
    from docx import Document as DocxDocument

    doc = DocxDocument(file_path)
    return "\n".join(para.text for para in doc.paragraphs)


def load_book(file_path):
    """Extract a book's text using the extractor that matches its extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.pdf`` (pdfplumber-based extractor), ``.epub``, ``.txt`` and ``.docx``.

    Args:
        file_path: Path of the book file.

    Returns:
        Whatever the selected extractor returns for ``file_path``.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    extractors = {
        # extract_text_from_pdf_pymupdf is a drop-in alternative for PDFs.
        ".pdf": extract_text_from_pdf,
        ".epub": extract_text_from_epub,
        ".txt": extract_text_from_txt,
        ".docx": extract_text_from_docx,
    }
    _, suffix = os.path.splitext(file_path)
    extractor = extractors.get(suffix.lower())
    if extractor is None:
        raise ValueError("Unsupported file format")
    return extractor(file_path)


In [None]:
import pdfplumber

file_path = "/content/Introduction to Machine Learning with Python.pdf"

# Load and extract text with the shared extractor instead of duplicating its
# loop here. The inline copy did `raw_text += page.extract_text()` without the
# `or ""` guard the extractor has, so a page yielding None raised TypeError.
raw_text = extract_text_from_pdf(file_path)

# Preview only the first 2000 characters to keep the cell output small.
print(raw_text[:2000])

Introduction to
Machine
Learning
with Python
A GUIDE FOR DATA SCIENTISTS
Andreas C. Müller & Sarah GuidoIntroduction to Machine Learning
with Python
A Guide for Data Scientists
Andreas C. Müller and Sarah Guido
BBeeiijjiinngg BBoossttoonn FFaarrnnhhaamm SSeebbaassttooppooll TTookkyyooIntroduction to Machine Learning with Python
by Andreas C. Müller and Sarah Guido
Copyright © 2017 Sarah Guido, Andreas Müller. All rights reserved.
Printed in the United States of America.
Published by O’Reilly Media, Inc., 1005 Gravenstein Highway North, Sebastopol, CA 95472.
O’Reilly books may be purchased for educational, business, or sales promotional use. Online editions are
also available for most titles (http://safaribooksonline.com). For more information, contact our corporate/
institutional sales department: 800-998-9938 or corporate@oreilly.com.
Editor: Dawn Schanafelt Indexer: Judy McConville
Production Editor: Kristen Brown Interior Designer: David Futato
Copyeditor: Rachel Head Cover Designer

**text_preprocessor.py**

In [None]:
import re
from typing import List, Dict


def normalize_whitespace(text: str) -> str:
    """Collapse each run of whitespace (spaces, tabs, newlines) to one space.

    Leading and trailing whitespace is removed from the result.
    """
    return re.sub(r'\s+', ' ', text).strip()


def remove_page_numbers(text: str) -> str:
    """Remove lines that consist solely of a number (standalone page numbers).

    Only whole lines are removed: the match is anchored to the start of a
    line, so a number at the end of a sentence ("Published in 2017") is left
    untouched. Spaces and tabs around the number are allowed.

    Args:
        text: Text that still has its line breaks.

    Returns:
        ``text`` with every digits-only line (including its newline) removed.
    """
    return re.sub(r'^[ \t]*\d+[ \t]*(?:\n|\Z)', '', text, flags=re.MULTILINE)


# A line that is nothing but a page indicator, e.g. "Page 12" or "page7".
_PAGE_INDICATOR_RE = re.compile(r'page\s*\d+', re.IGNORECASE)


def remove_headers_and_footers(text: str, min_line_length: int = 5) -> str:
    """Drop very short lines and page-indicator lines.

    A line is removed when, after stripping surrounding whitespace, it is
    either shorter than ``min_line_length`` characters (this includes blank
    lines) or consists entirely of a page indicator such as "Page 12".
    Lines that merely begin with such words ("Page 3 of the appendix ...")
    are kept. No detection of repeated lines is performed.

    Args:
        text: Text that still has its line breaks.
        min_line_length: Minimum stripped length a line needs to be kept.

    Returns:
        The surviving lines, unmodified, joined with ``"\\n"``.
    """
    kept_lines = []
    for line in text.splitlines():
        stripped = line.strip()
        if len(stripped) < min_line_length:
            continue
        # fullmatch on the stripped line: indentation no longer hides an
        # indicator, and body text starting with "Page N ..." is not deleted.
        if _PAGE_INDICATOR_RE.fullmatch(stripped):
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines)


# A heading line made up only of "References", "Bibliography" or
# "Works Cited" (any case), optionally followed by a colon.
_REFERENCE_HEADING_RE = re.compile(
    r'\n[ \t]*(?:references|bibliography|works cited)[ \t]*:?[ \t]*(?=\n|\Z)',
    re.IGNORECASE,
)


def remove_references(text: str) -> str:
    """Cut the text at the first references/bibliography heading.

    Everything from the first line that is exactly "References",
    "Bibliography" or "Works Cited" (case-insensitive, optional trailing
    colon) to the end of the text is removed. A line that only starts with
    one of these words ("References to NumPy appear ...") is ordinary prose
    and does not trigger the cut.

    Args:
        text: Text that still has its line breaks.

    Returns:
        The text before the first such heading, or ``text`` unchanged when
        there is none.
    """
    heading = _REFERENCE_HEADING_RE.search(text)
    if heading is None:
        return text
    return text[:heading.start()]


def clean_text(raw_text: str) -> str:
    """Run the complete cleaning pipeline over extracted book text.

    The stages run in a fixed order, each receiving the previous stage's
    output: header/footer removal, page-number removal, reference-section
    removal, then whitespace normalization.
    """
    stages = (
        remove_headers_and_footers,
        remove_page_numbers,
        remove_references,
        normalize_whitespace,
    )
    cleaned = raw_text
    for stage in stages:
        cleaned = stage(cleaned)
    return cleaned


**chapter_detector.py**

In [None]:
import re
from typing import Dict


# Candidate regexes for chapter headings.
# NOTE(review): nothing in the visible notebook references this list;
# split_by_chapter builds its own pattern instead.
CHAPTER_PATTERNS = [
    r'chapter\s+\d+',  # "chapter" followed by an Arabic number
    r'chapter\s+[ivxlcdm]+',  # "chapter" followed by lowercase Roman-numeral letters
    r'\b\d+\.\s+[A-Z]',  # numbered heading such as "3. Title"
    r'\b[A-Z][A-Z\s]{5,}'  # a capital followed by 5+ capitals/whitespace characters
]

# Roman numerals I..XX, matched as whole words.
_ROMAN_NUMERAL = r'\b(?:I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX)\b'

# "chapter" as a whole word (so "subchapter 2" is not a hit) followed by an
# Arabic number or a Roman numeral; group 1 captures the numeral.
_CHAPTER_LABEL_RE = re.compile(
    rf'\bchapter\s+(\d+|{_ROMAN_NUMERAL})',
    re.IGNORECASE,
)


def split_by_chapter(text: str) -> Dict[str, str]:
    """Split text into segments keyed by the chapter label that starts them.

    Every occurrence of "chapter <number>" or "chapter <Roman numeral I-XX>"
    (case-insensitive) starts a new segment that runs up to the next
    occurrence. Keys are normalized to ``"Chapter <NUMERAL>"`` with Roman
    numerals upper-cased ("Chapter IV"). When the same label occurs more than
    once (for example a heading plus later in-text mentions), the segments
    are appended to each other, separated by a blank line, instead of the
    later one overwriting the earlier one. Text before the first label is
    not included in any entry.

    Args:
        text: The text to split.

    Returns:
        A dict mapping chapter label to its text, in order of first
        appearance; ``{"Full Text": text}`` when no label is found.
    """
    matches = list(_CHAPTER_LABEL_RE.finditer(text))
    if not matches:
        return {"Full Text": text}

    segments_by_title = {}
    for index, match in enumerate(matches):
        start = match.start()
        # A segment ends where the next label begins, or at the end of text.
        end = matches[index + 1].start() if index + 1 < len(matches) else len(text)
        # Build the key from the captured numeral so whitespace variations and
        # letter case in the source do not produce distinct or mangled keys.
        title = f"Chapter {match.group(1).upper()}"
        segments_by_title.setdefault(title, []).append(text[start:end].strip())

    chapters = {}
    for title, segments in segments_by_title.items():
        chapters[title] = "\n\n".join(segments)
    return chapters

In [None]:
# Run the pipeline on the file selected earlier: extract -> clean -> split.
raw_text = load_book(file_path)
cleaned_text = clean_text(raw_text)
chapters = split_by_chapter(cleaned_text)

# Report how many chapter keys were produced and the size of each entry.
print(f"Detected chapters: {len(chapters)}")
for title, content in chapters.items():
    print(title, "->", len(content), "characters")

Detected chapters: 8
Chapter 1 -> 2265 characters
Chapter 4 -> 1680 characters
Chapter 5 -> 2407 characters
Chapter 6 -> 689 characters
Chapter 7 -> 1912 characters
Chapter 8 -> 22723 characters
Chapter 2 -> 867 characters
Chapter 3 -> 2754 characters


TODO: Check the detected chapters against the book's actual chapter titles — we are currently getting only labels such as "Chapter 1", without the title names.

In [None]:
!pip install langchain tiktoken langchain-text-splitters



**chunker_langchain.py**

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import tiktoken


def chunk_text_langchain(
    text: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    model_name: str = "gpt-4"
):
    """Split text into token-bounded chunks with LangChain's recursive splitter.

    Chunk length is measured in tokens of the tokenizer tiktoken associates
    with ``model_name``. Model names tiktoken does not know (for example
    models served by other providers) fall back to the ``cl100k_base``
    encoding instead of raising ``KeyError``.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length, in tokens.
        chunk_overlap: Number of tokens shared between neighbouring chunks.
        model_name: Model whose tokenizer is used for measuring length.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    try:
        tokenizer = tiktoken.encoding_for_model(model_name)
    except KeyError:
        tokenizer = tiktoken.get_encoding("cl100k_base")

    def token_length(piece: str) -> int:
        # disallowed_special=() makes text that happens to look like a special
        # token (e.g. "<|endoftext|>") count as ordinary text instead of
        # raising ValueError.
        return len(tokenizer.encode(piece, disallowed_special=()))

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=token_length,
        # Tried in order: paragraph, line, sentence, word, then character.
        separators=["\n\n", "\n", ".", " ", ""]
    )

    return splitter.split_text(text)

In [None]:
# Note: Triton installation can be complex and may require specific CUDA versions and environment setups.
# The following command is a starting point, but further troubleshooting might be needed depending on your system.
!pip install triton



In [None]:

# Chunk every detected chapter; the keys mirror those of `chapters`.
chunked_chapters = {}

for chapter_title, chapter_text in chapters.items():
    chunks = chunk_text_langchain(chapter_text)
    chunked_chapters[chapter_title] = chunks

# NOTE(review): assumes a "Chapter 1" key exists. split_by_chapter returns only
# a "Full Text" key when no chapter label is found, which would raise KeyError.
print(len(chunked_chapters["Chapter 1"]))

1


Interpretation:
< 1000 tokens → ✅ 1 chunk (correct)
> 1000 tokens → ❌ Should split → investigate

In [None]:
import tiktoken

# Count the tokens of the "Chapter 1" entry with the gpt-4 tokenizer, to check
# the single-chunk result against chunk_text_langchain's default chunk_size of
# 1000 tokens.
tokenizer = tiktoken.encoding_for_model("gpt-4")
token_count = len(tokenizer.encode(chapters["Chapter 1"]))

print("Token count:", token_count)

Token count: 590


Summarization Agent

# summarization_agent.py

In [None]:
! pip install langchain openai



In [None]:
import os
from getpass import getpass

# Never store API keys in the notebook: anyone who can read the file (or its
# version history) can use them. The key that used to be hard-coded here must
# be treated as compromised and revoked.
# Use a key already present in the environment (e.g. injected from Colab
# secrets); otherwise prompt for it without echoing it to the output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
!pip install langchain langchain-openai langchain-core openai



In [None]:
# Provide the OpenRouter API key through the OPENROUTER_API_KEY environment
# variable (e.g. injected from Colab secrets). If it is not set, prompt for it
# without echoing it. Never paste the key into the notebook itself; the key
# that used to be hard-coded here must be treated as compromised and revoked.
import os
from getpass import getpass

if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass("OpenRouter API key: ")

# Module-level alias for cells that pass the key explicitly.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]

In [None]:
from openai import OpenAI

# Point the OpenAI client at OpenRouter's base URL.
# `os` is not imported in this cell; it comes from an earlier cell.
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=os.environ.get("OPENROUTER_API_KEY"), # .get() yields None when the variable is unset
)

# Smoke test: send a single user message and print the reply.
completion = client.chat.completions.create(
  model="tngtech/deepseek-r1t2-chimera:free", # same model id the summarization agents below default to
  messages=[
    {
      "role": "user",
      "content": "What is the meaning of life?",
    },
  ],
)

# Text of the first returned choice.
print(completion.choices[0].message.content)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit exceeded: free-models-per-day. Add 10 credits to unlock 1000 free model requests per day', 'code': 429, 'metadata': {'headers': {'X-RateLimit-Limit': '50', 'X-RateLimit-Remaining': '0', 'X-RateLimit-Reset': '1770249600000'}, 'provider_name': None}}, 'user_id': 'user_38yXbtpb817LkWFrDul8PhcJH7u'}

This Python code now performs the same API call to OpenRouter as your original JavaScript snippet. If you want to integrate this with your existing LangChain summarization agent, you can modify the `create_summarization_agent` function to use `base_url` and `api_key` parameters when initializing `ChatOpenAI`.

In [None]:
import os

# Read the key from the environment instead of hard-coding it; indexing
# os.environ raises KeyError right here if the variable has not been set.
OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
# LLMChain has been deprecated, using LCEL instead

# Prompt for summarizing a single chunk; `{text}` is the only placeholder.
SUMMARY_PROMPT = """
You are an expert book summarizer.

Summarize the following text clearly and concisely.
Focus on:
- Key ideas
- Main arguments
- Important outcomes

Text:
{text}

Concise Summary:
"""


def create_summarization_agent(
    model_name="tngtech/deepseek-r1t2-chimera:free",
    temperature: float = 0.3,
    api_key=OPENROUTER_API_KEY,
    api_base=OPENROUTER_BASE_URL
):
    """Build the chunk-summarization chain (prompt piped into a chat model).

    Args:
        model_name: Model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.
        api_key: API key for the endpoint. The default is the value
            ``OPENROUTER_API_KEY`` had when this ``def`` was executed.
        api_base: Base URL of the endpoint. The default is the value
            ``OPENROUTER_BASE_URL`` had when this ``def`` was executed.

    Returns:
        The composed ``prompt | llm`` chain; invoke it with ``{"text": ...}``.
    """
    summary_prompt = PromptTemplate(
        template=SUMMARY_PROMPT,
        input_variables=["text"],
    )
    chat_model = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )
    # LCEL composition: the rendered prompt is fed to the chat model.
    return summary_prompt | chat_model


def summarize_chunks(chunks: list, summarization_chain) -> list:
    """Summarize each chunk with the given chain, preserving order.

    The chain is invoked once per chunk with ``{"text": chunk}``; the
    ``content`` of each response is stripped of surrounding whitespace.

    Returns:
        One summary string per input chunk.
    """
    return [
        summarization_chain.invoke({"text": chunk}).content.strip()
        for chunk in chunks
    ]

In [None]:

# Build the chunk-level chain once and reuse it for every chapter.
summarization_chain = create_summarization_agent()

# chapter title -> list of chunk summaries, in chunk order.
chapter_summaries = {}

# One chain invocation per chunk (see summarize_chunks).
for chapter_title, chunks in chunked_chapters.items():
    summaries = summarize_chunks(chunks, summarization_chain)
    chapter_summaries[chapter_title] = summaries

# NOTE(review): assumes a "Chapter 1" entry with at least one chunk exists.
print(chapter_summaries["Chapter 1"][0])


In [None]:
# Re-display the first chunk summary of "Chapter 1" (same expression as the
# last line of the previous cell).
print(chapter_summaries["Chapter 1"][0])

In [None]:
# Prompt for merging chunk summaries into one chapter summary; `{summaries}`
# is the only placeholder.
CHAPTER_SUMMARY_PROMPT = """
You are an expert book editor.

Combine the following partial summaries into a single, coherent chapter summary.
- Avoid repetition
- Maintain logical flow
- Preserve all important ideas

Partial Summaries:
{summaries}

Final Chapter Summary:
"""

def create_chapter_summary_agent(
    model_name="tngtech/deepseek-r1t2-chimera:free",
    temperature=0.3
):
    """Build the chapter-summary chain (prompt piped into a chat model).

    The chat model is configured with the module-level ``OPENROUTER_API_KEY``
    and ``OPENROUTER_BASE_URL``, read when this function is called.

    Args:
        model_name: Model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The composed ``prompt | llm`` chain; invoke it with
        ``{"summaries": ...}``.
    """
    chapter_prompt = PromptTemplate(
        template=CHAPTER_SUMMARY_PROMPT,
        input_variables=["summaries"],
    )
    chat_model = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        openai_api_base=OPENROUTER_BASE_URL,
        openai_api_key=OPENROUTER_API_KEY,
    )
    return chapter_prompt | chat_model

def summarize_chapter(chunk_summaries, chapter_chain):
    """Merge a chapter's chunk summaries into one summary via the chain.

    The summaries are formatted as a "- " bullet list, one per line, and
    passed to the chain as ``{"summaries": ...}``.

    Returns:
        The ``content`` of the chain's response, stripped of surrounding
        whitespace.
    """
    bullets = [f"- {summary}" for summary in chunk_summaries]
    reply = chapter_chain.invoke({"summaries": "\n".join(bullets)})
    return reply.content.strip()

# Build the chapter-level chain once and reuse it for every chapter.
chapter_summary_chain = create_chapter_summary_agent()

# chapter title -> single merged summary string.
final_chapter_summaries = {}

# One chain invocation per chapter (see summarize_chapter).
for chapter_title, chunk_summaries in chapter_summaries.items():
    final_summary = summarize_chapter(
        chunk_summaries,
        chapter_summary_chain
    )
    final_chapter_summaries[chapter_title] = final_summary


# NOTE(review): assumes a "Chapter 1" entry exists.
print(final_chapter_summaries["Chapter 1"])


In [None]:
# Prompt for the whole-book summary; `{chapters}` is the only placeholder.
BOOK_SUMMARY_PROMPT = """
You are an expert literary analyst and book summarizer.

Using the following chapter summaries, generate:

1️⃣ A SHORT SUMMARY (about 1 page)
2️⃣ A MEDIUM SUMMARY (5–6 well-structured paragraphs)
3️⃣ KEY HIGHLIGHTS (bullet points)

Guidelines:
- Preserve the core themes and progression of ideas
- Avoid repetition
- Ensure clarity and coherence
- Do NOT invent new content

Chapter Summaries:
{chapters}

Return the output in the following format:

SHORT SUMMARY:
<text>

MEDIUM SUMMARY:
<text>

KEY HIGHLIGHTS:
- bullet 1
- bullet 2
- bullet 3
"""

def create_book_summary_agent(
    model_name="tngtech/deepseek-r1t2-chimera:free",
    temperature=0.3
):
    """Build the book-summary chain (prompt piped into a chat model).

    The chat model is configured with the module-level ``OPENROUTER_API_KEY``
    and ``OPENROUTER_BASE_URL``, read when this function is called.

    Args:
        model_name: Model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The composed ``prompt | llm`` chain; invoke it with
        ``{"chapters": ...}``.
    """
    book_prompt = PromptTemplate(
        template=BOOK_SUMMARY_PROMPT,
        input_variables=["chapters"],
    )
    chat_model = ChatOpenAI(
        model=model_name,
        temperature=temperature,
        openai_api_base=OPENROUTER_BASE_URL,
        openai_api_key=OPENROUTER_API_KEY,
    )
    return book_prompt | chat_model
def summarize_book(final_chapter_summaries, book_chain):
    """Produce the whole-book summary from the per-chapter summaries.

    Each chapter is rendered as its title, a colon and a newline, then its
    summary; chapters are separated by a blank line and passed to the chain
    as ``{"chapters": ...}``.

    Returns:
        The ``content`` of the chain's response, stripped of surrounding
        whitespace.
    """
    sections = []
    for title, summary in final_chapter_summaries.items():
        sections.append(f"{title}:\n{summary}")
    reply = book_chain.invoke({"chapters": "\n\n".join(sections)})
    return reply.content.strip()
# Build the book-level chain and produce the final summary from the merged
# chapter summaries.
book_summary_chain = create_book_summary_agent()

final_book_summary = summarize_book(
    final_chapter_summaries,
    book_summary_chain
)

print(final_book_summary)


STEP 8 :

In [None]:
# Importable module names to probe before the vector-store step.
# NOTE(review): the next cell also imports `langchain_community`, which is not
# in this list.
packages_to_check = ['chromadb', 'langchain', 'langchain_openai', 'tiktoken']
not_installed = []

for pkg in packages_to_check:
    try:
        # __import__ really imports the module; ImportError marks it missing.
        __import__(pkg)
        print(f"{pkg} is already installed.")
    except ImportError:
        not_installed.append(pkg)
        print(f"{pkg} is NOT installed.")

# Print an install hint that lists only the modules that failed to import.
if not_installed:
    print(f"\nTo install the missing packages, run: !pip install {' '.join(not_installed)}")
else:
    print("\nAll specified packages are already installed.")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# Embedding model used for the vector store.
# - The model must be an embedding model; the chat model id previously given
#   here cannot produce embeddings.
# - No key is passed explicitly: the client reads OPENAI_API_KEY from the
#   environment (set in an earlier cell). The OpenRouter key that used to be
#   hard-coded here was being sent to the default OpenAI endpoint, and must be
#   treated as compromised and revoked.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",  # cheap & good
)

def chapter_summaries_to_documents(chapter_summaries):
    """Wrap each chapter summary in a ``Document`` tagged with its chapter.

    Args:
        chapter_summaries: Mapping of chapter title to summary text.

    Returns:
        A list with one ``Document`` per entry, in mapping order, whose
        ``page_content`` is the summary and whose metadata is
        ``{"chapter": <title>}``.
    """
    return [
        Document(page_content=summary, metadata={"chapter": chapter_title})
        for chapter_title, summary in chapter_summaries.items()
    ]

def store_in_vector_db(documents, persist_dir="book_vector_db", embedding=None):
    """Embed the documents and store them in a Chroma collection on disk.

    Args:
        documents: Documents to embed and store.
        persist_dir: Directory Chroma uses for its on-disk data.
        embedding: Embedding function to use. Defaults to the module-level
            ``embeddings`` object, which was previously the only option.

    Returns:
        The Chroma vector store built from ``documents``.
    """
    if embedding is None:
        embedding = embeddings

    vector_db = Chroma.from_documents(
        documents=documents,
        embedding=embedding,
        persist_directory=persist_dir
    )

    # Call persist() only when the store exposes it: it is a legacy explicit
    # flush, and Chroma wrappers that write automatically may not provide it.
    persist = getattr(vector_db, "persist", None)
    if callable(persist):
        persist()
    return vector_db

# Wrap each merged chapter summary in a Document tagged with its chapter title.
documents = chapter_summaries_to_documents(final_chapter_summaries)

# Embed the documents and store them in the Chroma collection.
vector_db = store_in_vector_db(documents)

print("✅ Chapter summaries stored in vector database")

# Example retrieval: fetch the 3 stored summaries most similar to the query.
query = "What is the main theme of the book?"

results = vector_db.similarity_search(query, k=3)

# Show each hit's chapter tag and summary text, separated by a rule.
for res in results:
    print("Chapter:", res.metadata["chapter"])
    print(res.page_content)
    print("-" * 50)