In [None]:
!pip install pytesseract pdf2image pillow langchain_community fastembed chromadb
!sudo apt update
!sudo apt install tesseract-ocr
!sudo apt install poppler-utils

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting fastembed
  Downloading fastembed-0.7.1-py3-none-any.whl.metadata (10 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed)
  Downloading loguru-0.7.3-py3-none-any.whl.meta

In [None]:
import os
import shutil
import hashlib
from langchain.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains.history_aware_retriever import create_history_aware_retriever
from langchain.prompts import PromptTemplate

In [None]:
# --- Config ---
DATA_DIR = "data"
PDF_DIR = os.path.join(DATA_DIR, "pdf")  # location for PDF files
DB_DIR = os.path.join(DATA_DIR, "db")    # persist directory handed to Chroma

# Create both working directories up front; exist_ok makes re-running the
# cell harmless when they are already there.
for _required_dir in (PDF_DIR, DB_DIR):
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
#!pip install -U huggingface_hub
#!huggingface-cli login  # You’ll be prompted for your HF token (needed for gated models like MedGemma)

from transformers import AutoTokenizer, AutoModelForCausalLM

# Hub identifier of the checkpoint to fetch. Alternatives to try:
# "mistralai/Mistral-7B-Instruct-v0.1" or "google/medgemma-7b" (if supported).
model_name = "Qwen/Qwen3-0.6B"

# Directory that receives the local copy of tokenizer and weights.
local_dir = "./my_local_model"

# Fetch tokenizer and model by name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Write both artefacts into the same directory (tokenizer first, then model).
for _artifact in (tokenizer, model):
    _artifact.save_pretrained(local_dir)

print(f"Model and tokenizer saved to: {local_dir}")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model and tokenizer saved to: ./my_local_model


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_path = "./my_local_model"

# Rebuild tokenizer and model from the on-disk copy instead of the hub name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForCausalLM.from_pretrained(model_path),
)

# Text-generation pipeline around the locally loaded model; it is wrapped
# as the LangChain LLM in the components cell.
generator = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

Device set to use cpu


In [None]:
# --- Components ---
# Embedding function shared by the vector store and PDF ingestion.
embedding = FastEmbedEmbeddings()

# LangChain LLM wrapper around the transformers `generator` pipeline.
llm = HuggingFacePipeline(pipeline=generator)

# Splitter applied to cleaned PDF text before it is indexed.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=2048,
)

# Conversation log; ask_question() appends a Human/AI message pair per call.
chat_history = list()

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model_optimized.onnx:   0%|          | 0.00/66.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

  llm = HuggingFacePipeline(pipeline=generator)


In [None]:
# --- Utilities ---
def compute_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte blocks, so
    the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # empty bytes object signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

def preprocess_text(text):
    """Flatten ``text`` onto a single line.

    Leading/trailing whitespace is stripped, every newline becomes a single
    space, and every carriage return is removed.
    """
    trimmed = text.strip()
    # One pass over the string: "\n" -> " ", "\r" -> dropped.
    return trimmed.translate({ord("\n"): " ", ord("\r"): None})

class Document:
    """Minimal text container with ``page_content`` and ``metadata`` attributes.

    In this notebook instances are built from cleaned PDF text and handed to
    ``text_splitter.split_documents``.
    """

    def __init__(self, page_content, metadata=None):
        # Falsy metadata (None or an empty mapping) is replaced by a fresh
        # dict, so instances never share a default.
        self.metadata = metadata if metadata else {}
        self.page_content = page_content

In [None]:
# --- Vector Store Initialization ---
def initialize_vector_store():
    """Build a Chroma store bound to ``DB_DIR`` that embeds with ``embedding``."""
    store = Chroma(embedding_function=embedding, persist_directory=DB_DIR)
    return store


# Module-level store; ask_question() builds its retriever from this object.
vector_store = initialize_vector_store()

  return Chroma(persist_directory=DB_DIR, embedding_function=embedding)


In [None]:
# --- PDF Processing ---
def load_pdf_to_vector_store(file_path):
    """Extract the text of a PDF, chunk it, and add the chunks to ``vector_store``.

    Text is first extracted with ``PDFPlumberLoader``. If that raises, or if
    it yields no text at all (e.g. a scanned PDF), the file is OCR'd once
    with ``perform_ocr`` instead.

    Args:
        file_path: Path of the PDF to ingest.

    Returns:
        int: Number of chunks added to the vector store (0 when no text
        could be extracted).
    """
    source = os.path.basename(file_path)

    try:
        # load(), not load_and_split(): chunking happens exactly once, below,
        # with the notebook's own text_splitter settings.
        docs = PDFPlumberLoader(file_path).load()
    except Exception as exc:
        # Surface the reason (a missing pdfplumber install ends up here too)
        # instead of silently switching to OCR. KeyboardInterrupt is no
        # longer swallowed because only Exception is caught.
        print(f"PDF text extraction failed: {exc!r}")
        docs = []

    if not any((doc.page_content or "").strip() for doc in docs):
        print("PDF is unstructured. Performing OCR...")
        # OCR is expensive: run it once and reuse the result.
        ocr_text = perform_ocr(file_path)
        docs = [Document(page_content=ocr_text, metadata={"source": source})]

    cleaned_docs = [
        Document(page_content=preprocess_text(doc.page_content), metadata={"source": source})
        for doc in docs
    ]

    # split_documents carries each document's metadata over to its chunks,
    # so the {"source": ...} tag does not need to be re-applied afterwards.
    chunks = text_splitter.split_documents(cleaned_docs)
    if not chunks:
        print("No text could be extracted; nothing was added to the vector store.")
        return 0

    # Add to the shared store that ask_question() queries, rather than
    # opening a second, throwaway Chroma instance on the same directory.
    vector_store.add_documents(chunks)
    return len(chunks)

In [None]:
# --- Retrieval Chain ---
def ask_question(query, prompt_template, k=20, score_threshold=0.1):
    """Answer ``query`` from the indexed documents, using the chat history.

    Builds a similarity-threshold retriever over ``vector_store``, wraps it
    in a history-aware retriever, and runs a stuff-documents chain with
    ``prompt_template`` over the retrieved documents. The question and the
    answer are appended to the module-level ``chat_history``.

    Args:
        query: The user's question.
        prompt_template: Prompt for the answering step; the templates in this
            notebook use ``{input}`` and ``{context}`` placeholders.
        k: Maximum number of chunks to retrieve (default 20, as before).
        score_threshold: Minimum relevance score for a chunk to be returned
            (default 0.1, as before).

    Returns:
        tuple: ``(answer, context)`` where ``context`` is the list of
        retrieved documents (empty list if the chain returned none).
    """
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": k, "score_threshold": score_threshold},
    )

    retriever_prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{input}"),
        ("human", "Given the above conversation, generate a search query...")
    ])

    history_aware_retriever = create_history_aware_retriever(llm, retriever, retriever_prompt)
    document_chain = create_stuff_documents_chain(llm, prompt_template)
    retrieval_chain = create_retrieval_chain(history_aware_retriever, document_chain)

    # The history must be part of the chain input: without a "chat_history"
    # key the history-aware retriever just forwards the raw query and the
    # messages collected below are never used. A copy is passed so the
    # appends after the call cannot alter the input mid-run.
    result = retrieval_chain.invoke({"input": query, "chat_history": list(chat_history)})
    answer = result["answer"]

    chat_history.append(HumanMessage(content=query))
    chat_history.append(AIMessage(content=answer))
    return answer, result.get("context", [])

In [None]:
# --- OCR Placeholder ---
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def perform_ocr(pdf_path):
    """
    Convert each page of a PDF to an image, OCR it, and return the joined text.

    Each page's text is stripped and preceded by a ``--- Page N ---`` marker.
    Progress is printed per page. If anything raises, the error is printed
    and the fixed string "OCR failed to extract text." is returned instead.
    """
    try:
        page_images = convert_from_path(pdf_path)
        total_pages = len(page_images)

        sections = []
        for page_number, page_image in enumerate(page_images, start=1):
            print(f"OCR processing page {page_number}/{total_pages}")
            recognised = pytesseract.image_to_string(page_image).strip()
            sections.append(f"\n--- Page {page_number} ---\n{recognised}\n")

        return "".join(sections)
    except Exception as e:
        print(f"OCR failed: {e}")
        return "OCR failed to extract text."

In [None]:
from langchain.prompts import PromptTemplate

# Define all prompts
# Prompt registry: maps a response-style name to a PromptTemplate.
# Every template declares two placeholders, {input} and {context};
# ask_question() hands the chosen template to create_stuff_documents_chain.
# NOTE(review): each template wraps its instructions in `<s>[INST] ... [/INST]</s>`
# markers and places {input}/{context} after the closing marker. That markup
# looks like a Mistral-style instruct format, while this notebook loads
# Qwen/Qwen3-0.6B - confirm the markers suit the model actually in use.
PROMPTS = {
    # General-purpose analysis; asks for a sectioned, detailed answer.
    "General AI Assistant": PromptTemplate.from_template(
        """
        <s>[INST] You are an exceptionally advanced AI assistant, equipped with state-of-the-art capabilities to understand and analyze technical documents. Your role is to deliver responses that are not only accurate and insightful but also enriched with a deep understanding of the context provided by the PDFs.

        **Instructions:**
        - Thoroughly analyze the provided context and input.
        - Extract and synthesize key information from the PDFs to provide a comprehensive and informed response.
        - Enhance your responses with detailed explanations, advanced insights, and contextually relevant examples.
        - Present information in a structured format using Markdown where applicable, but prioritize clarity and depth of content over formatting.
        - Address the query with a high level of detail and sophistication, demonstrating a deep understanding of the subject matter.
        - If any critical information is missing or if further context is needed, clearly indicate this in your response.

        **Response Guidelines:**
        - **Introduction:** Begin with a brief overview of the topic, setting the stage for a detailed analysis.
        - **Detailed Analysis:** Provide an in-depth examination of the topic, incorporating insights derived from the PDFs.
        - **Contextual Insights:** Relate the information to the context provided by the PDFs, making connections and highlighting relevant points.
        - **Examples and Explanations:** Include specific examples, detailed explanations, and any relevant data or findings from the PDFs.
        - **Conclusion:** Summarize the key points and provide a well-rounded conclusion based on the analysis.

        **Example Output:**

        # Overview
        The provided PDFs offer a comprehensive overview of ...

        # In-Depth Analysis
        Based on the documents, the key findings include ...

        # Contextual Insights
        The analysis reveals that ...

        # Examples and Explanations
        For instance, document A highlights ...

        # Conclusion
        In conclusion, the analysis demonstrates ...

        **Your Response:**
        [/INST]</s> {input}
        Context: {context}
        """
    ),
    # Concise Markdown summary of the retrieved context.
    "Summary": PromptTemplate.from_template(
        """
        <s>[INST] You are an advanced AI assistant with expertise in summarizing technical documents. Your goal is to create a clear, concise, and well-organized summary using Markdown formatting. Focus on extracting and presenting the essential points of the document effectively.

        **Instructions:**
        - Analyze the provided context and input carefully.
        - Identify and highlight the key points, main arguments, and important details.
        - Format the summary using Markdown for clarity:
            - Use `#` for main headers and `##` for subheaders.
            - Use `**text**` for important terms or concepts.
            - Provide a brief introduction, followed by the main points, and a concluding summary if applicable.
        - Ensure the summary is easy to read and understand, avoiding unnecessary jargon.

        **Example Summary Format:**

        # Overview
        **Document Title:** *Technical Analysis Report*

        **Summary:**
        The report provides an in-depth analysis of the recent technical advancements in AI. It covers key areas such as ...

        # Key Findings
        - **Finding 1:** Description of finding 1.
        - **Finding 2:** Description of finding 2.

        # Conclusion
        The analysis highlights the significant advancements and future directions for AI technology.

        **Your Response:**
        [/INST]</s> {input}
        Context: {context}
        """
    ),
    # Continuous-prose essay; unlike the others, {context} precedes {input} here.
    "Essays Expert": PromptTemplate.from_template(
        """
        <s>[INST] Your task is to compose a detailed and engaging essay on the provided topic. Begin by thoroughly examining the context derived from PDFs uploaded by the user, along with the given input. Your essay should be seamlessly structured, starting with an engaging introduction that sets the stage and highlights the significance of the topic. Follow with a comprehensive body where you delve into the subject matter, offering in-depth analysis, relevant examples, and detailed explanations. Conclude with a reflective summary that captures the essence of your discussion and considers potential future implications or directions.

        Ensure that your essay flows continuously and cohesively, avoiding the use of bullet points or lists. Construct your writing with smooth transitions and connected sentences, employing clear and descriptive language to effectively convey your insights and findings.

        For example, if addressing recent developments in artificial intelligence, you should explore how advancements are transforming various sectors and influencing societal interactions. Discuss the implications of technological progress in machine learning and natural language processing on business practices and everyday life. Your conclusion should provide thoughtful reflections on the future trajectory of AI and its broader implications.

        **Your Response:**
        [/INST]</s> Context derived from PDFs uploaded by the user: {context} {input}
        """
    ),
    # Structured technical write-up with an explicit Markdown formatting guide.
    "Technical": PromptTemplate.from_template(
        """
        <s>[INST] You are a highly skilled AI assistant in technical document summarization. Your task is to provide a detailed and well-organized response using Markdown formatting. The response should be informative and structured, presenting data and information in a clear manner.

        **Instructions:**
        - Analyze the provided context and input comprehensively.
        - Use Markdown to structure the response effectively:
            - Employ `#`, `##`, `###` headers for different sections.
            - Use `**text**` to emphasize key points.
            - Include relevant links, code blocks, and tables if applicable.
        - Ensure that each section of the response flows logically and that the information is presented clearly.
        - Indicate if any critical information is missing and provide a structured layout for easy readability.

        **Markdown Formatting Guide:**
        - Headers: Use `#` for main headings, `##` for subheadings, and `###` for detailed subheadings.
        - Bold Text: Use `**text**` to highlight important terms or concepts.
        - Italic Text: Use `*text*` for emphasis.
        - Bulleted Lists: Use `-` or `*` for unordered lists where necessary.
        - Numbered Lists: Use `1.`, `2.` for ordered lists when appropriate.
        - Links: Include `[link text](URL)` to provide additional resources or references.
        - Code Blocks: Use triple backticks (```) for code snippets.
        - Tables: Use `|` to organize data into tables for clarity.

        **Example Output:**

        ## Introduction
        The document provides a thorough analysis of ...

        ## Key Details
        - **Aspect 1:** Detailed description of aspect 1.
        - **Aspect 2:** Detailed description of aspect 2.

        ## Analysis
        The analysis reveals ...

        ## Conclusion
        The summary highlights the significance of ...

        **Your Response:**
        [/INST]</s> {input}
        Context: {context}
        """
    ),
    # Add more prompts as needed
}

In [None]:
# --- Usage Example ---
if __name__ == "__main__":
    # Ingest a PDF if one is available next to the notebook.
    file_path = "example.pdf"
    if os.path.exists(file_path):
        print("Uploading PDF...")
        chunk_count = load_pdf_to_vector_store(file_path)
        print(f"Uploaded and split into {chunk_count} chunks.")
    else:
        # Say so explicitly: the question below then runs only against
        # whatever is already in the vector store.
        print(f"{file_path} not found; skipping upload.")

    # Pick a prompt by its key in PROMPTS. Indexing (not .get) raises a
    # KeyError on a mistyped key rather than passing None to the chain.
    prompt = PROMPTS["General AI Assistant"]
    answer, context = ask_question("What is this PDF about?", prompt)
    print("Answer:", answer)
    for doc in context:
        print("Context Source:", doc.metadata.get("source"))

Uploading PDF...
PDF is unstructured. Performing OCR...
OCR processing page 1/9
OCR processing page 2/9
OCR processing page 3/9
OCR processing page 4/9
OCR processing page 5/9
OCR processing page 6/9
OCR processing page 7/9
OCR processing page 8/9
OCR processing page 9/9
OCR processing page 1/9
OCR processing page 2/9
OCR processing page 3/9
OCR processing page 4/9
OCR processing page 5/9
OCR processing page 6/9
OCR processing page 7/9
OCR processing page 8/9
OCR processing page 9/9
Uploaded and split into 14 chunks.


KeyboardInterrupt: 