<a href="https://colab.research.google.com/github/guptaml/ai_notebooks/blob/main/pdf_chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PDF Summarizer (Kaggle/Colab/Local Support)
## High-Performance Parallel Processing Edition
This notebook runs on Google Colab, Kaggle, and local environments. It detects the environment automatically and selects the matching GPU device (`cuda` on Colab/Kaggle, `gpu` locally).

In [1]:
# Install the notebook's dependencies for the detected environment.
# Kaggle and Colab install gpt4all with its `cuda` extra (quietly); any other
# environment installs plain gpt4all. pdfplumber and pypdf are always installed.
try:
    import os
    # The presence of KAGGLE_KERNEL_RUN_TYPE is treated as the Kaggle marker.
    if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
        print("Running on Kaggle.")
        %pip install -q --no-cache-dir --no-input "gpt4all[cuda]" pdfplumber pypdf
    # Colab is recognised by 'google.colab' appearing in the string form of
    # the active IPython shell object.
    elif 'google.colab' in str(get_ipython()):
        print("Running on Google Colab.")
        %pip install -q --no-cache-dir --no-input "gpt4all[cuda]" pdfplumber pypdf
    else:
        print("Running Locally.")
        %pip install --no-cache-dir --no-input gpt4all pdfplumber pypdf
# Any exception raised above (during detection or during an install attempt)
# falls back to the local, non-CUDA install.
except Exception:
    print("Running Locally.")
    %pip install --no-cache-dir --no-input gpt4all pdfplumber pypdf

Running on Google Colab.


In [2]:
import threading
from concurrent.futures import ThreadPoolExecutor
import os
import sys
from gpt4all import GPT4All
import pdfplumber
from pypdf import PdfReader

class Summarizer:
    """Summarize text with a local GPT4All model, chunking long inputs.

    Short inputs are summarized with a single prompt. Longer inputs are split
    into fixed-size word chunks, each chunk is summarized on its own, and the
    chunk summaries are then condensed into one final summary.

    Attributes:
        model: The loaded ``GPT4All`` instance.
        model_lock: Lock held around every ``generate`` call made by this class.
    """

    def __init__(self, model_name="Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf", device="gpu"):
        """Load ``model_name`` on ``device`` with ``n_ctx=8192``."""
        print(f"  -> Loading model: {model_name} on {device}...")
        # n_ctx=8192 for large context support
        self.model = GPT4All(model_name, device=device, n_ctx=8192)
        self.model_lock = threading.Lock()
        print("  -> Model loaded successfully")

    def _generate(self, prompt, max_tokens):
        """Run one prompt inside its own chat session while holding ``model_lock``."""
        with self.model_lock:
            with self.model.chat_session():
                return self.model.generate(prompt, max_tokens=max_tokens)

    def _summarize_chunk(self, chunk, i, total_chunks):
        """Summarize a single chunk; ``i``/``total_chunks`` are only used for logging."""
        print(f"     -> Starting chunk {i}/{total_chunks}...")
        prompt = f"Briefly summarize this part:\n\n{chunk}"
        return self._generate(prompt, max_tokens=256)

    def get_summary_text(self, content, max_words=200, chunk_size=1200,
                         max_total_words=6000, max_workers=4):
        """Return a summary of ``content`` of roughly ``max_words`` words.

        Args:
            content: Text to summarize. ``None`` or whitespace-only text yields
                an empty string without calling the model.
            max_words: Target length requested from the model in the prompt.
            chunk_size: Maximum number of words per chunk; inputs at or below
                this size are summarized with a single prompt.
            max_total_words: Inputs longer than this many words are truncated
                before summarizing. ``None`` disables truncation.
            max_workers: Size of the thread pool used to submit chunk jobs.

        Returns:
            The model's summary text, or ``""`` when there is nothing to summarize.

        Raises:
            ValueError: If ``chunk_size`` or ``max_workers`` is below 1, or
                ``max_total_words`` is neither ``None`` nor at least 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be a positive integer")
        if max_workers < 1:
            raise ValueError("max_workers must be a positive integer")
        if max_total_words is not None and max_total_words < 1:
            raise ValueError("max_total_words must be None or a positive integer")

        words = (content or "").split()
        if not words:
            # Nothing to summarize: do not send an empty prompt to the model.
            print("  -> No content to summarize.")
            return ""

        if max_total_words is not None and len(words) > max_total_words:
            print(f"  -> Truncating long content to {max_total_words} words.")
            words = words[:max_total_words]
            content = " ".join(words)
        total_words = len(words)

        if total_words <= chunk_size:
            print(f"  -> Content loaded ({total_words} words)")
            prompt = f"Summarize the following in {max_words} words:\n\n{content}"
            return self._generate(prompt, max_tokens=512)

        chunks = [" ".join(words[i : i + chunk_size]) for i in range(0, total_words, chunk_size)]
        total_chunks = len(chunks)
        print(f"  -> Processing in {total_chunks} chunks (Parallel)... ")

        # Each worker holds model_lock for its whole generation, so the chunk
        # generations themselves run one at a time; results are collected in
        # submission order so the final prompt keeps the document order.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self._summarize_chunk, chunk, i, total_chunks)
                       for i, chunk in enumerate(chunks, 1)]
            chunk_summaries = [f.result() for f in futures]

        print("  -> Creating final summary from combined chunks...")
        combined_text = "\n\n".join(chunk_summaries)
        final_prompt = f"Cohesively summarize these parts into a {max_words} word summary:\n\n{combined_text}"
        # The pool has shut down by now, so taking the lock here cannot deadlock.
        return self._generate(final_prompt, max_tokens=512)

class PDFExtractor:
    """Read the page count and text of a PDF file via ``pdfplumber``."""

    def __init__(self, pdf_path):
        """Store the path; the file is opened anew by each method call."""
        self.pdf_path = pdf_path

    def get_page_count(self):
        """Return the number of pages in the PDF."""
        with pdfplumber.open(self.pdf_path) as pdf:
            return len(pdf.pages)

    def extract_all_text(self):
        """Return the text of all pages joined by newlines.

        Pages whose extracted text is empty or ``None`` are skipped, so a PDF
        with no extractable text yields an empty string.
        """
        with pdfplumber.open(self.pdf_path) as pdf:
            # Extract each page exactly once and reuse the result for both the
            # emptiness filter and the joined output.
            page_texts = (page.extract_text() for page in pdf.pages)
            return "\n".join(text for text in page_texts if text)

In [3]:
# 1. CONFIGURATION
PDF_FILE = "ifrs9.pdf"
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf"

# Hosted notebooks (Kaggle / Colab) use the "cuda" device; anything else uses "gpu".
IS_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ
IS_COLAB = 'google.colab' in sys.modules
if IS_KAGGLE or IS_COLAB:
    DEVICE = "cuda"
else:
    DEVICE = "gpu"

# Extract the PDF text once and keep a plain-text copy on disk.
# `custom_knowledge` is only defined when the PDF is present.
if os.path.exists(PDF_FILE):
    extractor = PDFExtractor(PDF_FILE)
    print(f"Extracting text from {PDF_FILE}...")
    custom_knowledge = extractor.extract_all_text()

    output_file = "custom_knowledge.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(custom_knowledge)

    print(f"Extracted text written to {output_file}")
    print("Text extracted successfully.")
else:
    print(f"Warning: {PDF_FILE} not found. Please upload it.")

Extracting text from ifrs9.pdf...
Extracted text written to custom_knowledge.txt
Text extracted successfully.


In [4]:
# 2. LOAD MODEL
# Build the Summarizer once; the summarize and Q&A cells reuse this instance.
summarizer = Summarizer(
    model_name=MODEL_NAME,
    device=DEVICE,
)

  -> Loading model: Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf on cuda...
  -> Model loaded successfully


In [5]:
# 3. SUMMARIZE
# Require non-empty extracted text, not merely a defined variable: extraction
# yields an empty string when no page has extractable text, and an empty
# document should be reported rather than sent to the model.
if globals().get('custom_knowledge'):
    print("Generating summary...")
    response = summarizer.get_summary_text(custom_knowledge, max_words=200)
    print("\nRESPONSE:\n", response)
else:
    print("Error: No knowledge extracted.")

Generating summary...
  -> Truncating long content to 6000 words.
  -> Processing in 5 chunks (Parallel)... 
     -> Starting chunk 1/5...
     -> Starting chunk 2/5...
     -> Starting chunk 3/5...
     -> Starting chunk 4/5...
     -> Starting chunk 5/5...
  -> Creating final summary from combined chunks...

RESPONSE:
 Here is a 200-word summary of the provided texts:

IFRS 9 is an international financial reporting standard for accounting for financial instruments. It was developed by the International Accounting Standards Board (IASB) and replaced IAS 39 in July 2014 after several amendments and additions. The key changes include permitting entities to choose between applying hedge accounting requirements from either IFRS 9 or IAS 39, introducing a 'fair value through other comprehensive income' measurement category for simple debt instruments, and adding impairment requirements related to expected credit losses on financial assets.

IFRS 9 applies to all entities except specific ca

In [6]:
# 4. Q&A - Ask questions about the document
def ask_pdf(question, max_context_words=4000, max_tokens=1024):
    """Answer ``question`` from the extracted PDF text and print the response.

    Reads ``custom_knowledge`` (the extracted text) and ``summarizer`` (the
    loaded model wrapper) from the notebook's global scope. If either is
    missing or empty, an error message is printed and nothing is sent to the
    model.

    Args:
        question: The question to put to the model.
        max_context_words: Only the first this-many words of the document are
            included in the prompt; shorter documents are passed unchanged.
        max_tokens: Generation limit passed to the model.

    Returns:
        None. The question and the model's response are printed.

    Raises:
        ValueError: If ``max_context_words`` is below 1.
    """
    if max_context_words < 1:
        raise ValueError("max_context_words must be a positive integer")

    # Access variables from the notebook's global scope
    kb = globals().get('custom_knowledge')
    sm = globals().get('summarizer')

    if not (kb and sm):
        print('Error: Missing custom_knowledge or summarizer. Please run the extraction and model load cells first.')
        return

    # Truncate context for Q&A to fit within model limits
    words = kb.split()
    context = ' '.join(words[:max_context_words]) if len(words) > max_context_words else kb

    prompt = f'Answer based ONLY on this content:\n\n{context}\n\nQuestion: {question}\nAnswer:'
    print(f'\nQuestion: {question}')

    # Hold the summarizer's lock so this call does not overlap other model use.
    with sm.model_lock:
        with sm.model.chat_session():
            response = sm.model.generate(prompt, max_tokens=max_tokens)
    print('\nRESPONSE:\n', response)

# Example usage. This question is not covered by the IFRS 9 document, and in
# the recorded run below the model replies that the content has no answer.
ask_pdf('What is a virtual network ?')


Question: What is a virtual network ?

RESPONSE:
 There is no information provided about "a virtual network" in the given content, which appears to be related to IFRS 9 Financial Instruments. Therefore, I cannot provide an answer based solely on this content. If you have any other questions or need further assistance with a different topic, please let me know!
