In [None]:
!pip install pypdf2 pdfminer.six pdfplumber
!pip install anytree  # for hierarchical structure visualization

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1
Collecting anytree
  Downloading anytree-2.13.0-py3-none-any.whl.metadata (8.0 kB)
Downloading anytree-2.13.0-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anytree
Successfully installed anytree-2.13.0


In [None]:
import pdfplumber
import re
import json
import os
from collections import defaultdict

# Heading-shape patterns, compiled once at definition time instead of on every
# call. Each one rejects lines that look like "a. item" list entries.
_H1_PATTERN = re.compile(r'^(?![A-Za-z]\.\s)[A-Z][A-Z0-9\s\-—]+$')  # All caps with possible numbers/dashes
_H2_PATTERN = re.compile(r'^(?![A-Za-z]\.\s)[A-Z][a-zA-Z0-9\s,:;—\-]+$')  # Starts with a capital letter
_H3_PATTERN = re.compile(r'^(?![A-Za-z]\.\s)[a-zA-Z0-9\s]+:$')  # Ends with colon
_H4_PATTERN = re.compile(r'^(?![A-Za-z]\.\s)[a-zA-Z0-9\s]+(\?|—|:)$')  # Ends with ? or — or :

# (level, pattern, maximum number of words), tried in priority order.
_HEADING_RULES = (
    ("H1", _H1_PATTERN, 8),
    ("H2", _H2_PATTERN, 8),
    ("H3", _H3_PATTERN, 6),
    ("H4", _H4_PATTERN, 6),
)

# Phrases dropped from the final outline when the caller supplies no list.
_DEFAULT_SKIP_PHRASES = ("March 21, 2003", "April 21, 2003", "Timeline:", "Committee.")


def _is_noise_line(line):
    """Return True for lines that are never treated as headings."""
    return bool(
        len(line.split()) > 10  # Too many words for a heading
        or line.isdigit()  # Page numbers
        or re.match(r'^\d{1,2}/\d{1,2}/\d{2,4}$', line)  # Dates
        or re.match(r'^[A-Za-z]\.\s', line)  # List items
        or line.startswith('http')  # URLs
        or len(line) > 150  # Too long
        or re.search(r'([A-Z])\1{3,}', line)  # Same capital letter 4+ times in a row
    )


def _classify_heading(line):
    """Return the first heading level whose rule matches ``line``, else None."""
    word_count = len(line.split())
    for level, pattern, max_words in _HEADING_RULES:
        if word_count <= max_words and pattern.match(line):
            return level
    return None


def extract_outline(pdf_path, skip_phrases=None):
    """
    Extract a title and a list of headings (H1-H4) from a PDF document.

    Heading detection is purely textual: each extracted line is matched
    against the regular expressions above and then adjusted using the level
    of the previously accepted heading.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.
        skip_phrases: Optional container of exact heading texts to drop from
            the result. Defaults to the phrases in ``_DEFAULT_SKIP_PHRASES``.

    Returns:
        dict: ``{"title": str, "outline": [{"level", "text", "page"}, ...]}``
        where ``page`` is 1-based. A PDF with no pages yields an empty title
        and an empty outline instead of raising ``IndexError``.
    """
    if skip_phrases is None:
        skip_phrases = _DEFAULT_SKIP_PHRASES

    outline = []
    title = ""
    previous_level = None
    heading_counts = defaultdict(int)

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages

        # Title: prefer an RFP-looking line among the first five lines of the
        # first page, otherwise fall back to the very first line.
        first_text = pages[0].extract_text() if pages else None
        if first_text:
            first_lines = first_text.split('\n')
            title = next(
                (candidate.strip() for candidate in first_lines[:5]
                 if "RFP:" in candidate or "Request for Proposal" in candidate),
                "",
            )
            if not title:
                title = first_lines[0].strip()

        # Process all pages for headings
        for page_num, page in enumerate(pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            for line in text.split('\n'):
                line = line.strip()
                if not line or _is_noise_line(line):
                    continue

                level = _classify_heading(line)
                if level is None:
                    continue

                # Count the level as detected, before any adjustment below.
                heading_counts[level] += 1

                # Adjust level based on document structure: only the first
                # three all-caps headings stay H1, and an H2 directly after
                # an H3 is kept at H3.
                if level == "H1":
                    if heading_counts["H1"] > 3:
                        level = "H2"
                elif level == "H2" and previous_level == "H3":
                    level = "H3"

                # Skip if same level appears consecutively on the same page
                # with similar length.
                if (outline and
                        level == previous_level and
                        outline[-1]['page'] == page_num and
                        abs(len(outline[-1]['text']) - len(line)) < 10):
                    continue

                outline.append({"level": level, "text": line, "page": page_num})
                previous_level = level

    # Post-processing to clean up results. This runs after detection on
    # purpose: skipped entries still influence the level adjustments above.
    cleaned_outline = []
    for item in outline:
        raw_text = item['text']
        if (raw_text in skip_phrases or
                re.match(r'^Page \d+$', raw_text) or
                re.match(r'^\d+$', raw_text)):
            continue

        text = raw_text.strip()
        if not text.endswith((':', '?', '—')):
            text = text.rstrip('.')

        cleaned_outline.append({
            "level": item['level'],
            "text": text,
            "page": item['page']
        })

    return {
        "title": title.strip(),
        "outline": cleaned_outline
    }

def process_pdf(input_path, output_path):
    """Extract the outline of one PDF and persist it as JSON.

    Args:
        input_path: Path of the PDF handed to ``extract_outline``.
        output_path: Destination file; it is opened in text write mode, so
            an existing file is overwritten.

    Returns:
        The dict returned by ``extract_outline`` (the same data that was
        written, serialized with a two-space indent).
    """
    outline_data = extract_outline(input_path)
    with open(output_path, 'w') as out_file:
        out_file.write(json.dumps(outline_data, indent=2))
    return outline_data

# Example usage: extract the outline of one PDF, save it, and echo it.
# NOTE(review): "/content/..." is an absolute path — presumably a Colab upload
# location (TODO confirm); the cell fails if the file is not there.
pdf_path = "/content/file05.pdf"  # Replace with your PDF path
output_json = "output.json"  # Relative path: written into the current working directory
result = process_pdf(pdf_path, output_json)  # Writes the JSON file and returns the same dict
print(json.dumps(result, indent=2))  # Same serialization settings as the saved file

{
  "title": "ADDRESS:",
  "outline": [
    {
      "level": "H2",
      "text": "ADDRESS:",
      "page": 1
    },
    {
      "level": "H1",
      "text": "TOPJUMP",
      "page": 1
    },
    {
      "level": "H2",
      "text": "PIGEON FORGE, TN 37863",
      "page": 1
    },
    {
      "level": "H1",
      "text": "CLOSED TOED SHOES ARE REQUIRED FOR CLIMBING",
      "page": 1
    },
    {
      "level": "H2",
      "text": "PARENTS OR GUARDIANS NOT ATTENDING THE PARTY,",
      "page": 1
    }
  ]
}


In [None]:
!pip install pdfplumber transformers sentence-transformers nltk
!python -m nltk.downloader punkt

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
Using cached nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)
[0mInstalling collected packages: nvidia-cudnn-cu12, nvidia-cusolver-cu12
  Attempting uninstall: nvidia-cusolver-cu12
[0m    Found existing installation: nvidia-cusolver-cu12 11.6.3.83
    Uninstalling nvidia-cusolver-cu12-11.6.3.83:
      Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83
[0mSuccessfully installed nvidia-cudnn-cu12 nvidia-cusolver-cu12-11.6.1.9


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import pdfplumber
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os

class PDFRanker:
    """Rank PDF files by embedding similarity between their text and a query."""

    def __init__(self):
        # Load a lightweight sentence transformer model
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def extract_text_from_pdf(self, pdf_path):
        """Extract all text from a PDF.

        Page texts are joined with a newline so the last word of one page is
        not fused with the first word of the next. Pages without extractable
        text are skipped.

        Args:
            pdf_path (str): Path passed to ``pdfplumber.open``.

        Returns:
            str: The stripped document text. On any read error a message is
            printed and whatever was collected so far (possibly "") is
            returned — extraction is deliberately best-effort.
        """
        page_texts = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        page_texts.append(page_text)
        except Exception as e:
            print(f"Error reading {pdf_path}: {str(e)}")
        return "\n".join(page_texts).strip()

    def rank_pdfs(self, pdf_paths, query):
        """
        Rank PDFs by relevance to a query.

        Args:
            pdf_paths (list): List of PDF file paths.
            query (str): The search query (e.g., "Plan a 4-day trip for college friends").

        Returns:
            list: Ranked list of tuples (pdf_path, relevance_score), highest
            score first. PDFs that yield no text get a score of 0.0. Ties
            keep the order in which the paths were given.
        """
        # Encode the query once; it is compared against every document.
        query_embedding = self.model.encode(query)

        scores = []
        for pdf_path in pdf_paths:
            text = self.extract_text_from_pdf(pdf_path)
            if not text:
                # Unreadable or empty PDF: keep it in the ranking with a
                # neutral score rather than dropping it.
                scores.append((pdf_path, 0.0))
                continue

            # NOTE(review): the whole document is encoded as one string.
            # Sentence-transformer models presumably truncate input past their
            # maximum sequence length, so only the start of a long PDF would
            # influence the score — confirm, and consider chunking.
            text_embedding = self.model.encode(text)

            # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here.
            similarity = cosine_similarity(
                [query_embedding],
                [text_embedding]
            )[0][0]

            scores.append((pdf_path, float(similarity)))

        # Sort by relevance (highest first)
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores

# Example Usage
# Relative paths: the PDFs are looked up in the current working directory.
# A file that cannot be read is reported by the ranker and scored 0.0.
pdf_paths = [
    "South of France - Cities.pdf",
    "South of France - Cuisine.pdf",
    "South of France - History.pdf",
    "South of France - Restaurants and Hotels.pdf",
    "South of France - Things to Do.pdf",
    "South of France - Tips and Tricks.pdf",
    "South of France - Traditions and Culture.pdf"
]

query = "Plan a 4-day trip for a group of 10 college friends."

# Constructing the ranker loads the sentence-transformer model.
ranker = PDFRanker()
ranked_pdfs = ranker.rank_pdfs(pdf_paths, query)  # list of (path, score), best first

# Print results
print("Ranked PDFs (Most Relevant First):")
for i, (pdf_path, score) in enumerate(ranked_pdfs, 1):
    # Show only the file name and round the score to three decimals.
    print(f"{i}. {os.path.basename(pdf_path)} (Score: {score:.3f})")

Ranked PDFs (Most Relevant First):
1. South of France - Tips and Tricks.pdf (Score: 0.246)
2. South of France - Things to Do.pdf (Score: 0.164)
3. South of France - Restaurants and Hotels.pdf (Score: 0.144)
4. South of France - Cities.pdf (Score: 0.101)
5. South of France - Cuisine.pdf (Score: 0.065)
6. South of France - Traditions and Culture.pdf (Score: 0.048)
7. South of France - History.pdf (Score: 0.044)


In [None]:
import nltk
# Download the "punkt_tab" NLTK resource (an earlier cell fetched "punkt").
# NOTE(review): no code visible in this notebook uses nltk — presumably it is
# needed by cells outside this view; confirm or remove.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True