In [None]:
from google.colab import userdata
from google.colab import drive
import re
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# Install required packages
!pip install keybert pandas numpy PyMuPDF scikit-learn sentence-transformers

import fitz
import numpy as np
from keybert import KeyBERT
from typing import List, Dict, Tuple
from sklearn.cluster import KMeans
import logging
import pandas as pd


Collecting keybert
  Downloading keybert-0.8.5-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.8.5-py3-none-any.whl (37 kB)
Installing collected packages: keybert
Successfully installed keybert-0.8.5


  from tqdm.autonotebook import tqdm, trange


In [None]:
from typing import List, Dict

In [None]:
class PolicyHeaderExtractor:
    """Heuristic section-header detector for privacy-policy PDFs.

    ``extract_document_structure`` flattens a PDF into text spans with layout
    metadata, ``identify_potential_headers`` scores every span with additive
    heuristics, and ``process_policy`` chains both and returns a cleaned,
    de-duplicated DataFrame.
    """

    # Substrings whose presence in a span earns the "header_term" bonus.
    HEADER_TERMS = (
        'privacy', 'information', 'data', 'rights', 'policy',
        'collect', 'use', 'share', 'protect', 'security',
        'contact', 'changes', 'cookie', 'personal', 'agreement'
    )

    # Minimum additive score a span needs to be reported as a header.
    SCORE_THRESHOLD = 2.5

    def __init__(self):
        # Not used by the methods defined in this class body.
        self.kw_model = KeyBERT()

    def extract_document_structure(self, pdf_path: str) -> List[Dict]:
        """Return one dict per text span (longer than one character) in the PDF.

        Args:
            pdf_path: Path of the PDF to open with ``fitz``.

        Returns:
            A list of dicts with keys ``text``, ``font_size``, ``font_name``,
            ``is_bold``, ``page_num`` (1-based), ``y_position``, ``relative_y``,
            ``line_spacing``, ``extra_spacing_before``, ``font_size_change``,
            ``bbox`` and ``char_count``. Any exception raised while reading is
            printed and an empty list is returned instead.
        """
        formatted_blocks = []
        try:
            # The context manager closes the document even if parsing fails midway.
            with fitz.open(pdf_path) as doc:
                # First pass - collect page heights.
                page_heights = [page.rect.height for page in doc]
                if not page_heights:
                    return formatted_blocks
                avg_page_height = np.mean(page_heights)

                # Both trackers carry over from one page to the next, so the
                # first line of a page is compared with the last span of the
                # previous page.
                prev_y1 = None
                prev_font_size = None

                for page_num, page in enumerate(doc):
                    for block in page.get_text("dict")["blocks"]:
                        # Blocks without a "lines" entry contribute nothing.
                        for line in block.get("lines", []):
                            y0 = line["bbox"][1]
                            # Vertical gap between this line and the previous kept span.
                            line_spacing = y0 - prev_y1 if prev_y1 is not None else 0

                            for span in line["spans"]:
                                text = span["text"].strip()
                                if len(text) <= 1:  # Filter out empty / single-character spans
                                    continue

                                font_size = span["size"]
                                formatted_blocks.append({
                                    "text": text,
                                    "font_size": font_size,
                                    "font_name": span["font"],
                                    # Bold if the font name says so or bit 4 of the span flags is set.
                                    "is_bold": "bold" in span["font"].lower() or (span["flags"] & 2**4) != 0,
                                    "page_num": page_num + 1,
                                    "y_position": y0,
                                    # Position relative to the *average* page height of the document.
                                    "relative_y": y0 / avg_page_height,
                                    "line_spacing": line_spacing,
                                    "extra_spacing_before": line_spacing > 1.5 * font_size,
                                    # Always a real bool (False for the very first span).
                                    "font_size_change": prev_font_size is not None and font_size > prev_font_size,
                                    "bbox": line["bbox"],
                                    "char_count": len(text),
                                })

                                prev_y1 = line["bbox"][3]
                                prev_font_size = font_size

            return formatted_blocks
        except Exception as e:
            print(f"Error processing PDF {pdf_path}: {e}")
            return []

    def identify_potential_headers(self, blocks: List[Dict]) -> List[Dict]:
        """Score every span and keep those reaching ``SCORE_THRESHOLD``.

        Args:
            blocks: Span dicts as produced by ``extract_document_structure``.
                The keys read are ``text``, ``font_size``, ``is_bold``,
                ``extra_spacing_before``, ``relative_y``, ``page_num`` and
                ``y_position``.

        Returns:
            A list of dicts with ``text``, ``page_num``, ``score``, ``reasons``
            (names of the heuristics that fired), ``y_position`` and
            ``font_size``. Empty input yields an empty list.
        """
        if not blocks:
            # Avoids computing mean/std of an empty list.
            return []

        # Document-wide font statistics used as the baseline for "large" text.
        font_sizes = [block["font_size"] for block in blocks]
        avg_font_size = np.mean(font_sizes)
        std_font_size = np.std(font_sizes)
        last_index = len(blocks) - 1

        headers = []

        for i, block in enumerate(blocks):
            score = 0
            reasons = []
            text = block["text"]
            font_size = block["font_size"]

            # 1. Font Size Analysis
            if font_size > avg_font_size + std_font_size:
                score += 2
                reasons.append("large_font")
            elif font_size > avg_font_size + (std_font_size * 0.5):
                score += 1
                reasons.append("medium_font")

            # 2. Bold Text
            if block["is_bold"]:
                score += 1.5
                reasons.append("bold")

            # 3. Spacing Analysis
            if block["extra_spacing_before"]:
                score += 1
                reasons.append("extra_spacing")

            # 4. Text Length
            if len(text.split()) <= 8:
                score += 1
                reasons.append("concise")

            # 5. Text Case
            if text.istitle() or text.isupper():
                score += 0.5
                reasons.append("title_case")

            # 6. Position on Page
            if block["relative_y"] < 0.2:  # Near top of page
                score += 0.5
                reasons.append("top_position")

            # 7. Common Header Terms (substring match on the lowercased text)
            lowered = text.lower()
            if any(term in lowered for term in self.HEADER_TERMS):
                score += 1
                reasons.append("header_term")

            # 8. Context Analysis: larger than both neighbours. Only interior
            # spans are eligible, so the first and last span never get this bonus.
            if 0 < i < last_index:
                if font_size > blocks[i - 1]["font_size"] and font_size > blocks[i + 1]["font_size"]:
                    score += 1
                    reasons.append("format_break")

            if score >= self.SCORE_THRESHOLD:
                headers.append({
                    "text": text,
                    "page_num": block["page_num"],
                    "score": score,
                    "reasons": reasons,
                    "y_position": block["y_position"],
                    "font_size": font_size
                })

        return headers

    def process_policy(self, pdf_path: str) -> pd.DataFrame:
        """Extract, score and clean the headers of a single policy PDF.

        Args:
            pdf_path: Path of the PDF to analyse.

        Returns:
            DataFrame of detected headers sorted by page and vertical position,
            with duplicate texts, symbol/number-only rows and texts of three
            characters or fewer removed. Empty when nothing was detected or the
            PDF could not be read.
        """
        blocks = self.extract_document_structure(pdf_path)
        headers = self.identify_potential_headers(blocks)

        # Convert to DataFrame and sort
        df = pd.DataFrame(headers)
        if not df.empty:
            df = df.sort_values(["page_num", "y_position"])

            # Remove exact-duplicate header texts, keeping the first occurrence
            df = df.drop_duplicates(subset=['text'], keep='first')

            # Filter out likely false positives
            df = df[~df['text'].str.contains(r'^[\d\W]+$')]  # Remove numeric/symbol only
            df = df[df['text'].str.len() > 3]  # Remove very short text

        return df



In [None]:
def score_potential_header(self, block: Dict, doc_stats: Dict) -> Dict:
    """Score one text block on how likely it is to be a section header.

    Defined at module level but written as a method: it takes ``self`` and
    calls ``self.kw_model.extract_keywords``.

    Args:
        block: Span description. ``text``, ``page_num`` and ``y_position`` are
            required. ``is_bold``, ``font_size_category`` ("largest" or
            "large") and ``new_line`` are optional; a missing key simply
            contributes nothing to the score.
        doc_stats: Document-level font statistics. Accepted for interface
            compatibility; the scoring rules below do not read it.

    Returns:
        Dict with ``text``, ``total_score``, ``format_score``,
        ``content_score``, ``format_reasons``, ``content_terms``,
        ``page_num`` and ``y_position``.
    """
    text = block["text"]
    lowered = text.lower()

    # Format scoring - Relaxed criteria
    format_score = 0
    format_reasons = []

    # Font size scoring. `.get` keeps the function usable with blocks that
    # were never annotated with a size category.
    size_category = block.get("font_size_category")
    if size_category == "largest":
        format_score += 2.0
        format_reasons.append("largest_font")
    elif size_category == "large":
        format_score += 1.5
        format_reasons.append("large_font")

    # Bold text
    if block.get("is_bold"):
        format_score += 1.0
        format_reasons.append("bold")

    # Text length
    word_count = len(text.split())
    if word_count <= 8:
        format_score += 0.5
        format_reasons.append("concise")
    elif word_count <= 12:
        format_score += 0.3
        format_reasons.append("medium_length")

    # Capitalization
    if text.istitle():
        format_score += 0.5
        format_reasons.append("title_case")
    elif text.isupper():
        format_score += 0.3
        format_reasons.append("all_caps")

    # New line/paragraph
    if block.get("new_line"):
        format_score += 0.5
        format_reasons.append("new_line")

    # Structural patterns are matched against the ORIGINAL text: the letter
    # pattern needs an uppercase letter, so it can never match lowercased input.
    structural_patterns = [
        r"^(?:\d+\.)+\d*\s",  # Numbered sections: "1. ", "1.1 ", "2.3.1 "
        r"^[A-Z]\.\s",        # Letter sections like "A. ", "B. "
    ]
    # Phrase patterns are matched against the lowercased text.
    phrase_patterns = [
        r"how\s+we",     # Common privacy policy phrases
        r"your\s+rights",
        r"information\s+we",
        r"personal\s+information",
        r"data\s+[a-z]+ing",  # data sharing, data processing etc
        r"privacy",
        r"security",
        r"changes",
        r"contact",
        r"cookies?"
    ]

    # The bonus is granted at most once, whichever pattern matches.
    if (any(re.search(pattern, text) for pattern in structural_patterns)
            or any(re.search(pattern, lowered) for pattern in phrase_patterns)):
        format_score += 0.5
        format_reasons.append("header_pattern")

    # Content scoring using KeyBERT with expanded privacy-related terms
    privacy_terms = [
        "privacy", "data", "information", "rights", "security",
        "collect", "process", "share", "protect", "store",
        "retain", "delete", "transfer", "disclosure", "consent",
        "cookie", "access", "control", "opt", "choice"
    ]

    keywords = self.kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_maxsum=True,
        top_n=5
    )

    # Keyphrases containing a privacy term count 1.5x towards the content score.
    content_score = sum(
        kw_score * 1.5 if any(term in kw.lower() for term in privacy_terms) else kw_score
        for kw, kw_score in keywords
    )
    content_terms = [kw for kw, _ in keywords]

    # Format evidence is weighted three times as heavily as content evidence.
    total_score = (format_score * 0.75) + (content_score * 0.25)

    return {
        "text": text,
        "total_score": total_score,
        "format_score": format_score,
        "content_score": content_score,
        "format_reasons": format_reasons,
        "content_terms": content_terms,
        "page_num": block["page_num"],
        "y_position": block["y_position"]
    }

# Update extract_headers with lower threshold
def extract_headers(self, pdf_path: str, min_score: float = 1.5) -> pd.DataFrame:  # Lowered from 2.0
    """Return the blocks of a PDF whose header score reaches ``min_score``.

    Written as a method body: relies on ``self.extract_document_structure``,
    ``self.analyze_font_sizes`` and ``self.score_potential_header``.
    NOTE(review): ``analyze_font_sizes`` is not defined anywhere in the visible
    notebook - confirm it exists on the class this is attached to.

    Args:
        pdf_path: Path of the PDF to analyse.
        min_score: Inclusive lower bound on ``total_score`` for a block to be kept.

    Returns:
        DataFrame of scored blocks ordered by page and vertical position with
        repeated texts dropped (first occurrence wins); an empty DataFrame when
        the document yields no blocks or nothing scores high enough.
    """
    blocks = self.extract_document_structure(pdf_path)
    if not blocks:
        return pd.DataFrame()

    font_sizes = [b["font_size"] for b in blocks]
    doc_stats = self.analyze_font_sizes(font_sizes)

    # Score every block in document order and keep the ones over the bar.
    scored_blocks = (self.score_potential_header(b, doc_stats) for b in blocks)
    df = pd.DataFrame([s for s in scored_blocks if s["total_score"] >= min_score])

    if df.empty:
        return df

    return (
        df.sort_values(["page_num", "y_position"])
          .drop_duplicates(subset=['text'], keep='first')
    )

In [None]:
def process_multiple_policies(pdf_paths: List[str]) -> Dict[str, pd.DataFrame]:
    """Run header extraction over several PDFs and print a short preview of each.

    Args:
        pdf_paths: Paths of the policy PDFs to analyse.

    Returns:
        Mapping of path -> headers DataFrame. A path whose processing raised is
        reported on stdout and left out of the mapping.
    """
    extractor = PolicyHeaderExtractor()
    results = {}
    preview_columns = ['text', 'score', 'reasons', 'page_num']

    for pdf_path in pdf_paths:
        print(f"\nProcessing: {pdf_path}")
        try:
            headers_df = extractor.process_policy(pdf_path)
            results[pdf_path] = headers_df

            if headers_df.empty:
                continue

            # Preview: the ten highest-scoring headers.
            print(f"\nFound {len(headers_df)} headers:")
            ranked = headers_df.sort_values('score', ascending=False)
            print(ranked[preview_columns].head(10).to_string())

        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")

    return results

# Example usage
if __name__ == "__main__":
    # List of policy PDFs to process
    policy_paths = [
        "/content/drive/MyDrive/210 Capstone/moonpay.pdf",
        # "/content/drive/MyDrive/210 Capstone/stripe.pdf",
        # "/content/drive/MyDrive/210 Capstone/klarna.pdf"
        # Add other policy paths
    ]

    results = process_multiple_policies(policy_paths)

    # Only non-empty frames become sheets. Collect them first so the workbook
    # is not created at all when there is nothing to write (a workbook with
    # zero sheets cannot be saved).
    non_empty_results = {path: frame for path, frame in results.items() if not frame.empty}

    if non_empty_results:
        # Save results to Excel with multiple sheets
        with pd.ExcelWriter('policy_headers_analysis.xlsx') as writer:
            for pdf_path, df in non_empty_results.items():
                sheet_name = pdf_path.split('/')[-1].replace('.pdf', '')[:31]  # Excel limits sheet names to 31 chars
                df.to_excel(writer, sheet_name=sheet_name, index=False)
    else:
        print("No headers found; skipping Excel export.")


Processing: /content/drive/MyDrive/210 Capstone/moonpay.pdf

Found 80 headers:
                                              text  score                                                                            reasons  page_num
2                            Global Privacy Policy    8.0  [large_font, bold, extra_spacing, concise, title_case, header_term, format_break]         1
63                               How to contact us    6.5      [large_font, extra_spacing, concise, top_position, header_term, format_break]        13
103                    How to exercise your rights    6.5      [large_font, extra_spacing, concise, top_position, header_term, format_break]        19
105                         Vermont Privacy Rights    6.5        [large_font, extra_spacing, concise, title_case, header_term, format_break]        19
59                             Your privacy rights    6.5      [large_font, extra_spacing, concise, top_position, header_term, format_break]        12
14            

In [None]:
class PolicyHeaderExtractorBalanced:
    """Layout-only header detector with a small, fixed set of heuristics.

    Scores each text span on boldness, absolute font size, word count and
    vertical position; no keyword or content analysis is involved.
    """

    def __init__(self):
        pass

    def extract_document_structure(self, pdf_path: str) -> List[Dict]:
        """Return one dict per text span (longer than one character) in the PDF.

        Args:
            pdf_path: Path of the PDF to open with ``fitz``.

        Returns:
            A list of dicts with keys ``text``, ``font_size``, ``is_bold``,
            ``page_num`` (1-based), ``relative_y``, ``line_spacing`` and
            ``char_count``. Any exception raised while reading is printed and
            an empty list is returned instead.
        """
        formatted_blocks = []
        try:
            # The context manager closes the document even if parsing fails midway.
            with fitz.open(pdf_path) as doc:
                page_heights = [page.rect.height for page in doc]
                if not page_heights:
                    return formatted_blocks
                avg_page_height = np.mean(page_heights)

                # Carries over between pages, as in the line-spacing formula below.
                prev_y1 = None

                for page_num, page in enumerate(doc):
                    for block in page.get_text("dict")["blocks"]:
                        # Blocks without a "lines" entry contribute nothing.
                        for line in block.get("lines", []):
                            y0 = line["bbox"][1]
                            line_spacing = y0 - prev_y1 if prev_y1 is not None else 0

                            for span in line["spans"]:
                                text = span["text"].strip()
                                if len(text) <= 1:  # Ignore empty / single-character spans
                                    continue

                                formatted_blocks.append({
                                    "text": text,
                                    "font_size": span["size"],
                                    # Bold if the font name says so or bit 4 of the span flags is set.
                                    "is_bold": "bold" in span["font"].lower() or (span["flags"] & 2**4) != 0,
                                    "page_num": page_num + 1,
                                    # Position relative to the *average* page height of the document.
                                    "relative_y": y0 / avg_page_height,
                                    "line_spacing": line_spacing,
                                    "char_count": len(text),
                                })
                                prev_y1 = line["bbox"][3]

            return formatted_blocks
        except Exception as e:
            print(f"Error processing PDF {pdf_path}: {e}")
            return []

    def identify_potential_headers(self, blocks: List[Dict]) -> List[Dict]:
        """Score spans and keep those with a score of at least 2.0.

        Args:
            blocks: Span dicts as produced by ``extract_document_structure``.
                The keys read are ``text``, ``font_size``, ``is_bold``,
                ``page_num``, ``relative_y`` and ``char_count``.

        Returns:
            A list of dicts with ``text``, ``page_num``, ``font_size``,
            ``relative_y`` and ``score``.
        """
        headers = []

        for block in blocks:
            text = block["text"]

            # NOTE(review): an earlier revision had an `if ...: pass` branch
            # meant to "re-include" large text near the top of page 1. It had
            # no effect, so such spans go through the same filters as the rest.

            # Skip footers: bottom tenth of the page and mentioning copyright.
            if block["relative_y"] > 0.9 and "copyright" in text.lower():
                continue

            # Ensure meaningful content
            if block["char_count"] <= 3 or text.isdigit():
                continue

            # Heuristic scoring
            score = 0
            if block["is_bold"]:
                score += 1.5
            if block["font_size"] > 12:  # Arbitrary threshold for larger fonts
                score += 1
            if 3 <= len(text.split()) <= 8:
                score += 1
            if block["relative_y"] < 0.2:  # Prioritize top sections
                score += 0.5

            if score >= 2.0:
                headers.append({
                    "text": text,
                    "page_num": block["page_num"],
                    "font_size": block["font_size"],
                    "relative_y": block["relative_y"],
                    "score": score,
                })

        return headers

    def process_policy(self, pdf_path: str) -> pd.DataFrame:
        """Extract headers from one PDF, sorted by position and de-duplicated by text.

        Args:
            pdf_path: Path of the PDF to analyse.

        Returns:
            DataFrame of detected headers; empty when nothing was detected or
            the PDF could not be read.
        """
        blocks = self.extract_document_structure(pdf_path)
        headers = self.identify_potential_headers(blocks)

        # Convert to DataFrame
        df = pd.DataFrame(headers)
        if not df.empty:
            df = df.sort_values(["page_num", "relative_y"])
            df = df.drop_duplicates(subset=["text"], keep="first")

        return df





In [None]:
# List of policy PDFs to process
policy_paths = [
    "/content/drive/MyDrive/210 Capstone/moonpay.pdf",
    "/content/drive/MyDrive/210 Capstone/stripe.pdf",
    "/content/drive/MyDrive/210 Capstone/klarna.pdf",
    "/content/drive/MyDrive/210 Capstone/plaid.pdf",

    # Add other policy paths
]

# Re-run with balanced refinements
extractor = PolicyHeaderExtractorBalanced()
results = {}
for pdf in policy_paths:
    headers_df = extractor.process_policy(pdf)
    results[pdf] = headers_df

# Save results to a new file: one sheet per PDF, including empty sheets for
# PDFs that could not be read or produced no headers.
output_path = "/content/drive/MyDrive/210 Capstone/policy_headers.xlsx"
with pd.ExcelWriter(output_path) as writer:
    for pdf, df in results.items():
        # Excel limits sheet names to 31 chars
        sheet_name = pdf.split("/")[-1].replace(".pdf", "")[:31]
        df.to_excel(writer, sheet_name=sheet_name, index=False)

output_path

Error processing PDF /content/drive/MyDrive/210 Capstone/plaid.pdf: no such file: '/content/drive/MyDrive/210 Capstone/plaid.pdf'


'/content/drive/MyDrive/210 Capstone/policy_headers.xlsx'

In [None]:
import PyPDF2
import spacy
from spacy import displacy
import pandas as pd
import os

# Load the spaCy model once at module level; extract_section_headers below
# calls this global `nlp` object on the extracted PDF text.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator. A page for
        which ``extract_text()`` returns nothing contributes an empty string.
    """
    # `with` guarantees the handle is closed even if a page fails to parse.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

def extract_section_headers(text: str) -> List[Dict]:
    """Collect sentences whose dependency root is tagged as a proper noun.

    The text is parsed with the module-level ``nlp`` pipeline. A sentence is
    emitted once for every token in it that has ``pos_ == "PROPN"`` and
    ``dep_ == "ROOT"``.

    Args:
        text: Raw document text.

    Returns:
        A list of ``{"text": <stripped sentence>}`` dicts in document order.
    """
    parsed = nlp(text)
    return [
        {"text": sentence.text.strip()}
        for sentence in parsed.sents
        for tok in sentence
        if tok.pos_ == "PROPN" and tok.dep_ == "ROOT"
    ]

def process_pdfs_in_directory(directory_path: str) -> Dict:
    """Extract section headers from every ``.pdf`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive; the suffix check is
            case-sensitive).

    Returns:
        Mapping of file name -> list of header dicts as returned by
        ``extract_section_headers``.
    """
    return {
        name: extract_section_headers(
            extract_text_from_pdf(os.path.join(directory_path, name))
        )
        for name in os.listdir(directory_path)
        if name.endswith('.pdf')
    }

# Example usage: extract headers for every PDF in the capstone folder and
# print one table per file, followed by a blank line.
directory_path = "/content/drive/MyDrive/210 Capstone/"
results = process_pdfs_in_directory(directory_path)

for pdf_file, section_headers in results.items():
    df = pd.DataFrame(section_headers)
    print(f"Section Headers for {pdf_file}:", df, "", sep="\n")

Section Headers for stripe.pdf:
                                                 text
0   “Sites”refer to Stripe.com, Link.com, and othe...
1   Personal Data that we collect and how we use a...
2                                       Contact us10.
3                        US Consumer Privacy Notice1.
4   Personal Data we collect and how weuse and sha...
5   Personal Data we collect about End UsersUsing ...
6                     Identity/Verification Services.
7                                 Our Business Users.
8                Fraud Detection and Loss Prevention.
9   Personal Data we collect about End CustomersTr...
10                 Identity/Verification Information.
11  T o protect against fraud and determine if som...
12                                                  s
13                                         i c e s  t
14   En d  C u s t o m e rs 'Personal Data with them.
15  The Business User you choose to do business wi...
16  Our Business Users (and their authorized third

In [None]:
# Lift pandas' row, column and width limits for the rest of the session,
# then print the most recently built `df` in full.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print(df)

                                                text
0  Some Final Details…  • International Data Tran...
1  Develop Existing Services:  To improve, enhanc...
2  • Develop New Services:  To develop new produc...
3  Provide Support:  To provide support to you or...
4  Communicate With You: To communicate with you ...
5  Our Lawful Bases for Processing (EEA and UK En...
6  Consumer Privacy Notice  \nLast Updated: Febru...
7                                         Plaid Inc.


In [None]:
# Load the spaCy model once at module level; extract_section_headers below
# calls this global `nlp` object on the extracted PDF text.
nlp = spacy.load("en_core_web_sm")


def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of every page of a PDF and concatenate it.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The page texts joined in page order with no separator. A page for
        which ``extract_text()`` returns nothing contributes an empty string.
    """
    # `with` guarantees the handle is closed even if a page fails to parse.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def extract_section_headers(text: str) -> List[Dict]:
    """Pick out short sentences that look like section headers.

    The text is split into sentences with the module-level ``nlp`` pipeline.
    A stripped sentence is kept when it is at least 5 characters long, is not
    purely digits, has at most 10 words, and is title-cased, all upper-case,
    or contains one of the header keywords as a substring.

    Args:
        text: Raw document text.

    Returns:
        A list of ``{"text": <stripped sentence>}`` dicts in document order.
    """
    # Patterns to identify headers
    keywords = (
        "privacy", "policy", "data", "information", "rights", "contact", "changes",
        "security", "collect", "use", "share", "protect", "cookie", "terms", "personal"
    )

    def _looks_like_header(candidate: str) -> bool:
        # Skip overly short sentences or purely numeric content
        if len(candidate) < 5 or candidate.isdigit():
            return False
        # Limit to concise headers
        if len(candidate.split()) > 10:
            return False
        lowered = candidate.lower()
        return (
            candidate.istitle()
            or candidate.isupper()
            or any(keyword.lower() in lowered for keyword in keywords)
        )

    candidates = (sentence.text.strip() for sentence in nlp(text).sents)
    return [{"text": candidate} for candidate in candidates if _looks_like_header(candidate)]


def process_pdfs_in_directory(directory_path: str) -> Dict:
    """Extract section headers from every ``.pdf`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive; the suffix check is
            case-sensitive).

    Returns:
        Mapping of file name -> list of header dicts as returned by
        ``extract_section_headers``.
    """
    results = {}
    for entry in os.listdir(directory_path):
        if not entry.endswith('.pdf'):
            continue
        raw_text = extract_text_from_pdf(os.path.join(directory_path, entry))
        results[entry] = extract_section_headers(raw_text)
    return results


# Example usage
directory_path = "/content/drive/MyDrive/210 Capstone/policy"
results = process_pdfs_in_directory(directory_path)

# Save results to Excel: one sheet per PDF. The workbook is only created when
# at least one PDF was processed, because a workbook without sheets cannot be
# saved.
if results:
    with pd.ExcelWriter(f"{directory_path}/section_headers_output.xlsx") as writer:
        for pdf_file, section_headers in results.items():
            df = pd.DataFrame(section_headers)
            # Strip only the final extension so "a.v2.pdf" and "a.v3.pdf" do not
            # collapse to the same sheet; sheet names must be <= 31 chars.
            sheet_name = os.path.splitext(pdf_file)[0][:31]
            df.to_excel(writer, sheet_name=sheet_name, index=False)
else:
    print(f"No PDF files found in {directory_path}; nothing to export.")



FileNotFoundError: [Errno 2] No such file or directory: '/path/to/pdf/directory'

In [None]:
import PyPDF2
import spacy
import pandas as pd
import os
import re
from typing import List, Dict

# Load the spaCy model once at module level; extract_section_headers below
# calls this global `nlp` object on the extracted PDF text.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(file_path: str) -> str:
    """Read a PDF and return the text of all its pages as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Page texts concatenated in page order with no separator.
    """
    with open(file_path, 'rb') as pdf_file_obj:
        pages = PyPDF2.PdfReader(pdf_file_obj).pages
        return ''.join(page.extract_text() for page in pages)


def is_potential_header(text: str) -> bool:
    """Decide whether a line of text looks like a section header.

    A line qualifies when it has at most 10 words, is not made up solely of
    digits/whitespace/symbols, does not mention "copyright" or "protected by",
    and contains at least one header keyword as a substring (case-insensitive).

    Args:
        text: Candidate line.

    Returns:
        True if the line passes all filters and contains a keyword.
    """
    # Common header keywords
    header_keywords = (
        "privacy", "policy", "data", "information", "rights", "contact", "security",
        "collect", "use", "share", "protect", "cookie", "terms", "personal"
    )
    lowered = text.lower()

    too_long = len(text.split()) > 10                                   # Headers should be short
    symbols_only = re.match(r"^[\d\s\W]+$", text) is not None           # Numeric/symbolic-only lines
    legal_notice = "copyright" in lowered or "protected by" in lowered  # Legal notices

    if too_long or symbols_only or legal_notice:
        return False

    return any(keyword in lowered for keyword in header_keywords)


def extract_section_headers(text: str) -> List[Dict]:
    """Return the sentences of ``text`` that ``is_potential_header`` accepts.

    Sentences come from the module-level ``nlp`` pipeline and are stripped of
    surrounding whitespace before being tested.

    Args:
        text: Raw document text.

    Returns:
        A list of ``{"text": <stripped sentence>}`` dicts in document order.
    """
    candidates = (sentence.text.strip() for sentence in nlp(text).sents)
    return [{"text": candidate} for candidate in candidates if is_potential_header(candidate)]


def process_pdf(file_path: str) -> pd.DataFrame:
    """Extract the section headers of one PDF into a DataFrame.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        DataFrame built from the header dicts; empty (no columns) when no
        header was found.
    """
    return pd.DataFrame(extract_section_headers(extract_text_from_pdf(file_path)))


def process_pdfs_in_directory(directory_path: str) -> Dict:
    """Run ``process_pdf`` on every ``.pdf`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive; the suffix check is
            case-sensitive).

    Returns:
        Mapping of file name -> headers DataFrame.
    """
    return {
        name: process_pdf(os.path.join(directory_path, name))
        for name in os.listdir(directory_path)
        if name.endswith('.pdf')
    }


# Example usage
directory_path = "/content/drive/MyDrive/210 Capstone/policy/"
results = process_pdfs_in_directory(directory_path)

# Save results to an Excel file: one sheet per PDF. The workbook is only
# created when at least one PDF was processed, because a workbook without
# sheets cannot be saved.
output_path = "/content/drive/MyDrive/210 Capstone/section_headers_output.xlsx"
if results:
    with pd.ExcelWriter(output_path) as writer:
        for pdf_file, headers_df in results.items():
            # Strip only the final extension so dotted file names stay distinct;
            # Excel sheet names are limited to 31 characters.
            sheet_name = os.path.splitext(pdf_file)[0][:31]
            headers_df.to_excel(writer, sheet_name=sheet_name, index=False)
else:
    print(f"No PDF files found in {directory_path}; nothing to export.")

In [None]:
# Load spaCy model once at module level; extract_section_headers below calls
# this global `nlp` object on the extracted PDF text.
nlp = spacy.load("en_core_web_sm")


def extract_text_from_pdf(file_path: str) -> str:
    """Return the raw text of a PDF, all pages concatenated in order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Page texts joined with no separator.
    """
    with open(file_path, 'rb') as pdf_file:
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
    return ''.join(page_texts)


def is_potential_header(text: str, position_on_page: float = None) -> bool:
    """Decide whether a line of text is a plausible section header.

    A line is rejected when it has more than 10 words, consists only of
    digits/whitespace/symbols, mentions "copyright" or contains "https://",
    or when a position is supplied and it is below 0.1. Otherwise it is
    accepted if it contains any header keyword as a substring
    (case-insensitive).

    Args:
        text: Candidate line.
        position_on_page: Optional relative position in [0, 1]; ``None``
            disables the position filter.

    Returns:
        True if the line passes every filter and contains a keyword.
    """
    header_keywords = (
        "privacy", "policy", "data", "information", "rights", "contact", "security",
        "collect", "use", "share", "protect", "cookie", "terms", "personal"
    )
    lowered = text.lower()

    disqualified = (
        len(text.split()) > 10                                            # Too long to be a header
        or re.match(r"^[\d\s\W]+$", text) is not None                     # Purely numeric or special characters
        or "copyright" in lowered                                         # Footers
        or "https://" in lowered                                          # URLs
        or (position_on_page is not None and position_on_page < 0.1)      # Likely document title
    )
    if disqualified:
        return False

    return any(keyword in lowered for keyword in header_keywords)


def extract_section_headers(text: str) -> List[Dict]:
    """Extract section headers from text.

    Sentences come from the module-level ``nlp`` pipeline. Each stripped
    sentence is passed to ``is_potential_header`` together with its relative
    position in the document (character offset of the sentence start divided
    by the document length).

    The plain-text input carries no page layout, so the value handed over as
    ``position_on_page`` is document-relative, not page-relative. With the
    threshold used by ``is_potential_header`` this filters out sentences that
    start in the first tenth of the document.

    Args:
        text: Raw document text.

    Returns:
        A list of ``{"text": <stripped sentence>}`` dicts in document order.
    """
    doc = nlp(text)
    total_chars = len(doc.text)
    if total_chars == 0:
        return []

    headers = []
    for sent in doc.sents:
        sentence_text = sent.text.strip()
        # Where the sentence starts, as a fraction of the document. The
        # previous formula used the sentence LENGTH over the document length,
        # which is below 0.1 for virtually every sentence and made the
        # position filter reject everything.
        position_in_document = sent.start_char / total_chars
        if is_potential_header(sentence_text, position_on_page=position_in_document):
            headers.append({"text": sentence_text})
    return headers


def process_pdf(file_path: str) -> pd.DataFrame:
    """Turn one PDF into a DataFrame of its detected section headers.

    Args:
        file_path: Path of the PDF file to process.

    Returns:
        DataFrame built from the header dicts; empty (no columns) when no
        header was found.
    """
    return pd.DataFrame(extract_section_headers(extract_text_from_pdf(file_path)))

In [None]:
# Example Usage
# Runs process_pdf on a single policy PDF and prints the resulting DataFrame.
file_path = "/content/drive/MyDrive/210 Capstone/policy/moonpay.pdf"  # Replace with the correct path
headers_df = process_pdf(file_path)
print(headers_df)


Empty DataFrame
Columns: []
Index: []


In [None]:
import fitz  # PyMuPDF

In [None]:
class PolicyHeaderExtractor:
    """Header detector combining simple layout cues with domain keywords."""

    def __init__(self):
        # Substrings whose presence in a span earns the keyword bonus.
        self.keywords = ["privacy", "information", "data", "rights", "policy", "cookie", "contact", "security"]

    def extract_document_structure(self, pdf_path: str) -> List[Dict]:
        """Extract every non-empty text span of the PDF with layout details.

        Args:
            pdf_path: Path of the PDF to open with ``fitz``.

        Returns:
            A list of dicts with keys ``text``, ``font_size``, ``is_bold``
            (font name contains "bold"), ``page_num`` (1-based),
            ``y_position`` and ``char_count``. Any exception raised while
            reading is printed and an empty list is returned instead.
        """
        blocks = []
        try:
            # The context manager closes the document even if parsing fails midway.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc):
                    for block in page.get_text("dict")["blocks"]:
                        for line in block.get("lines", []):
                            for span in line["spans"]:
                                text = span["text"].strip()
                                if not text:
                                    continue
                                blocks.append({
                                    "text": text,
                                    "font_size": span["size"],
                                    "is_bold": "bold" in span["font"].lower(),
                                    "page_num": page_num + 1,
                                    "y_position": line["bbox"][1],
                                    "char_count": len(text)
                                })
            return blocks
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            return []

    def identify_headers(self, blocks: List[Dict]) -> pd.DataFrame:
        """Score spans and return those with a score of at least 2.

        Args:
            blocks: Span dicts as produced by ``extract_document_structure``.

        Returns:
            DataFrame with columns ``text``, ``page_num``, ``font_size`` and
            ``score``; empty (no columns) when no span qualifies.
        """
        headers = []
        for block in blocks:
            text = block["text"]
            score = 0
            # Prioritize layout features
            if block["font_size"] > 12:
                score += 1
            if block["is_bold"]:
                score += 1
            if len(text.split()) <= 8:
                score += 0.5
            # Near the top of the first page.
            if block["page_num"] == 1 and block["y_position"] < 200:
                score += 0.5
            # Include domain-specific terms
            lowered = text.lower()
            if any(keyword in lowered for keyword in self.keywords):
                score += 1

            if score >= 2:  # Threshold for header inclusion
                headers.append({
                    "text": text,
                    "page_num": block["page_num"],
                    "font_size": block["font_size"],
                    "score": score
                })

        return pd.DataFrame(headers)

    def process_policy(self, pdf_path: str) -> pd.DataFrame:
        """Process a single policy document.

        Args:
            pdf_path: Path of the PDF to analyse.

        Returns:
            DataFrame of detected headers; empty when the PDF yields no spans
            or could not be read.
        """
        blocks = self.extract_document_structure(pdf_path)
        if not blocks:
            return pd.DataFrame()
        return self.identify_headers(blocks)

def process_policies_in_folder(folder_path: str) -> Dict[str, pd.DataFrame]:
    """Run ``PolicyHeaderExtractor.process_policy`` on every ``.pdf`` in a folder.

    Args:
        folder_path: Directory to scan (not recursive; the suffix check is
            case-sensitive).

    Returns:
        Mapping of file name -> headers DataFrame. Progress is printed per file.
    """
    extractor = PolicyHeaderExtractor()
    results = {}

    for pdf_file in filter(lambda name: name.endswith('.pdf'), os.listdir(folder_path)):
        print(f"Processing {pdf_file}...")
        results[pdf_file] = extractor.process_policy(os.path.join(folder_path, pdf_file))

    return results

# Specify the folder containing the PDFs
folder_path = "/content/drive/MyDrive/210 Capstone/policy/"

# Process all PDFs in the folder
results = process_policies_in_folder(folder_path)

# Save results to an Excel file. Only non-empty frames become sheets, so they
# are collected first: the workbook is not created when there is nothing to
# write (a workbook without sheets cannot be saved), and the success message
# is only printed when a file was actually produced.
output_file = "/content/drive/MyDrive/210 Capstone/policy_headers_analysis.xlsx"
non_empty_results = {name: frame for name, frame in results.items() if not frame.empty}

if non_empty_results:
    with pd.ExcelWriter(output_file) as writer:
        for pdf, df in non_empty_results.items():
            # Strip only the final extension so dotted file names stay distinct;
            # Excel sheet names are limited to 31 characters.
            sheet_name = os.path.splitext(pdf)[0][:31]
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    print(f"Headers saved to {output_file}.")
else:
    print(f"No headers extracted from {folder_path}; nothing was written.")

Processing moonpay.pdf...
Headers saved to /content/drive/MyDrive/210 Capstone/policy_headers_analysis.xlsx.
