In [None]:
import os
import fitz  # PyMuPDF
import spacy
import pandas as pd

In [None]:
# Have IPython display the value of every expression statement in a cell,
# rather than only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Remove pandas' display limits on column count, row count, and cell width
for _display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [None]:
# Initialize spaCy: load the "en_core_web_sm" pipeline once at module level so
# that every summarize_text() call reuses the same `nlp` object.
nlp = spacy.load("en_core_web_sm")

def summarize_text(text, max_length=500):
    """
    Return the leading sentences of ``text`` as a simple extractive summary.

    The text is processed with the module-level spaCy pipeline ``nlp``, split
    into sentences, and the first ``max_length`` sentences are joined with a
    single space.

    Args:
        text: Raw text to summarize.
        max_length: Maximum number of *sentences* to keep (not tokens or
            characters). ``None`` keeps every sentence; zero or a negative
            value yields an empty summary.

    Returns:
        The selected sentences joined by spaces, or "" when there is nothing
        to summarize.
    """
    from itertools import islice

    # Nothing to keep: skip the pipeline run entirely. This also prevents a
    # negative limit from being interpreted as a "drop the last N" slice.
    if not text or (max_length is not None and max_length <= 0):
        return ""

    doc = nlp(text)
    # islice stops after max_length sentences instead of building the full list.
    return " ".join(sent.text for sent in islice(doc.sents, max_length))

def extract_pdf_metadata_and_summary(pdf_path):
    """
    Extract metadata and a text summary from a single PDF.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A dict with three keys:
          - "File Name": base name of ``pdf_path``.
          - "Metadata": the document's metadata as reported by PyMuPDF,
            or None if processing failed.
          - "Summary": output of summarize_text() over the text of all pages,
            or "Error: <message>" if processing failed.

    Any exception raised while opening, reading, or summarizing the file is
    caught and reported in the "Summary" field, so one bad file does not abort
    a batch run.
    """
    file_name = os.path.basename(pdf_path)
    try:
        # The context manager closes the document even when text extraction
        # raises part-way through, so the file handle is never leaked.
        with fitz.open(pdf_path) as pdf_document:
            metadata = pdf_document.metadata
            # Join once instead of repeated string concatenation per page.
            text = "".join(page.get_text() for page in pdf_document)

        # Summarize after the document is closed; only the text is needed.
        summary = summarize_text(text)
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job and
        # the failure is surfaced in the returned row rather than swallowed.
        return {
            "File Name": file_name,
            "Metadata": None,
            "Summary": f"Error: {str(e)}"
        }

    return {
        "File Name": file_name,
        "Metadata": metadata,
        "Summary": summary
    }

def process_pdfs_in_folder(folder_path):
    """
    Read all PDFs in a folder and build a DataFrame of metadata and summaries.

    Only direct entries of ``folder_path`` whose name ends in ".pdf"
    (case-insensitive) are processed; subfolders are not searched.

    Args:
        folder_path: Directory to scan.

    Returns:
        A pandas DataFrame with one row per PDF, built from the dicts returned
        by extract_pdf_metadata_and_summary(). Rows are ordered by file name.
        The DataFrame is empty when the folder contains no PDFs.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and any CSV written from it) reproducible across runs and machines.
    rows = [
        extract_pdf_metadata_and_summary(os.path.join(folder_path, name))
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(".pdf")
    ]

    # Convert list of dicts to DataFrame
    return pd.DataFrame(rows)

In [None]:
# Folder containing PDFs (relative to the notebook's working directory)
folder_path = "../data"

# Process PDFs and create DataFrame (one row per PDF: file name, metadata, summary)
df = process_pdfs_in_folder(folder_path)

# Save to CSV for analysis if needed. The file is written into the same
# ../data folder that was scanned; index=False leaves the row index out.
df.to_csv("../data/pdf_metadata_and_summaries.csv", index=False)

# Display the first row of the DataFrame as a quick sanity check
df.head(1)

In [None]:
# (rows, columns) of the result; rows equals the number of PDFs processed
df.shape