# Extract metadata from a folder of PDFs, enrich it and save the enriched metadata to an excel file


##### Adds/edits the metadata we are going to use. The custom metadata list is defined at the bottom of this code block. Custom fields can be defined in the custom_fields list. The functions take the list as an argument, check for the additional custom metadata fields, and include them in the xlsx if they are present in the PDF files.

##### With the check_pdf_for_issues function in place, before attempting to get metadata from a PDF, the script will first check if the PDF has issues like being encrypted or corrupt. If it's encrypted, it will attempt to decrypt using the provided password (in this case, an empty string). If it's corrupt or if there's any other issue, it will log the problem and skip the file.

##### Calculates a deterministic ID you can use to identify duplicates later on.


In [1]:
# Confirm you're using the correct interpreter: print the path of the Python
# executable that is running this notebook.
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-v1/bin/python


## 0. Installs and Imports


In [None]:
# %pip install --upgrade pip
# %pip install ipython
# %pip install pypdf
# %pip install bs4
# %pip install requests
# %pip install openpyxl
# %pip install tabulate

In [14]:
from pypdf import PdfReader
import os
import sys
from datetime import datetime, date, timedelta, timezone
import pandas as pd

from tabulate import tabulate

In [15]:
# This little code block is used anytime you want to import a local module
# from within a Jupyter Notebook. This is required because Jupyter treats each
# cell as a module.

# Navigate up one level from the current notebook's directory to reach the
# root directory. The *string* '__file__' is used on purpose: it is resolved
# against the current working directory, so its dirname is the directory the
# notebook is running from.
current_dir = os.path.dirname(os.path.realpath('__file__'))
parent_dir = os.path.dirname(current_dir)

# Only add the entry once so re-running this cell does not pile up duplicate
# sys.path entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [16]:
import utils

## 1. Configs


In [17]:
# Root folder that is walked (recursively) for PDF files to process.
source_directory = "./raw_pdfs"

#### Helper functions


In [None]:
# Code to create metadata dictionary
# Check if the path exists
def check_dir_exists(pdfs_source_dir):
    """Verify that the PDF source path exists before any processing starts.

    Args:
        pdfs_source_dir: Path that is expected to exist on disk.

    Raises:
        ValueError: If the path does not exist. The same message is also
            printed, prefixed with "Error: ".
    """
    if os.path.exists(pdfs_source_dir):
        return

    message = f"The path '{pdfs_source_dir}' does not exist!"
    print(f"Error: {message}")
    raise ValueError(message)


def check_pdf_for_issues(pdf_path):
    """Report whether a PDF can be opened (and, if encrypted, read).

    Encrypted files are decrypted with an empty password and their pages are
    counted. Any exception raised along the way is printed and reported as a
    failure rather than propagated.

    Args:
        pdf_path: Path to the PDF file to check.

    Returns:
        True if no exception occurred, False otherwise.
    """
    try:
        reader = PdfReader(pdf_path)
        if not reader.is_encrypted:
            return True
        print(f"Encryption detected for {pdf_path}")
        reader.decrypt("")
        # Counting the pages forces the decrypted content to be accessed.
        print(f"All pages accessed: {len(reader.pages)}")
    except Exception as err:
        print(f"Issue with {pdf_path}: {err}")
        return False
    return True


def get_pdf_metadata(pdf_path):
    """Build the metadata record for a single PDF.

    Reads the title, creation date and page count from the PDF, then adds
    the fixed default fields and a computed ID.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        dict with the keys (in this order): title, leadership_scope,
        page_count, creation_date, effective_date, upsert_date,
        expiration_date, lifecycle, aux_specific, public_release,
        publication_number, originator, curator, pdf_id, pdf_file_name,
        source.

    Raises:
        Exceptions from ``open`` / ``PdfReader`` / ``utils.compute_pdf_id``
        are not caught here.
    """
    iso_format = '%Y-%m-%dT%H:%M:%SZ'
    now_utc = datetime.now(timezone.utc)
    today_date = now_utc.strftime(iso_format)

    with open(pdf_path, 'rb') as f:
        reader = PdfReader(f)
        if reader.is_encrypted:
            # Same empty-password attempt that check_pdf_for_issues makes;
            # without it the metadata of an encrypted file cannot be read.
            reader.decrypt("")
        # PDFs without a document-information dictionary yield None here.
        existing_pdf_metadata = reader.metadata or {}
        page_count = len(reader.pages)

    file_name = _strip_all_extensions(os.path.basename(pdf_path))

    pdf_metadata = {}
    # Fall back to the file name (underscores as spaces) when no title is set.
    pdf_metadata['title'] = (existing_pdf_metadata.get('/Title')
                             or file_name.replace('_', ' '))
    pdf_metadata['leadership_scope'] = "1_National"
    pdf_metadata['page_count'] = page_count
    pdf_metadata['creation_date'] = _format_creation_date(
        existing_pdf_metadata.get('/CreationDate'))
    pdf_metadata['effective_date'] = today_date
    pdf_metadata['upsert_date'] = today_date
    # 365.25 * 10 = 3652.5 days, i.e. the expiration timestamp carries an
    # extra half day relative to the effective timestamp.
    expiration_date = now_utc + timedelta(days=365.25 * 10)
    pdf_metadata['expiration_date'] = expiration_date.strftime(iso_format)
    pdf_metadata['lifecycle'] = "none"
    pdf_metadata['aux_specific'] = True
    # True if document is available on public internet
    pdf_metadata['public_release'] = True
    # Substitute underscore for spaces, else it constrains how things are built
    pdf_metadata['publication_number'] = "none"
    # Originator of the document. For CG, this is the Directive Originator
    # using Standard Distribution List (SDL), COMDTNOTE 5605 encl (3)
    # (i.e., CG-BSX-1). CHDIRAUX is listed as CG-BSX-1. For Auxiliary, prefix
    # with "AUX" plus Unit Number (AUX-0130510) or office designation as
    # listed in the National Staff SOP (e.g., AUX-NACO, AUX-ANACO_RP,
    # AUX-R-DIR, etc.)
    pdf_metadata['originator'] = "CG-BSX-1"
    # Auxiliarist who added this metadata in the record
    pdf_metadata['curator'] = "Wilkins,CA"
    # Compute unique ID and add to PDF
    pdf_metadata['pdf_id'] = str(utils.compute_pdf_id(pdf_path)).strip()
    pdf_metadata['pdf_file_name'] = file_name  # add the filename
    # needed to write the metadata back to the pdfs or during chunking. Also
    # used to create the doc name during RAG
    pdf_metadata['source'] = pdf_path
    return pdf_metadata


def _strip_all_extensions(file_name):
    """Return file_name with every trailing extension removed (a.b.pdf -> a)."""
    # Loop on the extension rather than on "'.' in name": splitext treats
    # leading dots as part of the root, so a name like ".hidden" would never
    # lose its dot and the old loop would spin forever.
    stem, ext = os.path.splitext(file_name)
    while ext:
        file_name = stem
        stem, ext = os.path.splitext(file_name)
    return file_name


def _format_creation_date(raw_value):
    """Convert a PDF /CreationDate value to an ISO-8601 string, or "none"."""
    text = str(raw_value or '')
    # PDF dates normally look like "D:YYYYMMDD..."; tolerate a missing prefix.
    if text.startswith('D:'):
        text = text[2:]
    date_digits = text[:8]
    if not date_digits:
        return "none"
    try:
        parsed = datetime.strptime(date_digits, '%Y%m%d')
    except ValueError:
        print(f"Unparseable creation date {raw_value!r}; recording 'none'")
        return "none"
    return parsed.strftime('%Y-%m-%dT%H:%M:%SZ')

#### Main code to process PDFs to create a metadata dictionary


In [19]:
def make_metadata_dict_from_pdfs():
    '''Pull the metadata from all the PDFs under ``source_directory``.

    The directory is walked recursively. Files whose name ends in ".pdf"
    (case-insensitive) are first checked with ``check_pdf_for_issues``; any
    file that fails the check is skipped (the check itself prints the
    reason) instead of being passed on to ``get_pdf_metadata``.

    Returns:
        dict mapping each PDF's file path to its metadata dictionary.

    Raises:
        ValueError: If ``source_directory`` does not exist.
    '''
    check_dir_exists(source_directory)

    metadata_by_path = {}

    for root, _dirs, files in os.walk(source_directory):
        for file in files:
            if not file.lower().endswith(".pdf"):
                continue
            pdf_path = os.path.join(root, file)
            # Honor the check result: unreadable PDFs are skipped.
            if not check_pdf_for_issues(pdf_path):
                continue
            metadata_by_path[pdf_path] = get_pdf_metadata(pdf_path)

    return metadata_by_path


# Build the {pdf_path: metadata} dictionary for the PDFs in source_directory.
all_pdfs_metadata = make_metadata_dict_from_pdfs()

#### The dictionary for a single PDF should look something like this.

Notice this is a dictionary with a dictionary inside of it:

```python
 {'./raw_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf': {'title': 'AUXCA SOP 005 B  20AUG24 ESIGN', 'leadership_scope': '1_National', 'page_count': 30, 'creation_date': '2024-08-22T00:00:00Z', 'effective_date': '2024-11-01T21:36:18Z', 'upsert_date': '2024-11-01T21:36:18Z', 'expiration_date': '2034-08-22T12:00:00Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': True, 'publication_number': None, 'source': 'cgaux.org', 'organization': 'CG-BSX', 'curator': 'Wilkins,CA', 'document_id': 'b69af3d6-96ee-5a1b-8a1e-9a6feca305b2', 'file_name': 'AUXCA_SOP_005_B__20AUG24_ESIGN', 'pdf_path': './pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf'}}
```


In [20]:
# Dump the collected metadata dictionary so it can be checked by eye.
print(f"\n\nBased on this dictionary...\n {all_pdfs_metadata}")



Based on this dictionary...
 {'./raw_pdfs/CI_5400_7G.pdf': {'title': 'ORGANIZATION MANAGEMENT, COMDTINST 5400.7G', 'leadership_scope': '1_National', 'page_count': 37, 'creation_date': '2023-12-01T00:00:00Z', 'effective_date': '2024-11-02T22:04:43Z', 'upsert_date': '2024-11-02T22:04:43Z', 'expiration_date': '2034-11-03T10:04:43Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': True, 'publication_number': 'none', 'source': 'cgaux.org', 'originator': 'CG-BSX-1', 'curator': 'Wilkins,CA', 'pdf_id': '7ea37b80-a7ab-58b1-8cb5-afc1ccee61a5', 'pdf_file_name': 'CI_5400_7G', 'pdf_path': './raw_pdfs/CI_5400_7G.pdf'}, './raw_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf': {'title': 'AUXCA SOP 005 B  20AUG24 ESIGN', 'leadership_scope': '1_National', 'page_count': 30, 'creation_date': '2024-08-22T00:00:00Z', 'effective_date': '2024-11-02T22:04:44Z', 'upsert_date': '2024-11-02T22:04:44Z', 'expiration_date': '2034-11-03T10:04:44Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': T

In [10]:
# Timestamp of this run in UTC.
today_date = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
metadata_file_path = './metadata.xlsx'

# Create a dataframe from the metadata dictionary.
#
# Always set index=True or some data will be lost (either a row or a column,
# depending on whether you transposed or not. Either way, it's not very good.)
#
# The rest of the code assumes you are using the transposed frame. It's good
# for many pdfs. It places the field names across the top of the excel sheet
# and the file names down the left as row indices. You can remove it if you
# are just doing one pdf. Be sure to transpose it in the excel or in the code
# when you bring the excel back into the dataframe.
pdf_metadata = pd.DataFrame(all_pdfs_metadata).T
pdf_metadata.to_excel(metadata_file_path, index=True)

print(f" An editable excel file has been posted as {metadata_file_path}")

 An editable excel file has been posted as ./metadata.xlsx
